diff --git a/data/benchmarks/ace.json b/data/benchmarks/ace.json new file mode 100644 index 0000000000000000000000000000000000000000..18193b42999b541398e50912a784f3d34d911f94 --- /dev/null +++ b/data/benchmarks/ace.json @@ -0,0 +1,120 @@ +{ + "models": [ + { + "model_id": "anthropic/Opus 4.1", + "name": "Opus 4.1", + "developer": "anthropic", + "scores": { + "Overall Score": 0.4, + "Gaming Score": 0.318 + } + }, + { + "model_id": "anthropic/Opus 4.5", + "name": "Opus 4.5", + "developer": "anthropic", + "scores": { + "Overall Score": 0.478, + "Gaming Score": 0.391 + } + }, + { + "model_id": "anthropic/Sonnet 4.5", + "name": "Sonnet 4.5", + "developer": "anthropic", + "scores": { + "Overall Score": 0.44, + "Gaming Score": 0.373 + } + }, + { + "model_id": "google/Gemini 2.5 Flash", + "name": "Gemini 2.5 Flash", + "developer": "google", + "scores": { + "Overall Score": 0.38, + "Gaming Score": 0.284 + } + }, + { + "model_id": "google/Gemini 2.5 Pro", + "name": "Gemini 2.5 Pro", + "developer": "google", + "scores": { + "Overall Score": 0.4, + "Gaming Score": 0.285 + } + }, + { + "model_id": "google/Gemini 3 Flash", + "name": "Gemini 3 Flash", + "developer": "google", + "scores": { + "Gaming Score": 0.415 + } + }, + { + "model_id": "google/Gemini 3 Pro", + "name": "Gemini 3 Pro", + "developer": "google", + "scores": { + "Overall Score": 0.47, + "Gaming Score": 0.509 + } + }, + { + "model_id": "openai/GPT 5", + "name": "GPT 5", + "developer": "openai", + "scores": { + "Overall Score": 0.561, + "DIY Score": 0.55, + "Food Score": 0.7, + "Gaming Score": 0.575 + } + }, + { + "model_id": "openai/GPT 5.1", + "name": "GPT 5.1", + "developer": "openai", + "scores": { + "Overall Score": 0.551, + "DIY Score": 0.56, + "Gaming Score": 0.61, + "Shopping Score": 0.45 + } + }, + { + "model_id": "openai/GPT 5.2", + "name": "GPT 5.2", + "developer": "openai", + "scores": { + "Overall Score": 0.515, + "Food Score": 0.65, + "Gaming Score": 0.578 + } + }, + { + "model_id": "openai/o3", + "name": "o3", + "developer": "openai", + "scores": { + "Overall Score": 0.529, + "Gaming Score": 0.585, + "Shopping Score": 0.45 + } + }, + { + "model_id": "openai/o3 Pro", + "name": "o3 Pro", + "developer": "openai", + "scores": { + "Overall Score": 0.552, + "DIY Score": 0.54, + "Food Score": 0.6, + "Gaming Score": 0.613, + "Shopping Score": 0.45 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/apex-agents.json b/data/benchmarks/apex-agents.json new file mode 100644 index 0000000000000000000000000000000000000000..94f808e80278dd438e7bff82cce29dbeb8781217 --- /dev/null +++ b/data/benchmarks/apex-agents.json @@ -0,0 +1,218 @@ +{ + "models": [ + { + "model_id": "anthropic/Opus 4.5", + "name": "Opus 4.5", + "developer": "anthropic", + "scores": { + "Overall Pass@1": 0.184, + "Overall Pass@8": 0.34, + "Overall Mean Score": 0.348, + "Investment Banking Pass@1": 0.216, + "Management Consulting Pass@1": 0.132, + "Corporate Law Pass@1": 0.202, + "Corporate Lawyer Mean Score": 0.471 + } + }, + { + "model_id": "anthropic/Opus 4.6", + "name": "Opus 4.6", + "developer": "anthropic", + "scores": { + "Overall Pass@1": 0.298, + "Corporate Lawyer Mean Score": 0.502 + } + }, + { + "model_id": "applied-compute/Applied Compute: Small", + "name": "Applied Compute: Small", + "developer": "applied-compute", + "scores": { + "Overall Pass@1": 0.23, + "Overall Mean Score": 0.401, + "Corporate Law Pass@1": 0.266, + "Corporate Lawyer Mean Score": 0.548 + } + }, + { + "model_id": "google/Gemini 3 Flash", + "name": "Gemini 3 Flash", + "developer": "google", + "scores": { + "Overall Pass@1": 0.24, + "Overall Pass@8": 0.367, + "Overall Mean Score": 0.395, + "Investment Banking Pass@1": 0.267, + "Management Consulting Pass@1": 0.193, + "Corporate Law Pass@1": 0.259, + "Corporate Lawyer Mean Score": 0.524 + } + }, + { + "model_id": "google/Gemini 3 Pro", + "name": "Gemini 3 Pro", + "developer": "google", + "scores": { + "Overall Pass@1": 0.184, + "Overall Pass@8": 0.373, + "Overall Mean Score": 0.341, + "Investment Banking Pass@1": 0.188, + "Management Consulting Pass@1": 0.124, + "Corporate Law Pass@1": 0.239, + "Corporate Lawyer Mean Score": 0.487 + } + }, + { + "model_id": "google/Gemini 3.1 Pro", + "name": "Gemini 3.1 Pro", + "developer": "google", + "scores": { + "Overall Pass@1": 0.335, + "Corporate Lawyer Mean Score": 0.494 + } + }, + { + "model_id": "minimax/Minimax-2.5", + "name": "Minimax-2.5", + "developer": "minimax", + "scores": { + "Corporate Lawyer Mean Score": 0.339 + } + }, + { + "model_id": "moonshot/Kimi K2 Thinking", + "name": "Kimi K2 Thinking", + "developer": "moonshot", + "scores": { + "Overall Pass@1": 0.04, + "Overall Pass@8": 0.144, + "Overall Mean Score": 0.115, + "Investment Banking Pass@1": 0.012, + "Management Consulting Pass@1": 0.029, + "Corporate Law Pass@1": 0.08, + "Corporate Lawyer Mean Score": 0.223 + } + }, + { + "model_id": "moonshot/Kimi K2.5", + "name": "Kimi K2.5", + "developer": "moonshot", + "scores": { + "Corporate Lawyer Mean Score": 0.402 + } + }, + { + "model_id": "openai/GPT 5", + "name": "GPT 5", + "developer": "openai", + "scores": { + "Overall Pass@1": 0.183, + "Overall Pass@8": 0.31, + "Overall Mean Score": 0.329, + "Investment Banking Pass@1": 0.273, + "Management Consulting Pass@1": 0.123, + "Corporate Law Pass@1": 0.153, + "Corporate Lawyer Mean Score": 0.382 + } + }, + { + "model_id": "openai/GPT 5 Codex", + "name": "GPT 5 Codex", + "developer": "openai", + "scores": { + "Corporate Lawyer Mean Score": 0.362 + } + }, + { + "model_id": "openai/GPT 5.1", + "name": "GPT 5.1", + "developer": "openai", + "scores": { + "Corporate Lawyer Mean Score": 0.376 + } + }, + { + "model_id": "openai/GPT 5.1 Codex", + "name": "GPT 5.1 Codex", + "developer": "openai", + "scores": { + "Corporate Lawyer Mean Score": 0.366 + } + }, + { + "model_id": "openai/GPT 5.2", + "name": "GPT 5.2", + "developer": "openai", + "scores": { + "Overall Pass@1": 0.23, + "Overall Pass@8": 0.4, + "Overall Mean Score": 0.387, + "Investment Banking Pass@1": 0.273, + "Management Consulting Pass@1": 0.227, + "Corporate Law Pass@1": 0.189, + "Corporate Lawyer Mean Score": 0.443 + } + }, + { + "model_id": "openai/GPT 5.2 Codex", + "name": "GPT 5.2 Codex", + "developer": "openai", + "scores": { + "Overall Pass@1": 0.276, + "Corporate Lawyer Mean Score": 0.394 + } + }, + { + "model_id": "openai/GPT 5.3 Codex", + "name": "GPT 5.3 Codex", + "developer": "openai", + "scores": { + "Overall Pass@1": 0.317 + } + }, + { + "model_id": "openai/GPT OSS 120B", + "name": "GPT OSS 120B", + "developer": "openai", + "scores": { + "Overall Pass@1": 0.047, + "Overall Pass@8": 0.115, + "Overall Mean Score": 0.145, + "Investment Banking Pass@1": 0.027, + "Management Consulting Pass@1": 0.035, + "Corporate Law Pass@1": 0.078, + "Corporate Lawyer Mean Score": 0.269 + } + }, + { + "model_id": "xai/Grok 4", + "name": "Grok 4", + "developer": "xai", + "scores": { + "Overall Pass@1": 0.152, + "Overall Pass@8": 0.329, + "Overall Mean Score": 0.303, + "Investment Banking Pass@1": 0.17, + "Management Consulting Pass@1": 0.12, + "Corporate Law Pass@1": 0.165, + "Corporate Lawyer Mean Score": 0.41 + } + }, + { + "model_id": "zhipu/GLM 4.6", + "name": "GLM 4.6", + "developer": "zhipu", + "scores": { + "Corporate Lawyer Mean Score": 0.196 + } + }, + { + "model_id": "zhipu/GLM 4.7", + "name": "GLM 4.7", + "developer": "zhipu", + "scores": { + "Corporate Lawyer Mean Score": 0.147 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/apex-v1.json b/data/benchmarks/apex-v1.json new file mode 100644 index 0000000000000000000000000000000000000000..62b4d342f1679668e6413832133d325acac1b77a --- /dev/null +++ b/data/benchmarks/apex-v1.json @@ -0,0 +1,93 @@ +{ + "models": [ + { + "model_id": "anthropic/Opus 4.5", + "name": "Opus 4.5", + "developer": "anthropic", + "scores": { + "Medicine (MD) Score": 0.65 + } + }, + { + "model_id": "google/Gemini 2.5 Flash", + "name": "Gemini 2.5 Flash", + "developer": "google", + "scores": { + "Overall Score": 0.604 + } + }, + { + "model_id": "google/Gemini 3 Flash", + "name": "Gemini 3 Flash", + "developer": "google", + "scores": { + "Overall Score": 0.64, + "Consulting Score": 0.64 + } + }, + { + "model_id": "google/Gemini 3 Pro", + "name": "Gemini 3 Pro", + "developer": "google", + "scores": { + "Overall Score": 0.643, + "Consulting Score": 0.64, + "Investment Banking Score": 0.63 + } + }, + { + "model_id": "openai/GPT 4o", + "name": "GPT 4o", + "developer": "openai", + "scores": { + "Overall Score": 0.359 + } + }, + { + "model_id": "openai/GPT 5", + "name": "GPT 5", + "developer": "openai", + "scores": { + "Overall Score": 0.67, + "Big Law Score": 0.78, + "Medicine (MD) Score": 0.66, + "Investment Banking Score": 0.61 + } + }, + { + "model_id": "openai/GPT 5.1", + "name": "GPT 5.1", + "developer": "openai", + "scores": { + "Big Law Score": 0.77 + } + }, + { + "model_id": "openai/GPT 5.2 Pro", + "name": "GPT 5.2 Pro", + "developer": "openai", + "scores": { + "Overall Score": 0.668, + "Consulting Score": 0.64, + "Medicine (MD) Score": 0.65, + "Investment Banking Score": 0.64 + } + }, + { + "model_id": "openai/o3", + "name": "o3", + "developer": "openai", + "scores": { + "Big Law Score": 0.76 + } + }, + { + "model_id": "xai/Grok 4", + "name": "Grok 4", + "developer": "xai", + "scores": { + "Overall Score": 0.635 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/appworld_test_normal.json b/data/benchmarks/appworld_test_normal.json new file mode 100644 index 0000000000000000000000000000000000000000..f149d2e83dd4295170a7dd85ee296bc311ef3b12 --- /dev/null +++ b/data/benchmarks/appworld_test_normal.json @@ -0,0 +1,28 @@ +{ + "models": [ + { + "model_id": "anthropic/claude-opus-4-5", + "name": "claude-opus-4-5", + "developer": "Anthropic", + "scores": { + "appworld/test_normal": 0.7 + } + }, + { + "model_id": "google/gemini-3-pro-preview", + "name": "gemini-3-pro-preview", + "developer": "Google", + "scores": { + "appworld/test_normal": 0.36 + } + }, + { + "model_id": "openai/gpt-5.2-2025-12-11", + "name": "gpt-5.2-2025-12-11", + "developer": "OpenAI", + "scores": { + "appworld/test_normal": 0.0 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/browsecompplus.json b/data/benchmarks/browsecompplus.json new file mode 100644 index 0000000000000000000000000000000000000000..7beeaa9ca1194eca39b656d11dd6cb8fd5ccd590 --- /dev/null +++ b/data/benchmarks/browsecompplus.json @@ -0,0 +1,28 @@ +{ + "models": [ + { + "model_id": "anthropic/claude-opus-4-5", + "name": "claude-opus-4-5", + "developer": "Anthropic", + "scores": { + "browsecompplus": 0.61 + } + }, + { + "model_id": "google/gemini-3-pro-preview", + "name": "gemini-3-pro-preview", + "developer": "Google", + "scores": { + "browsecompplus": 0.57 + } + }, + { + "model_id": "openai/gpt-5.2-2025-12-11", + "name": "gpt-5.2-2025-12-11", + "developer": "OpenAI", + "scores": { + "browsecompplus": 0.46 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/global-mmlu-lite.json b/data/benchmarks/global-mmlu-lite.json new file mode 100644 index 0000000000000000000000000000000000000000..d435988274520e6c53f552dcf3f5b6760c0ea503 --- /dev/null +++ b/data/benchmarks/global-mmlu-lite.json @@ -0,0 +1,706 @@ +{ + "models": [ + { + "model_id": "alibaba/qwen3-235b-a22b-instruct-2507", + "name": "qwen3-235b-a22b-instruct-2507", + "developer": "alibaba", + "scores": { + "Global MMLU Lite": 0.8798, + "Culturally Sensitive": 0.8522, + "Culturally Agnostic": 0.9075, + "Arabic": 0.88, + "English": 0.89, + "Bengali": 0.8875, + "German": 0.885, + "French": 0.88, + "Hindi": 0.8775, + "Indonesian": 0.88, + "Italian": 0.88, + "Japanese": 0.88, + "Korean": 0.875, + "Portuguese": 0.8875, + "Spanish": 0.875, + "Swahili": 0.87, + "Yoruba": 0.8725, + "Chinese": 0.8775, + "Burmese": 0.88 + } + }, + { + "model_id": "anthropic/claude-3-5-haiku-20241022", + "name": "claude-3-5-haiku-20241022", + "developer": "anthropic", + "scores": { + "Global MMLU Lite": 0.6114, + "Culturally Sensitive": 0.5834, + "Culturally Agnostic": 0.6394, + "Arabic": 0.695, + "English": 0.485, + "Bengali": 0.675, + "German": 0.565, + "French": 0.61, + "Hindi": 0.6575, + "Indonesian": 0.5475, + "Italian": 0.48, + "Japanese": 0.655, + "Korean": 0.6575, + "Portuguese": 0.5225, + "Spanish": 0.485, + "Swahili": 0.69, + "Yoruba": 0.6675, + "Chinese": 0.69, + "Burmese": 0.7 + } + }, + { + "model_id": "anthropic/claude-3-7-sonnet-20250219", + "name": "claude-3-7-sonnet-20250219", + "developer": "anthropic", + "scores": { + "Global MMLU Lite": 0.8078, + "Culturally Sensitive": 0.7794, + "Culturally Agnostic": 0.8362, + "Arabic": 0.7925, + "English": 0.7625, + "Bengali": 0.825, + "German": 0.8125, + "French": 0.7675, + "Hindi": 0.805, + "Indonesian": 0.8175, + "Italian": 0.8225, + "Japanese": 0.8425, + "Korean": 0.83, + "Portuguese": 0.77, + "Spanish": 0.8075, + "Swahili": 0.8125, + "Yoruba": 0.81, + "Chinese": 0.835, + "Burmese": 0.8125 + } + }, + { + "model_id": "anthropic/claude-opus-4-1-20250805", + "name": "claude-opus-4-1-20250805", + "developer": "anthropic", + "scores": { + "Global MMLU Lite": 0.943, + "Culturally Sensitive": 0.9331, + "Culturally Agnostic": 0.9528, + "Arabic": 0.945, + "English": 0.9475, + "Bengali": 0.9425, + "German": 0.94, + "French": 0.945, + "Hindi": 0.9475, + "Indonesian": 0.9425, + "Italian": 0.94, + "Japanese": 0.94, + "Korean": 0.95, + "Portuguese": 0.945, + "Spanish": 0.945, + "Swahili": 0.93, + "Yoruba": 0.9375, + "Chinese": 0.945, + "Burmese": 0.945 + } + }, + { + "model_id": "anthropic/claude-sonnet-4-20250514", + "name": "claude-sonnet-4-20250514", + "developer": "anthropic", + "scores": { + "Global MMLU Lite": 0.9058, + "Culturally Sensitive": 0.8913, + "Culturally Agnostic": 0.9203, + "Arabic": 0.9125, + "English": 0.905, + "Bengali": 0.9075, + "German": 0.9125, + "French": 0.91, + "Hindi": 0.9, + "Indonesian": 0.9025, + "Italian": 0.9075, + "Japanese": 0.9, + "Korean": 0.9125, + "Portuguese": 0.91, + "Spanish": 0.9075, + "Swahili": 0.8975, + "Yoruba": 0.8975, + "Chinese": 0.9175, + "Burmese": 0.8925 + } + }, + { + "model_id": "cohere/aya-expanse-32b", + "name": "aya-expanse-32b", + "developer": "cohere", + "scores": { + "Global MMLU Lite": 0.7353, + "Culturally Sensitive": 0.6891, + "Culturally Agnostic": 0.7815, + "Arabic": 0.7425, + "English": 0.7544, + "Bengali": 0.7343, + "German": 0.7425, + "French": 0.7325, + "Hindi": 0.7375, + "Indonesian": 0.7594, + "Italian": 0.7305, + "Japanese": 0.7419, + "Korean": 0.7525, + "Portuguese": 0.7544, + "Spanish": 0.7362, + "Swahili": 0.7071, + "Yoruba": 0.6942, + "Chinese": 0.743, + "Burmese": 0.7025 + } + }, + { + "model_id": "cohere/command-a-03-2025", + "name": "command-a-03-2025", + "developer": "cohere", + "scores": { + "Global MMLU Lite": 0.8385, + "Culturally Sensitive": 0.7993, + "Culturally Agnostic": 0.8778, + "Arabic": 0.8425, + "English": 0.855, + "Bengali": 0.8225, + "German": 0.8425, + "French": 0.8375, + "Hindi": 0.8421, + "Indonesian": 0.8546, + "Italian": 0.8375, + "Japanese": 0.845, + "Korean": 0.85, + "Portuguese": 0.84, + "Spanish": 0.8525, + "Swahili": 0.8275, + "Yoruba": 0.815, + "Chinese": 0.835, + "Burmese": 0.8175 + } + }, + { + "model_id": "deepseek/deepseek-r1-0528", + "name": "deepseek-r1-0528", + "developer": "deepseek", + "scores": { + "Global MMLU Lite": 0.6744, + "Culturally Sensitive": 0.6672, + "Culturally Agnostic": 0.6816, + "Arabic": 0.6825, + "English": 0.715, + "Bengali": 0.655, + "German": 0.6375, + "French": 0.6925, + "Hindi": 0.6475, + "Indonesian": 0.655, + "Italian": 0.6775, + "Japanese": 0.7725, + "Korean": 0.6575, + "Portuguese": 0.635, + "Spanish": 0.7175, + "Swahili": 0.6775, + "Yoruba": 0.77, + "Chinese": 0.5075, + "Burmese": 0.69 + } + }, + { + "model_id": "deepseek/deepseek-v3.1", + "name": "deepseek-v3.1", + "developer": "deepseek", + "scores": { + "Global MMLU Lite": 0.8044, + "Culturally Sensitive": 0.7793, + "Culturally Agnostic": 0.8295, + "Arabic": 0.805, + "English": 0.825, + "Bengali": 0.8157, + "German": 0.7925, + "French": 0.8175, + "Hindi": 0.7569, + "Indonesian": 0.7764, + "Italian": 0.8075, + "Japanese": 0.8312, + "Korean": 0.8125, + "Portuguese": 0.8246, + "Spanish": 0.8125, + "Swahili": 0.801, + "Yoruba": 0.7831, + "Chinese": 0.8161, + "Burmese": 0.7925 + } + }, + { + "model_id": "google/gemini-2.5-flash", + "name": "gemini-2.5-flash", + "developer": "google", + "scores": { + "Global MMLU Lite": 0.9145, + "Culturally Sensitive": 0.9, + "Culturally Agnostic": 0.9291, + "Arabic": 0.9125, + "English": 0.9325, + "Bengali": 0.91, + "German": 0.9025, + "French": 0.91, + "Hindi": 0.925, + "Indonesian": 0.9075, + "Italian": 0.9225, + "Japanese": 0.9125, + "Korean": 0.915, + "Portuguese": 0.9125, + "Spanish": 0.9175, + "Swahili": 0.915, + "Yoruba": 0.9075, + "Chinese": 0.915, + "Burmese": 0.915 + } + }, + { + "model_id": "google/gemini-2.5-flash-preview-05-20", + "name": "gemini-2.5-flash-preview-05-20", + "developer": "google", + "scores": { + "Global MMLU Lite": 0.9092, + "Culturally Sensitive": 0.8925, + "Culturally Agnostic": 0.9259, + "Arabic": 0.905, + "English": 0.9225, + "Bengali": 0.91, + "German": 0.905, + "French": 0.925, + "Hindi": 0.9125, + "Indonesian": 0.9075, + "Italian": 0.89, + "Japanese": 0.9125, + "Korean": 0.9075, + "Portuguese": 0.915, + "Spanish": 0.915, + "Swahili": 0.905, + "Yoruba": 0.8825, + "Chinese": 0.93, + "Burmese": 0.9025 + } + }, + { + "model_id": "google/gemini-2.5-pro", + "name": "gemini-2.5-pro", + "developer": "google", + "scores": { + "Global MMLU Lite": 0.9323, + "Culturally Sensitive": 0.9241, + "Culturally Agnostic": 0.9406, + "Arabic": 0.9475, + "English": 0.9275, + "Bengali": 0.9275, + "German": 0.93, + "French": 0.9425, + "Hindi": 0.9275, + "Indonesian": 0.925, + "Italian": 0.935, + "Japanese": 0.9375, + "Korean": 0.9275, + "Portuguese": 0.93, + "Spanish": 0.94, + "Swahili": 0.9375, + "Yoruba": 0.925, + "Chinese": 0.9275, + "Burmese": 0.93 + } + }, + { + "model_id": "google/gemini-3-pro-preview", + "name": "gemini-3-pro-preview", + "developer": "Google", + "scores": { + "Global MMLU Lite": 0.9453, + "Culturally Sensitive": 0.9397, + "Culturally Agnostic": 0.9509, + "Arabic": 0.9475, + "English": 0.9425, + "Bengali": 0.9425, + "German": 0.94, + "French": 0.9575, + "Hindi": 0.9425, + "Indonesian": 0.955, + "Italian": 0.955, + "Japanese": 0.94, + "Korean": 0.94, + "Portuguese": 0.9425, + "Spanish": 0.9475, + "Swahili": 0.94, + "Yoruba": 0.9425, + "Chinese": 0.9475, + "Burmese": 0.9425 + } + }, + { + "model_id": "google/gemma-3-27b-it", + "name": "gemma-3-27b-it", + "developer": "google", + "scores": { + "Global MMLU Lite": 0.763, + "Culturally Sensitive": 0.7528, + "Culturally Agnostic": 0.7733, + "Arabic": 0.78, + "English": 0.7337, + "Bengali": 0.75, + "German": 0.775, + "French": 0.7481, + "Hindi": 0.7335, + "Indonesian": 0.7563, + "Italian": 0.75, + "Japanese": 0.7925, + "Korean": 0.798, + "Portuguese": 0.7481, + "Spanish": 0.7494, + "Swahili": 0.785, + "Yoruba": 0.7444, + "Chinese": 0.7925, + "Burmese": 0.7719 + } + }, + { + "model_id": "google/gemma-3-4b-it", + "name": "gemma-3-4b-it", + "developer": "google", + "scores": { + "Global MMLU Lite": 0.6511, + "Culturally Sensitive": 0.6116, + "Culturally Agnostic": 0.6906, + "Arabic": 0.6525, + "English": 0.67, + "Bengali": 0.68, + "German": 0.6525, + "French": 0.6575, + "Hindi": 0.6475, + "Indonesian": 0.6775, + "Italian": 0.6675, + "Japanese": 0.6325, + "Korean": 0.66, + "Portuguese": 0.68, + "Spanish": 0.6725, + "Swahili": 0.6075, + "Yoruba": 0.5825, + "Chinese": 0.6475, + "Burmese": 0.63 + } + }, + { + "model_id": "ibm/granite-4.0-h-small", + "name": "granite-4.0-h-small", + "developer": "ibm", + "scores": { + "Global MMLU Lite": 0.7503, + "Culturally Sensitive": 0.7182, + "Culturally Agnostic": 0.7826, + "Arabic": 0.7613, + "English": 0.77, + "Bengali": 0.7613, + "German": 0.755, + "French": 0.7594, + "Hindi": 0.7575, + "Indonesian": 0.7614, + "Italian": 0.7525, + "Japanese": 0.7406, + "Korean": 0.7525, + "Portuguese": 0.757, + "Spanish": 0.7638, + "Swahili": 0.7318, + "Yoruba": 0.6921, + "Chinese": 0.7475, + "Burmese": 0.7419 + } + }, + { + "model_id": "mistralai/mistral-medium-3", + "name": "mistral-medium-3", + "developer": "mistralai", + "scores": { + "Global MMLU Lite": 0.5511, + "Culturally Sensitive": 0.5391, + "Culturally Agnostic": 0.5631, + "Arabic": 0.455, + "English": 0.38, + "Bengali": 0.5175, + "German": 0.4775, + "French": 0.41, + "Hindi": 0.555, + "Indonesian": 0.515, + "Italian": 0.535, + "Japanese": 0.58, + "Korean": 0.595, + "Portuguese": 0.5175, + "Spanish": 0.5375, + "Swahili": 0.7075, + "Yoruba": 0.7675, + "Chinese": 0.535, + "Burmese": 0.7325 + } + }, + { + "model_id": "mistralai/mistral-small-2503", + "name": "mistral-small-2503", + "developer": "mistralai", + "scores": { + "Global MMLU Lite": 0.7852, + "Culturally Sensitive": 0.7537, + "Culturally Agnostic": 0.8166, + "Arabic": 0.7875, + "English": 0.8, + "Bengali": 0.7725, + "German": 0.7975, + "French": 0.8, + "Hindi": 0.795, + "Indonesian": 0.785, + "Italian": 0.805, + "Japanese": 0.77, + "Korean": 0.79, + "Portuguese": 0.7925, + "Spanish": 0.7825, + "Swahili": 0.775, + "Yoruba": 0.735, + "Chinese": 0.7925, + "Burmese": 0.7825 + } + }, + { + "model_id": "openai/gpt-4.1-2025-04-14", + "name": "gpt-4.1-2025-04-14", + "developer": "openai", + "scores": { + "Global MMLU Lite": 0.8755, + "Culturally Sensitive": 0.8541, + "Culturally Agnostic": 0.8969, + "Arabic": 0.88, + "English": 0.8825, + "Bengali": 0.8625, + "German": 0.875, + "French": 0.8875, + "Hindi": 0.8775, + "Indonesian": 0.885, + "Italian": 0.88, + "Japanese": 0.8725, + "Korean": 0.87, + "Portuguese": 0.875, + "Spanish": 0.885, + "Swahili": 0.8725, + "Yoruba": 0.875, + "Chinese": 0.87, + "Burmese": 0.8575 + } + }, + { + "model_id": "openai/gpt-5-2025-08-07", + "name": "gpt-5-2025-08-07", + "developer": "openai", + "scores": { + "Global MMLU Lite": 0.8895, + "Culturally Sensitive": 0.8913, + "Culturally Agnostic": 0.8878, + "Arabic": 0.8925, + "English": 0.8725, + "Bengali": 0.9, + "German": 0.91, + "French": 0.9075, + "Hindi": 0.865, + "Indonesian": 0.795, + "Italian": 0.9075, + "Japanese": 0.8875, + "Korean": 0.915, + "Portuguese": 0.8875, + "Spanish": 0.905, + "Swahili": 0.865, + "Yoruba": 0.9125, + "Chinese": 0.895, + "Burmese": 0.915 + } + }, + { + "model_id": "openai/o3-mini-2025-01-31", + "name": "o3-mini-2025-01-31", + "developer": "openai", + "scores": { + "Global MMLU Lite": 0.78, + "Culturally Sensitive": 0.765, + "Culturally Agnostic": 0.795, + "Arabic": 0.7725, + "English": 0.8025, + "Bengali": 0.77, + "German": 0.7525, + "French": 0.74, + "Hindi": 0.7525, + "Indonesian": 0.7425, + "Italian": 0.8, + "Japanese": 0.81, + "Korean": 0.8075, + "Portuguese": 0.7975, + "Spanish": 0.775, + "Swahili": 0.765, + "Yoruba": 0.7725, + "Chinese": 0.8125, + "Burmese": 0.8075 + } + }, + { + "model_id": "openai/o4-mini-2025-04-16", + "name": "o4-mini-2025-04-16", + "developer": "openai", + "scores": { + "Global MMLU Lite": 0.8705, + "Culturally Sensitive": 0.8503, + "Culturally Agnostic": 0.8906, + "Arabic": 0.865, + "English": 0.8675, + "Bengali": 0.8875, + "German": 0.8775, + "French": 0.87, + "Hindi": 0.87, + "Indonesian": 0.8675, + "Italian": 0.855, + "Japanese": 0.885, + "Korean": 0.88, + "Portuguese": 0.88, + "Spanish": 0.855, + "Swahili": 0.8525, + "Yoruba": 0.8525, + "Chinese": 0.89, + "Burmese": 0.8725 + } + }, + { + "model_id": "unknown/aya-expanse-32b", + "name": "aya-expanse-32b", + "developer": "unknown", + "scores": { + "Global MMLU Lite": 0.7353, + "Culturally Sensitive": 0.6891, + "Culturally Agnostic": 0.7815, + "Arabic": 0.7425, + "English": 0.7544, + "Bengali": 0.7343, + "German": 0.7425, + "French": 0.7325, + "Hindi": 0.7375, + "Indonesian": 0.7594, + "Italian": 0.7305, + "Japanese": 0.7419, + "Korean": 0.7525, + "Portuguese": 0.7544, + "Spanish": 0.7362, + "Swahili": 0.7071, + "Yoruba": 0.6942, + "Chinese": 0.743, + "Burmese": 0.7025 + } + }, + { + "model_id": "unknown/granite-4.0-h-small", + "name": "granite-4.0-h-small", + "developer": "unknown", + "scores": { + "Global MMLU Lite": 0.7503, + "Culturally Sensitive": 0.7182, + "Culturally Agnostic": 0.7826, + "Arabic": 0.7613, + "English": 0.77, + "Bengali": 0.7613, + "German": 0.755, + "French": 0.7594, + "Hindi": 0.7575, + "Indonesian": 0.7614, + "Italian": 0.7525, + "Japanese": 0.7406, + "Korean": 0.7525, + "Portuguese": 0.757, + "Spanish": 0.7638, + "Swahili": 0.7318, + "Yoruba": 0.6921, + "Chinese": 0.7475, + "Burmese": 0.7419 + } + }, + { + "model_id": "unknown/o4-mini-2025-04-16", + "name": "o4-mini-2025-04-16", + "developer": "unknown", + "scores": { + "Global MMLU Lite": 0.8705, + "Culturally Sensitive": 0.8503, + "Culturally Agnostic": 0.8906, + "Arabic": 0.865, + "English": 0.8675, + "Bengali": 0.8875, + "German": 0.8775, + "French": 0.87, + "Hindi": 0.87, + "Indonesian": 0.8675, + "Italian": 0.855, + "Japanese": 0.885, + "Korean": 0.88, + "Portuguese": 0.88, + "Spanish": 0.855, + "Swahili": 0.8525, + "Yoruba": 0.8525, + "Chinese": 0.89, + "Burmese": 0.8725 + } + }, + { + "model_id": "xai/grok-3-mini", + "name": "grok-3-mini", + "developer": "xai", + "scores": { + "Global MMLU Lite": 0.673, + "Culturally Sensitive": 0.6717, + "Culturally Agnostic": 0.6743, + "Arabic": 0.755, + "English": 0.5075, + "Bengali": 0.7355, + "German": 0.6591, + "French": 0.485, + "Hindi": 0.56, + "Indonesian": 0.725, + "Italian": 0.696, + "Japanese": 0.6575, + "Korean": 0.7325, + "Portuguese": 0.6275, + "Spanish": 0.61, + "Swahili": 0.7625, + "Yoruba": 0.8296, + "Chinese": 0.5564, + "Burmese": 0.8693 + } + }, + { + "model_id": "xai/grok-4-0709", + "name": "grok-4-0709", + "developer": "xai", + "scores": { + "Global MMLU Lite": 0.8881, + "Culturally Sensitive": 0.8862, + "Culturally Agnostic": 0.89, + "Arabic": 0.885, + "English": 0.905, + "Bengali": 0.8925, + "German": 0.8725, + "French": 0.875, + "Hindi": 0.8675, + "Indonesian": 0.89, + "Italian": 0.9025, + "Japanese": 0.87, + "Korean": 0.895, + "Portuguese": 0.8725, + "Spanish": 0.9075, + "Swahili": 0.91, + "Yoruba": 0.905, + "Chinese": 0.8525, + "Burmese": 0.9075 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/helm_capabilities.json b/data/benchmarks/helm_capabilities.json new file mode 100644 index 0000000000000000000000000000000000000000..c8df71f4134b2203bdb20f8860263b048651a013 --- /dev/null +++ b/data/benchmarks/helm_capabilities.json @@ -0,0 +1,797 @@ +{ + "models": [ + { + "model_id": "allenai/olmo-2-0325-32b-instruct", + "name": "OLMo 2 32B Instruct March 2025", + "developer": "allenai", + "scores": { + "Mean score": 0.475, + "MMLU-Pro": 0.414, + "GPQA": 0.287, + "IFEval": 0.78, + "WildBench": 0.734, + "Omni-MATH": 0.161 + } + }, + { + "model_id": "allenai/olmo-2-1124-13b-instruct", + "name": "OLMo 2 13B Instruct November 2024", + "developer": "allenai", + "scores": { + "Mean score": 0.44, + "MMLU-Pro": 0.31, + "GPQA": 0.316, + "IFEval": 0.73, + "WildBench": 0.689, + "Omni-MATH": 0.156 + } + }, + { + "model_id": "allenai/olmo-2-1124-7b-instruct", + "name": "OLMo 2 7B Instruct November 2024", + "developer": "allenai", + "scores": { + "Mean score": 0.405, + "MMLU-Pro": 0.292, + "GPQA": 0.296, + "IFEval": 0.693, + "WildBench": 0.628, + "Omni-MATH": 0.116 + } + }, + { + "model_id": "allenai/olmoe-1b-7b-0125-instruct", + "name": "OLMoE 1B-7B Instruct January 2025", + "developer": "allenai", + "scores": { + "Mean score": 0.332, + "MMLU-Pro": 0.169, + "GPQA": 0.22, + "IFEval": 0.628, + "WildBench": 0.551, + "Omni-MATH": 0.093 + } + }, + { + "model_id": "amazon/nova-lite-v1:0", + "name": "Amazon Nova Lite", + "developer": "amazon", + "scores": { + "Mean score": 0.551, + "MMLU-Pro": 0.6, + "GPQA": 0.397, + "IFEval": 0.776, + "WildBench": 0.75, + "Omni-MATH": 0.233 + } + }, + { + "model_id": "amazon/nova-micro-v1:0", + "name": "Amazon Nova Micro", + "developer": "amazon", + "scores": { + "Mean score": 0.522, + "MMLU-Pro": 0.511, + "GPQA": 0.383, + "IFEval": 0.76, + "WildBench": 0.743, + "Omni-MATH": 0.214 + } + }, + { + "model_id": "amazon/nova-premier-v1:0", + "name": "Amazon Nova Premier", + "developer": "amazon", + "scores": { + "Mean score": 0.637, + "MMLU-Pro": 0.726, + "GPQA": 0.518, + "IFEval": 0.803, + "WildBench": 0.788, + "Omni-MATH": 0.35 + } + }, + { + "model_id": "amazon/nova-pro-v1:0", + "name": "Amazon Nova Pro", + "developer": "amazon", + "scores": { + "Mean score": 0.591, + "MMLU-Pro": 0.673, + "GPQA": 0.446, + "IFEval": 0.815, + "WildBench": 0.777, + "Omni-MATH": 0.242 + } + }, + { + "model_id": "anthropic/claude-3-5-haiku-20241022", + "name": "claude-3-5-haiku-20241022", + "developer": "anthropic", + "scores": { + "Mean score": 0.549, + "MMLU-Pro": 0.605, + "GPQA": 0.363, + "IFEval": 0.792, + "WildBench": 0.76, + "Omni-MATH": 0.224 + } + }, + { + "model_id": "anthropic/claude-3-5-sonnet-20241022", + "name": "Claude 3.5 Sonnet 20241022", + "developer": "anthropic", + "scores": { + "Mean score": 0.653, + "MMLU-Pro": 0.777, + "GPQA": 0.565, + "IFEval": 0.856, + "WildBench": 0.792, + "Omni-MATH": 0.276 + } + }, + { + "model_id": "anthropic/claude-3-7-sonnet-20250219", + "name": "claude-3-7-sonnet-20250219", + "developer": "anthropic", + "scores": { + "Mean score": 0.674, + "MMLU-Pro": 0.784, + "GPQA": 0.608, + "IFEval": 0.834, + "WildBench": 0.814, + "Omni-MATH": 0.33 + } + }, + { + "model_id": "anthropic/claude-opus-4-20250514", + "name": "Claude 4 Opus 20250514", + "developer": "anthropic", + "scores": { + "Mean score": 0.757, + "MMLU-Pro": 0.859, + "GPQA": 0.666, + "IFEval": 0.918, + "WildBench": 0.833, + "Omni-MATH": 0.511 + } + }, + { + "model_id": "anthropic/claude-opus-4-20250514-thinking-10k", + "name": "Claude 4 Opus 20250514, extended thinking", + "developer": "anthropic", + "scores": { + "Mean score": 0.78, + "MMLU-Pro": 0.875, + "GPQA": 0.709, + "IFEval": 0.849, + "WildBench": 0.852, + "Omni-MATH": 0.616 + } + }, + { + "model_id": "anthropic/claude-sonnet-4-20250514", + "name": "claude-sonnet-4-20250514", + "developer": "anthropic", + "scores": { + "Mean score": 0.733, + "MMLU-Pro": 0.843, + "GPQA": 0.643, + "IFEval": 0.839, + "WildBench": 0.825, + "Omni-MATH": 0.512 + } + }, + { + "model_id": "anthropic/claude-sonnet-4-20250514-thinking-10k", + "name": "Claude 4 Sonnet 20250514, extended thinking", + "developer": "anthropic", + "scores": { + "Mean score": 0.766, + "MMLU-Pro": 0.843, + "GPQA": 0.706, + "IFEval": 0.84, + "WildBench": 0.838, + "Omni-MATH": 0.602 + } + }, + { + "model_id": "deepseek-ai/deepseek-r1-0528", + "name": "DeepSeek-R1-0528", + "developer": "deepseek-ai", + "scores": { + "Mean score": 0.699, + "MMLU-Pro": 0.793, + "GPQA": 0.666, + "IFEval": 0.784, + "WildBench": 0.828, + "Omni-MATH": 0.424 + } + }, + { + "model_id": "deepseek-ai/deepseek-v3", + "name": "DeepSeek v3", + "developer": "deepseek-ai", + "scores": { + "Mean score": 0.665, + "MMLU-Pro": 0.723, + "GPQA": 0.538, + "IFEval": 0.832, + "WildBench": 0.831, + "Omni-MATH": 0.403 + } + }, + { + "model_id": "google/gemini-1.5-flash-002", + "name": "Gemini 1.5 Flash 002", + "developer": "google", + "scores": { + "Mean score": 0.609, + "MMLU-Pro": 0.678, + "GPQA": 0.437, + "IFEval": 0.831, + "WildBench": 0.792, + "Omni-MATH": 0.305 + } + }, + { + "model_id": "google/gemini-1.5-pro-002", + "name": "Gemini 1.5 Pro 002", + "developer": "google", + "scores": { + "Mean score": 0.657, + "MMLU-Pro": 0.737, + "GPQA": 0.534, + "IFEval": 0.837, + "WildBench": 0.813, + "Omni-MATH": 0.364 + } + }, + { + "model_id": "google/gemini-2.0-flash-001", + "name": "Gemini 2.0 Flash", + "developer": "google", + "scores": { + "Mean score": 0.679, + "MMLU-Pro": 0.737, + "GPQA": 0.556, + "IFEval": 0.841, + "WildBench": 0.8, + "Omni-MATH": 0.459 + } + }, + { + "model_id": "google/gemini-2.0-flash-lite-preview-02-05", + "name": "Gemini 2.0 Flash Lite 02-05 preview", + "developer": "google", + "scores": { + "Mean score": 0.642, + "MMLU-Pro": 0.72, + "GPQA": 0.5, + "IFEval": 0.824, + "WildBench": 0.79, + "Omni-MATH": 0.374 + } + }, + { + "model_id": "google/gemini-2.5-flash-lite", + "name": "Gemini 2.5 Flash-Lite", + "developer": "google", + "scores": { + "Mean score": 0.591, + "MMLU-Pro": 0.537, + "GPQA": 0.309, + "IFEval": 0.81, + "WildBench": 0.818, + "Omni-MATH": 0.48 + } + }, + { + "model_id": "google/gemini-2.5-flash-preview-04-17", + "name": "Gemini 2.5 Flash 04-17 preview", + "developer": "google", + "scores": { + "Mean score": 0.626, + "MMLU-Pro": 0.639, + "GPQA": 0.39, + "IFEval": 0.898, + "WildBench": 0.817, + "Omni-MATH": 0.384 + } + }, + { + "model_id": "google/gemini-2.5-pro-preview-03-25", + "name": "Gemini 2.5 Pro 03-25 preview", + "developer": "google", + "scores": { + "Mean score": 0.745, + "MMLU-Pro": 0.863, + "GPQA": 0.749, + "IFEval": 0.84, + "WildBench": 0.857, + "Omni-MATH": 0.416 + } + }, + { + "model_id": "ibm/granite-3.3-8b-instruct", + "name": "IBM Granite 3.3 8B Instruct", + "developer": "ibm", + "scores": { + "Mean score": 0.463, + "MMLU-Pro": 0.343, + "GPQA": 0.325, + "IFEval": 0.729, + "WildBench": 0.741, + "Omni-MATH": 0.176 + } + }, + { + "model_id": "marin-community/marin-8b-instruct", + "name": "Marin 8B Instruct", + "developer": "marin-community", + "scores": { + "Mean score": 0.325, + "MMLU-Pro": 0.188, + "GPQA": 0.168, + "IFEval": 0.632, + "WildBench": 0.477, + "Omni-MATH": 0.16 + } + }, + { + "model_id": "meta/llama-3.1-405b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 405B", + "developer": "meta", + "scores": { + "Mean score": 0.618, + "MMLU-Pro": 0.723, + "GPQA": 0.522, + "IFEval": 0.811, + "WildBench": 0.783, + "Omni-MATH": 0.249 + } + }, + { + "model_id": "meta/llama-3.1-70b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 70B", + "developer": "meta", + "scores": { + "Mean score": 0.574, + "MMLU-Pro": 0.653, + "GPQA": 0.426, + "IFEval": 0.821, + "WildBench": 0.758, + "Omni-MATH": 0.21 + } + }, + { + "model_id": "meta/llama-3.1-8b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 8B", + "developer": "meta", + "scores": { + "Mean score": 0.444, + "MMLU-Pro": 0.406, + "GPQA": 0.247, + "IFEval": 0.743, + "WildBench": 0.686, + "Omni-MATH": 0.137 + } + }, + { + "model_id": "meta/llama-4-maverick-17b-128e-instruct-fp8", + "name": "Llama 4 Maverick 17Bx128E Instruct FP8", + "developer": "meta", + "scores": { + "Mean score": 0.718, + "MMLU-Pro": 0.81, + "GPQA": 0.65, + "IFEval": 0.908, + "WildBench": 0.8, + "Omni-MATH": 0.422 + } + }, + { + "model_id": "meta/llama-4-scout-17b-16e-instruct", + "name": "Llama 4 Scout 17Bx16E Instruct", + "developer": "meta", + "scores": { + "Mean score": 0.644, + "MMLU-Pro": 0.742, + "GPQA": 0.507, + "IFEval": 0.818, + "WildBench": 0.779, + "Omni-MATH": 0.373 + } + }, + { + "model_id": "mistralai/mistral-7b-instruct-v0.3", + "name": "Mistral Instruct v0.3 7B", + "developer": "mistralai", + "scores": { + "Mean score": 0.376, + "MMLU-Pro": 0.277, + "GPQA": 0.303, + "IFEval": 0.567, + "WildBench": 0.66, + "Omni-MATH": 0.072 + } + }, + { + "model_id": "mistralai/mistral-large-2411", + "name": "Mistral Large 2411", + "developer": "mistralai", + "scores": { + "Mean score": 0.598, + "MMLU-Pro": 0.599, + "GPQA": 0.435, + "IFEval": 0.876, + "WildBench": 0.801, + "Omni-MATH": 0.281 + } + }, + { + "model_id": "mistralai/mistral-small-2503", + "name": "mistral-small-2503", + "developer": "mistralai", + "scores": { + "Mean score": 0.558, + "MMLU-Pro": 0.61, + "GPQA": 0.392, + "IFEval": 0.75, + "WildBench": 0.788, + "Omni-MATH": 0.248 + } + }, + { + "model_id": "mistralai/mixtral-8x22b-instruct-v0.1", + "name": "Mixtral Instruct 8x22B", + "developer": "mistralai", + "scores": { + "Mean score": 0.478, + "MMLU-Pro": 0.46, + "GPQA": 0.334, + "IFEval": 0.724, + "WildBench": 0.711, + "Omni-MATH": 0.163 + } + }, + { + "model_id": "mistralai/mixtral-8x7b-instruct-v0.1", + "name": "Mixtral Instruct 8x7B", + "developer": "mistralai", + "scores": { + "Mean score": 0.397, + "MMLU-Pro": 0.335, + "GPQA": 0.296, + "IFEval": 0.575, + "WildBench": 0.673, + "Omni-MATH": 0.105 + } + }, + { + "model_id": "moonshotai/kimi-k2-instruct", + "name": "Kimi K2 Instruct", + "developer": "moonshotai", + "scores": { + "Mean score": 0.768, + "MMLU-Pro": 0.819, + "GPQA": 0.652, + "IFEval": 0.85, + "WildBench": 0.862, + "Omni-MATH": 0.654 + } + }, + { + "model_id": "openai/gpt-4.1-2025-04-14", + "name": "gpt-4.1-2025-04-14", + "developer": "openai", + "scores": { + "Mean score": 0.727, + "MMLU-Pro": 0.811, + "GPQA": 0.659, + "IFEval": 0.838, + "WildBench": 0.854, + "Omni-MATH": 0.471 + } + }, + { + "model_id": "openai/gpt-4.1-mini-2025-04-14", + "name": "GPT-4.1 mini 2025-04-14", + "developer": "openai", + "scores": { + "Mean score": 0.726, + "MMLU-Pro": 0.783, + "GPQA": 0.614, + "IFEval": 0.904, + "WildBench": 0.838, + "Omni-MATH": 0.491 + } + }, + { + "model_id": "openai/gpt-4.1-nano-2025-04-14", + "name": "GPT-4.1 nano 2025-04-14", + "developer": "openai", + "scores": { + "Mean score": 0.616, + "MMLU-Pro": 0.55, + "GPQA": 0.507, + "IFEval": 0.843, + "WildBench": 0.811, + "Omni-MATH": 0.367 + } + }, + { + "model_id": "openai/gpt-4o-2024-11-20", + "name": "GPT-4o 2024-11-20", + "developer": "openai", + "scores": { + "Mean score": 0.634, + "MMLU-Pro": 0.713, + "GPQA": 0.52, + "IFEval": 0.817, + "WildBench": 0.828, + "Omni-MATH": 0.293 + } + }, + { + "model_id": "openai/gpt-4o-mini-2024-07-18", + "name": "GPT-4o mini 2024-07-18", + "developer": "openai", + "scores": { + "Mean score": 0.565, + "MMLU-Pro": 0.603, + "GPQA": 0.368, + "IFEval": 0.782, + "WildBench": 0.791, + "Omni-MATH": 0.28 + } + }, + { + "model_id": "openai/gpt-5-2025-08-07", + "name": "gpt-5-2025-08-07", + "developer": "openai", + "scores": { + "Mean score": 0.807, + "MMLU-Pro": 0.863, + "GPQA": 0.791, + "IFEval": 0.875, + "WildBench": 0.857, + "Omni-MATH": 0.647 + } + }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "name": "GPT-5 mini 2025-08-07", + "developer": "openai", + "scores": { + "Mean score": 0.819, + "MMLU-Pro": 0.835, + "GPQA": 0.756, + "IFEval": 0.927, + "WildBench": 0.855, + "Omni-MATH": 0.722 + } + }, + { + "model_id": "openai/gpt-5-nano-2025-08-07", + "name": "GPT-5 nano 2025-08-07", + "developer": "openai", + "scores": { + "Mean score": 0.748, + "MMLU-Pro": 0.778, + "GPQA": 0.679, + "IFEval": 0.932, + "WildBench": 0.806, + "Omni-MATH": 0.547 + } + }, + { + "model_id": "openai/gpt-oss-120b", + "name": "gpt-oss-120b", + "developer": "openai", + "scores": { + "Mean score": 0.77, + "MMLU-Pro": 0.795, + "GPQA": 0.684, + "IFEval": 0.836, + "WildBench": 0.845, + "Omni-MATH": 0.688 + } + }, + { + "model_id": "openai/gpt-oss-20b", + "name": "gpt-oss-20b", + "developer": "openai", + "scores": { + "Mean score": 0.674, + "MMLU-Pro": 0.74, + "GPQA": 0.594, + "IFEval": 0.732, + "WildBench": 0.737, + "Omni-MATH": 0.565 + } + }, + { + "model_id": "openai/o3-2025-04-16", + "name": "o3 2025-04-16", + "developer": "openai", + "scores": { + "Mean score": 0.811, + "MMLU-Pro": 0.859, + "GPQA": 0.753, + "IFEval": 0.869, + "WildBench": 0.861, + "Omni-MATH": 0.714 + } + }, + { + "model_id": "openai/o4-mini-2025-04-16", + "name": "o4-mini-2025-04-16", + "developer": "openai", + "scores": { + "Mean score": 0.812, + "MMLU-Pro": 0.82, + "GPQA": 0.735, + "IFEval": 0.929, + "WildBench": 0.854, + "Omni-MATH": 0.72 + } + }, + { + "model_id": "qwen/qwen2.5-72b-instruct-turbo", + "name": "Qwen2.5 Instruct Turbo 72B", + "developer": "qwen", + "scores": { + "Mean score": 0.599, + "MMLU-Pro": 0.631, + "GPQA": 0.426, + "IFEval": 0.806, + "WildBench": 0.802, + "Omni-MATH": 0.33 + } + }, + { + "model_id": "qwen/qwen2.5-7b-instruct-turbo", + "name": "Qwen2.5 Instruct Turbo 7B", + "developer": "qwen", + "scores": { + "Mean score": 0.529, + "MMLU-Pro": 0.539, + "GPQA": 0.341, + "IFEval": 0.741, + "WildBench": 0.731, + "Omni-MATH": 0.294 + } + }, + { + "model_id": "qwen/qwen3-235b-a22b-fp8-tput", + "name": "Qwen3 235B A22B FP8 Throughput", + "developer": "qwen", + "scores": { + "Mean score": 0.726, + "MMLU-Pro": 0.817, + "GPQA": 0.623, + "IFEval": 0.816, + "WildBench": 0.828, + "Omni-MATH": 0.548 + } + }, + { + "model_id": "qwen/qwen3-235b-a22b-instruct-2507-fp8", + "name": "Qwen3 235B A22B Instruct 2507 FP8", + "developer": "qwen", + "scores": { + "Mean score": 0.798, + "MMLU-Pro": 0.844, + "GPQA": 0.726, + "IFEval": 0.835, + "WildBench": 0.866, + "Omni-MATH": 0.718 + } + }, + { + "model_id": "writer/palmyra-fin", + "name": "Palmyra Fin", + "developer": "writer", + "scores": { + "Mean score": 0.577, + "MMLU-Pro": 0.591, + "GPQA": 0.422, + "IFEval": 0.793, + "WildBench": 0.783, + "Omni-MATH": 0.295 + } + }, + { + "model_id": "writer/palmyra-med", + "name": "Palmyra Med", + "developer": "writer", + "scores": { + "Mean score": 0.476, + "MMLU-Pro": 0.411, + "GPQA": 0.368, + "IFEval": 0.767, + "WildBench": 0.676, + "Omni-MATH": 0.156 + } + }, + { + "model_id": "writer/palmyra-x-004", + "name": "Palmyra-X-004", + "developer": "writer", + "scores": { + "Mean score": 0.609, + "MMLU-Pro": 0.657, + "GPQA": 0.395, + "IFEval": 0.872, + "WildBench": 0.802, + "Omni-MATH": 0.32 + } + }, + { + "model_id": "writer/palmyra-x5", + "name": "Palmyra X5", + "developer": "writer", + "scores": { + "Mean score": 0.696, + "MMLU-Pro": 0.804, + "GPQA": 0.661, + "IFEval": 0.823, + "WildBench": 0.78, + "Omni-MATH": 0.414 + } + }, + { + "model_id": "xai/grok-3-beta", + "name": "Grok 3 Beta", + "developer": "xai", + "scores": { + "Mean score": 0.727, + "MMLU-Pro": 0.788, + "GPQA": 0.65, + "IFEval": 0.884, + "WildBench": 0.849, + "Omni-MATH": 0.464 + } + }, + { + "model_id": "xai/grok-3-mini-beta", + "name": "Grok 3 mini Beta", + "developer": "xai", + "scores": { + "Mean score": 0.679, + "MMLU-Pro": 0.799, + "GPQA": 0.675, + "IFEval": 0.951, + "WildBench": 0.651, + "Omni-MATH": 0.318 + } + }, + { + "model_id": "xai/grok-4-0709", + "name": "grok-4-0709", + "developer": "xai", + "scores": { + "Mean score": 0.785, + "MMLU-Pro": 0.851, + "GPQA": 0.726, + "IFEval": 0.949, + "WildBench": 0.797, + "Omni-MATH": 0.603 + } + }, + { + "model_id": "zai-org/glm-4.5-air-fp8", + "name": "GLM-4.5-Air-FP8", + "developer": "zai-org", + "scores": { + "Mean score": 0.67, + "MMLU-Pro": 0.762, + "GPQA": 0.594, + "IFEval": 0.812, + "WildBench": 0.789, + "Omni-MATH": 0.391 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/helm_classic.json b/data/benchmarks/helm_classic.json new file mode 100644 index 0000000000000000000000000000000000000000..a826d03c38f42141548dc36877c11365ad447123 --- /dev/null +++ b/data/benchmarks/helm_classic.json @@ -0,0 +1,1478 @@ +{ + "models": [ + { + "model_id": "Anthropic-LM-v4-s3-52B", + "name": "Anthropic-LM v4-s3 52B", + "developer": "unknown", + "scores": { + "Mean win rate": 0.78, + "MMLU": 0.481, + "BoolQ": 0.815, + "NarrativeQA": 0.728, + "NaturalQuestions (open-book)": 0.686, + "QuAC": 0.431, + "HellaSwag": 0.807, + "OpenbookQA": 0.558, + "TruthfulQA": 0.368, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": 0.154, + "XSUM": 0.134, + "IMDB": 0.934, + "CivilComments": 0.61, + "RAFT": 0.699 + } + }, + { + "model_id": "ai21/J1-Grande-v1-17B", + "name": "J1-Grande v1 17B", + "developer": "ai21", + "scores": { + "Mean win rate": 0.433, + "MMLU": 0.27, + "BoolQ": 0.722, + "NarrativeQA": 0.672, + "NaturalQuestions (open-book)": 0.578, + "QuAC": 0.362, + "HellaSwag": 0.739, + "OpenbookQA": 0.52, + "TruthfulQA": 0.193, + "MS MARCO (TREC)": 0.341, + "CNN/DailyMail": 0.143, + "XSUM": 0.122, + "IMDB": 0.953, + "CivilComments": 0.529, + "RAFT": 0.658 + } + }, + { + "model_id": "ai21/J1-Grande-v2-beta-17B", + "name": "J1-Grande v2 beta 17B", + "developer": "ai21", + "scores": { + "Mean win rate": 0.706, + "MMLU": 0.445, + "BoolQ": 0.812, + "NarrativeQA": 0.725, + "NaturalQuestions (open-book)": 0.625, + "QuAC": 0.392, + "HellaSwag": 0.764, + "OpenbookQA": 0.56, + "TruthfulQA": 0.306, + "MS MARCO (TREC)": 0.46, + "CNN/DailyMail": 0.146, + "XSUM": 0.152, + "IMDB": 0.957, + "CivilComments": 0.546, + "RAFT": 0.679 + } + }, + { + "model_id": "ai21/J1-Jumbo-v1-178B", + "name": "J1-Jumbo v1 178B", + "developer": "ai21", + "scores": { + "Mean win rate": 0.517, + "MMLU": 0.259, + "BoolQ": 0.776, + "NarrativeQA": 0.695, + "NaturalQuestions (open-book)": 0.595, + "QuAC": 0.358, + "HellaSwag": 0.765, + "OpenbookQA": 0.534, + "TruthfulQA": 0.175, + "MS MARCO (TREC)": 0.363, + "CNN/DailyMail": 0.144, + "XSUM": 0.129, + "IMDB": 0.943, + "CivilComments": 0.553, + "RAFT": 0.681 + } + }, + { + "model_id": "ai21/J1-Large-v1-7.5B", + "name": "J1-Large v1 7.5B", + "developer": "ai21", + "scores": { + "Mean win rate": 0.285, + "MMLU": 0.241, + "BoolQ": 0.683, + "NarrativeQA": 0.623, + "NaturalQuestions (open-book)": 0.532, + "QuAC": 0.328, + "HellaSwag": 0.7, + "OpenbookQA": 0.514, + "TruthfulQA": 0.197, + "MS MARCO (TREC)": 0.292, + "CNN/DailyMail": 0.134, + "XSUM": 0.102, + "IMDB": 0.956, + "CivilComments": 0.532, + "RAFT": 0.545 + } + }, + { + "model_id": "ai21/Jurassic-2-Grande-17B", + "name": "Jurassic-2 Grande 17B", + "developer": "ai21", + "scores": { + "Mean win rate": 0.743, + "MMLU": 0.475, + "BoolQ": 0.826, + "NarrativeQA": 0.737, + "NaturalQuestions (open-book)": 0.639, + "QuAC": 0.418, + "HellaSwag": 0.781, + "OpenbookQA": 0.542, + "TruthfulQA": 0.348, + "MS MARCO (TREC)": 0.514, + "CNN/DailyMail": 0.144, + "XSUM": 0.167, + "IMDB": 0.938, + "CivilComments": 0.547, + "RAFT": 0.712 + } + }, + { + "model_id": "ai21/Jurassic-2-Jumbo-178B", + "name": "Jurassic-2 Jumbo 178B", + "developer": "ai21", + "scores": { + "Mean win rate": 0.824, + "MMLU": 0.48, + "BoolQ": 0.829, + "NarrativeQA": 0.733, + "NaturalQuestions (open-book)": 0.669, + "QuAC": 0.435, + "HellaSwag": 0.788, + "OpenbookQA": 0.558, + "TruthfulQA": 0.437, + "MS MARCO (TREC)": 0.661, + "CNN/DailyMail": 0.149, + "XSUM": 0.182, + "IMDB": 0.938, + "CivilComments": 0.57, + "RAFT": 0.746 + } + }, + { + "model_id": "ai21/Jurassic-2-Large-7.5B", + "name": "Jurassic-2 Large 7.5B", + "developer": "ai21", + "scores": { + "Mean win rate": 0.553, + "MMLU": 0.339, + "BoolQ": 0.742, + "NarrativeQA": -1.0, + "NaturalQuestions (open-book)": 0.589, + "QuAC": -1.0, + "HellaSwag": 0.729, + "OpenbookQA": 0.53, + "TruthfulQA": 0.245, + "MS MARCO (TREC)": 0.464, + "CNN/DailyMail": 0.136, + "XSUM": 0.142, + "IMDB": 0.956, + "CivilComments": 0.57, + "RAFT": 0.622 + } + }, + { + "model_id": "aleph-alpha/Luminous-Base-13B", + "name": "Luminous Base 13B", + "developer": "aleph-alpha", + "scores": { + "Mean win rate": 0.315, + "MMLU": 0.27, + "BoolQ": 0.719, + "NarrativeQA": 0.605, + "NaturalQuestions (open-book)": 0.568, + "QuAC": 0.334, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.182, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": 0.11, + "XSUM": 0.105, + "IMDB": 0.939, + "CivilComments": 0.544, + "RAFT": 0.473 + } + }, + { + "model_id": "aleph-alpha/Luminous-Extended-30B", + "name": "Luminous Extended 30B", + "developer": "aleph-alpha", + "scores": { + "Mean win rate": 0.485, + "MMLU": 0.321, + "BoolQ": 0.767, + "NarrativeQA": 0.665, + "NaturalQuestions (open-book)": 0.609, + "QuAC": 0.349, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.221, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": 0.139, + "XSUM": 0.124, + "IMDB": 0.947, + "CivilComments": 0.524, + "RAFT": 0.523 + } + }, + { + "model_id": "aleph-alpha/Luminous-Supreme-70B", + "name": "Luminous Supreme 70B", + "developer": "aleph-alpha", + "scores": { + "Mean win rate": 0.662, + "MMLU": 0.38, + "BoolQ": 0.775, + "NarrativeQA": 0.711, + "NaturalQuestions (open-book)": 0.649, + "QuAC": 0.37, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.222, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": 0.15, + "XSUM": 0.136, + "IMDB": 0.959, + "CivilComments": 0.562, + "RAFT": 0.653 + } + }, + { + "model_id": "bigscience/BLOOM-176B", + "name": "BLOOM 176B", + "developer": "bigscience", + "scores": { + "Mean win rate": 0.446, + "MMLU": 0.299, + "BoolQ": 0.704, + "NarrativeQA": 0.662, + "NaturalQuestions (open-book)": 0.621, + "QuAC": 0.361, + "HellaSwag": 0.744, + "OpenbookQA": 0.534, + "TruthfulQA": 0.205, + "MS MARCO (TREC)": 0.386, + "CNN/DailyMail": 0.08, + "XSUM": 0.03, + "IMDB": 0.945, + "CivilComments": 0.62, + "RAFT": 0.592 + } + }, + { + "model_id": "bigscience/T0pp-11B", + "name": "T0pp 11B", + "developer": "bigscience", + "scores": { + "Mean win rate": 0.197, + "MMLU": 0.407, + "BoolQ": 0.0, + "NarrativeQA": 0.151, + "NaturalQuestions (open-book)": 0.19, + "QuAC": 0.121, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.377, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": 0.122, + "XSUM": 0.09, + "IMDB": 0.207, + "CivilComments": 0.234, + "RAFT": 0.118 + } + }, + { + "model_id": "cohere/Cohere-Command-beta-52.4B", + "name": "Cohere Command beta 52.4B", + "developer": "cohere", + "scores": { + "Mean win rate": 0.874, + "MMLU": 0.452, + "BoolQ": 0.856, + "NarrativeQA": 0.752, + "NaturalQuestions (open-book)": 0.76, + "QuAC": 0.432, + "HellaSwag": 0.811, + "OpenbookQA": 0.582, + "TruthfulQA": 0.269, + "MS MARCO (TREC)": 0.762, + "CNN/DailyMail": 0.161, + "XSUM": 0.152, + "IMDB": 0.96, + "CivilComments": 0.601, + "RAFT": 0.667 + } + }, + { + "model_id": "cohere/Cohere-Command-beta-6.1B", + "name": "Cohere Command beta 6.1B", + "developer": "cohere", + "scores": { + "Mean win rate": 0.675, + "MMLU": 0.406, + "BoolQ": 0.798, + "NarrativeQA": 0.709, + "NaturalQuestions (open-book)": 0.717, + "QuAC": 0.375, + "HellaSwag": 0.752, + "OpenbookQA": 0.55, + "TruthfulQA": 0.203, + "MS MARCO (TREC)": 0.709, + "CNN/DailyMail": 0.153, + "XSUM": 0.122, + "IMDB": 0.961, + "CivilComments": 0.54, + "RAFT": 0.634 + } + }, + { + "model_id": "cohere/Cohere-large-v20220720-13.1B", + "name": "Cohere large v20220720 13.1B", + "developer": "cohere", + "scores": { + "Mean win rate": 0.372, + "MMLU": 0.324, + "BoolQ": 0.725, + "NarrativeQA": 0.625, + "NaturalQuestions (open-book)": 0.573, + "QuAC": 0.338, + "HellaSwag": 0.736, + "OpenbookQA": 0.542, + "TruthfulQA": 0.181, + "MS MARCO (TREC)": 0.33, + "CNN/DailyMail": 0.126, + "XSUM": 0.108, + "IMDB": 0.933, + "CivilComments": 0.507, + "RAFT": 0.596 + } + }, + { + "model_id": "cohere/Cohere-medium-v20220720-6.1B", + "name": "Cohere medium v20220720 6.1B", + "developer": "cohere", + "scores": { + "Mean win rate": 0.23, + "MMLU": 0.279, + "BoolQ": 0.659, + "NarrativeQA": 0.559, + "NaturalQuestions (open-book)": 0.504, + "QuAC": 0.279, + "HellaSwag": 0.706, + "OpenbookQA": 0.496, + "TruthfulQA": 0.19, + "MS MARCO (TREC)": 0.374, + "CNN/DailyMail": 0.077, + "XSUM": 0.087, + "IMDB": 0.935, + "CivilComments": 0.504, + "RAFT": 0.52 + } + }, + { + "model_id": "cohere/Cohere-medium-v20221108-6.1B", + "name": "Cohere medium v20221108 6.1B", + "developer": "cohere", + "scores": { + "Mean win rate": 0.312, + "MMLU": 0.254, + "BoolQ": 0.7, + "NarrativeQA": 0.61, + "NaturalQuestions (open-book)": 0.517, + "QuAC": 0.314, + "HellaSwag": 0.726, + "OpenbookQA": 0.538, + "TruthfulQA": 0.215, + "MS MARCO (TREC)": 0.373, + "CNN/DailyMail": 0.121, + "XSUM": 0.099, + "IMDB": 0.935, + "CivilComments": 0.5, + "RAFT": 0.591 + } + }, + { + "model_id": "cohere/Cohere-small-v20220720-410M", + "name": "Cohere small v20220720 410M", + "developer": "cohere", + "scores": { + "Mean win rate": 0.109, + "MMLU": 0.264, + "BoolQ": 0.457, + "NarrativeQA": 0.294, + "NaturalQuestions (open-book)": 0.309, + "QuAC": 0.219, + "HellaSwag": 0.483, + "OpenbookQA": 0.348, + "TruthfulQA": 0.217, + "MS MARCO (TREC)": 0.304, + "CNN/DailyMail": 0.063, + "XSUM": 0.033, + "IMDB": 0.578, + "CivilComments": 0.501, + "RAFT": 0.492 + } + }, + { + "model_id": "cohere/Cohere-xlarge-v20220609-52.4B", + "name": "Cohere xlarge v20220609 52.4B", + "developer": "cohere", + "scores": { + "Mean win rate": 0.56, + "MMLU": 0.353, + "BoolQ": 0.718, + "NarrativeQA": 0.65, + "NaturalQuestions (open-book)": 0.595, + "QuAC": 0.361, + "HellaSwag": 0.811, + "OpenbookQA": 0.55, + "TruthfulQA": 0.198, + "MS MARCO (TREC)": 0.459, + "CNN/DailyMail": 0.144, + "XSUM": 0.129, + "IMDB": 0.956, + "CivilComments": 0.532, + "RAFT": 0.633 + } + }, + { + "model_id": "cohere/Cohere-xlarge-v20221108-52.4B", + "name": "Cohere xlarge v20221108 52.4B", + "developer": "cohere", + "scores": { + "Mean win rate": 0.664, + "MMLU": 0.382, + "BoolQ": 0.762, + "NarrativeQA": 0.672, + "NaturalQuestions (open-book)": 0.628, + "QuAC": 0.374, + "HellaSwag": 0.81, + "OpenbookQA": 0.588, + "TruthfulQA": 0.169, + "MS MARCO (TREC)": 0.55, + "CNN/DailyMail": 0.153, + "XSUM": 0.153, + "IMDB": 0.956, + "CivilComments": 0.524, + "RAFT": 0.624 + } + }, + { + "model_id": "eleutherai/Pythia-12B", + "name": "Pythia 12B", + "developer": "eleutherai", + "scores": { + "Mean win rate": 0.257, + "MMLU": 0.274, + "BoolQ": 0.662, + "NarrativeQA": 0.596, + "NaturalQuestions (open-book)": 0.581, + "QuAC": 0.313, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.177, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.931, + "CivilComments": 0.531, + "RAFT": 0.514 + } + }, + { + "model_id": "eleutherai/Pythia-6.9B", + "name": "Pythia 6.9B", + "developer": "eleutherai", + "scores": { + "Mean win rate": 0.196, + "MMLU": 0.236, + "BoolQ": 0.631, + "NarrativeQA": 0.528, + "NaturalQuestions (open-book)": 0.539, + "QuAC": 0.296, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.213, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.928, + "CivilComments": 0.511, + "RAFT": 0.502 + } + }, + { + "model_id": "google/Palmyra-X-43B", + "name": "Palmyra X 43B", + "developer": "google", + "scores": { + "Mean win rate": 0.732, + "MMLU": 0.609, + "BoolQ": 0.896, + "NarrativeQA": 0.742, + "NaturalQuestions (open-book)": -1.0, + "QuAC": 0.473, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.616, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": 0.049, + "XSUM": 0.149, + "IMDB": 0.935, + "CivilComments": 0.008, + "RAFT": 0.701 + } + }, + { + "model_id": "google/T5-11B", + "name": "T5 11B", + "developer": "google", + "scores": { + "Mean win rate": 0.131, + "MMLU": 0.29, + "BoolQ": 0.761, + "NarrativeQA": 0.086, + "NaturalQuestions (open-book)": 0.477, + "QuAC": 0.116, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.133, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": 0.043, + "XSUM": 0.015, + "IMDB": 0.379, + "CivilComments": 0.509, + "RAFT": 0.37 + } + }, + { + "model_id": "google/UL2-20B", + "name": "UL2 20B", + "developer": "google", + "scores": { + "Mean win rate": 0.167, + "MMLU": 0.291, + "BoolQ": 0.746, + "NarrativeQA": 0.083, + "NaturalQuestions (open-book)": 0.349, + "QuAC": 0.144, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.193, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": 0.03, + "XSUM": 0.058, + "IMDB": 0.337, + "CivilComments": 0.521, + "RAFT": 0.404 + } + }, + { + "model_id": "lmsys/Vicuna-v1.3-13B", + "name": "Vicuna v1.3 13B", + "developer": "lmsys", + "scores": { + "Mean win rate": 0.706, + "MMLU": 0.462, + "BoolQ": 0.808, + "NarrativeQA": 0.691, + "NaturalQuestions (open-book)": 0.686, + "QuAC": 0.403, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.385, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.762, + "CivilComments": 0.645, + "RAFT": 0.657 + } + }, + { + "model_id": "lmsys/Vicuna-v1.3-7B", + "name": "Vicuna v1.3 7B", + "developer": "lmsys", + "scores": { + "Mean win rate": 0.625, + "MMLU": 0.434, + "BoolQ": 0.76, + "NarrativeQA": 0.643, + "NaturalQuestions (open-book)": 0.634, + "QuAC": 0.392, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.292, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.916, + "CivilComments": 0.62, + "RAFT": 0.693 + } + }, + { + "model_id": "meta/LLaMA-13B", + "name": "LLaMA 13B", + "developer": "meta", + "scores": { + "Mean win rate": 0.595, + "MMLU": 0.422, + "BoolQ": 0.714, + "NarrativeQA": 0.711, + "NaturalQuestions (open-book)": 0.614, + "QuAC": 0.347, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.324, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.928, + "CivilComments": 0.6, + "RAFT": 0.643 + } + }, + { + "model_id": "meta/LLaMA-30B", + "name": "LLaMA 30B", + "developer": "meta", + "scores": { + "Mean win rate": 0.781, + "MMLU": 0.531, + "BoolQ": 0.861, + "NarrativeQA": 0.752, + "NaturalQuestions (open-book)": 0.666, + "QuAC": 0.39, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.344, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.927, + "CivilComments": 0.549, + "RAFT": 0.752 + } + }, + { + "model_id": "meta/LLaMA-65B", + "name": "LLaMA 65B", + "developer": "meta", + "scores": { + "Mean win rate": 0.908, + "MMLU": 0.584, + "BoolQ": 0.871, + "NarrativeQA": 0.755, + "NaturalQuestions (open-book)": 0.672, + "QuAC": 0.401, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.508, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.962, + "CivilComments": 0.655, + "RAFT": 0.702 + } + }, + { + "model_id": "meta/LLaMA-7B", + "name": "LLaMA 7B", + "developer": "meta", + "scores": { + "Mean win rate": 0.533, + "MMLU": 0.321, + "BoolQ": 0.756, + "NarrativeQA": 0.669, + "NaturalQuestions (open-book)": 0.589, + "QuAC": 0.338, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.28, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.947, + "CivilComments": 0.563, + "RAFT": 0.573 + } + }, + { + "model_id": "meta/Llama-2-13B", + "name": "Llama 2 13B", + "developer": "meta", + "scores": { + "Mean win rate": 0.823, + "MMLU": 0.507, + "BoolQ": 0.811, + "NarrativeQA": 0.744, + "NaturalQuestions (open-book)": 0.637, + "QuAC": 0.424, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.33, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.962, + "CivilComments": 0.588, + "RAFT": 0.707 + } + }, + { + "model_id": "meta/Llama-2-70B", + "name": "Llama 2 70B", + "developer": "meta", + "scores": { + "Mean win rate": 0.944, + "MMLU": 0.582, + "BoolQ": 0.886, + "NarrativeQA": 0.77, + "NaturalQuestions (open-book)": 0.674, + "QuAC": 0.484, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.554, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.961, + "CivilComments": 0.652, + "RAFT": 0.727 + } + }, + { + "model_id": "meta/Llama-2-7B", + "name": "Llama 2 7B", + "developer": "meta", + "scores": { + "Mean win rate": 0.607, + "MMLU": 0.431, + "BoolQ": 0.762, + "NarrativeQA": 0.691, + "NaturalQuestions (open-book)": 0.611, + "QuAC": 0.406, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.272, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.907, + "CivilComments": 0.562, + "RAFT": 0.643 + } + }, + { + "model_id": "meta/OPT-175B", + "name": "OPT 175B", + "developer": "meta", + "scores": { + "Mean win rate": 0.609, + "MMLU": 0.318, + "BoolQ": 0.793, + "NarrativeQA": 0.671, + "NaturalQuestions (open-book)": 0.615, + "QuAC": 0.36, + "HellaSwag": 0.791, + "OpenbookQA": 0.586, + "TruthfulQA": 0.25, + "MS MARCO (TREC)": 0.448, + "CNN/DailyMail": 0.146, + "XSUM": 0.155, + "IMDB": 0.947, + "CivilComments": 0.505, + "RAFT": 0.606 + } + }, + { + "model_id": "meta/OPT-66B", + "name": "OPT 66B", + "developer": "meta", + "scores": { + "Mean win rate": 0.448, + "MMLU": 0.276, + "BoolQ": 0.76, + "NarrativeQA": 0.638, + "NaturalQuestions (open-book)": 0.596, + "QuAC": 0.357, + "HellaSwag": 0.745, + "OpenbookQA": 0.534, + "TruthfulQA": 0.201, + "MS MARCO (TREC)": 0.482, + "CNN/DailyMail": 0.136, + "XSUM": 0.126, + "IMDB": 0.917, + "CivilComments": 0.506, + "RAFT": 0.557 + } + }, + { + "model_id": "microsoft/TNLG-v2-530B", + "name": "TNLG v2 530B", + "developer": "microsoft", + "scores": { + "Mean win rate": 0.787, + "MMLU": 0.469, + "BoolQ": 0.809, + "NarrativeQA": 0.722, + "NaturalQuestions (open-book)": 0.642, + "QuAC": 0.39, + "HellaSwag": 0.799, + "OpenbookQA": 0.562, + "TruthfulQA": 0.251, + "MS MARCO (TREC)": 0.643, + "CNN/DailyMail": 0.161, + "XSUM": 0.169, + "IMDB": 0.941, + "CivilComments": 0.601, + "RAFT": 0.679 + } + }, + { + "model_id": "microsoft/TNLG-v2-6.7B", + "name": "TNLG v2 6.7B", + "developer": "microsoft", + "scores": { + "Mean win rate": 0.309, + "MMLU": 0.242, + "BoolQ": 0.698, + "NarrativeQA": 0.631, + "NaturalQuestions (open-book)": 0.561, + "QuAC": 0.345, + "HellaSwag": 0.704, + "OpenbookQA": 0.478, + "TruthfulQA": 0.167, + "MS MARCO (TREC)": 0.332, + "CNN/DailyMail": 0.146, + "XSUM": 0.11, + "IMDB": 0.927, + "CivilComments": 0.532, + "RAFT": 0.525 + } + }, + { + "model_id": "mistralai/Mistral-v0.1-7B", + "name": "Mistral v0.1 7B", + "developer": "mistralai", + "scores": { + "Mean win rate": 0.884, + "MMLU": 0.572, + "BoolQ": 0.874, + "NarrativeQA": 0.716, + "NaturalQuestions (open-book)": 0.687, + "QuAC": 0.423, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.422, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.962, + "CivilComments": 0.624, + "RAFT": 0.707 + } + }, + { + "model_id": "mosaicml/MPT-30B", + "name": "MPT 30B", + "developer": "mosaicml", + "scores": { + "Mean win rate": 0.714, + "MMLU": 0.437, + "BoolQ": 0.704, + "NarrativeQA": 0.732, + "NaturalQuestions (open-book)": 0.673, + "QuAC": 0.393, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.231, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.959, + "CivilComments": 0.599, + "RAFT": 0.723 + } + }, + { + "model_id": "mosaicml/MPT-Instruct-30B", + "name": "MPT-Instruct 30B", + "developer": "mosaicml", + "scores": { + "Mean win rate": 0.716, + "MMLU": 0.444, + "BoolQ": 0.85, + "NarrativeQA": 0.733, + "NaturalQuestions (open-book)": 0.697, + "QuAC": 0.327, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.234, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.956, + "CivilComments": 0.573, + "RAFT": 0.68 + } + }, + { + "model_id": "openai/GPT-J-6B", + "name": "GPT-J 6B", + "developer": "openai", + "scores": { + "Mean win rate": 0.273, + "MMLU": 0.249, + "BoolQ": 0.649, + "NarrativeQA": 0.545, + "NaturalQuestions (open-book)": 0.559, + "QuAC": 0.33, + "HellaSwag": 0.663, + "OpenbookQA": 0.514, + "TruthfulQA": 0.199, + "MS MARCO (TREC)": 0.345, + "CNN/DailyMail": 0.131, + "XSUM": 0.096, + "IMDB": 0.939, + "CivilComments": 0.52, + "RAFT": 0.619 + } + }, + { + "model_id": "openai/GPT-NeoX-20B", + "name": "GPT-NeoX 20B", + "developer": "openai", + "scores": { + "Mean win rate": 0.351, + "MMLU": 0.276, + "BoolQ": 0.683, + "NarrativeQA": 0.599, + "NaturalQuestions (open-book)": 0.596, + "QuAC": 0.326, + "HellaSwag": 0.718, + "OpenbookQA": 0.524, + "TruthfulQA": 0.216, + "MS MARCO (TREC)": 0.398, + "CNN/DailyMail": 0.123, + "XSUM": 0.102, + "IMDB": 0.948, + "CivilComments": 0.516, + "RAFT": 0.505 + } + }, + { + "model_id": "openai/ada-350M", + "name": "ada 350M", + "developer": "openai", + "scores": { + "Mean win rate": 0.108, + "MMLU": 0.243, + "BoolQ": 0.581, + "NarrativeQA": 0.326, + "NaturalQuestions (open-book)": 0.365, + "QuAC": 0.242, + "HellaSwag": 0.435, + "OpenbookQA": 0.38, + "TruthfulQA": 0.215, + "MS MARCO (TREC)": 0.29, + "CNN/DailyMail": 0.09, + "XSUM": 0.022, + "IMDB": 0.849, + "CivilComments": 0.517, + "RAFT": 0.423 + } + }, + { + "model_id": "openai/babbage-1.3B", + "name": "babbage 1.3B", + "developer": "openai", + "scores": { + "Mean win rate": 0.114, + "MMLU": 0.235, + "BoolQ": 0.574, + "NarrativeQA": 0.491, + "NaturalQuestions (open-book)": 0.451, + "QuAC": 0.273, + "HellaSwag": 0.555, + "OpenbookQA": 0.438, + "TruthfulQA": 0.188, + "MS MARCO (TREC)": 0.317, + "CNN/DailyMail": 0.079, + "XSUM": 0.045, + "IMDB": 0.597, + "CivilComments": 0.519, + "RAFT": 0.455 + } + }, + { + "model_id": "openai/curie-6.7B", + "name": "curie 6.7B", + "developer": "openai", + "scores": { + "Mean win rate": 0.247, + "MMLU": 0.243, + "BoolQ": 0.656, + "NarrativeQA": 0.604, + "NaturalQuestions (open-book)": 0.552, + "QuAC": 0.321, + "HellaSwag": 0.682, + "OpenbookQA": 0.502, + "TruthfulQA": 0.232, + "MS MARCO (TREC)": 0.3, + "CNN/DailyMail": 0.113, + "XSUM": 0.091, + "IMDB": 0.889, + "CivilComments": 0.539, + "RAFT": 0.49 + } + }, + { + "model_id": "openai/davinci-175B", + "name": "davinci 175B", + "developer": "openai", + "scores": { + "Mean win rate": 0.538, + "MMLU": 0.422, + "BoolQ": 0.722, + "NarrativeQA": 0.687, + "NaturalQuestions (open-book)": 0.625, + "QuAC": 0.36, + "HellaSwag": 0.775, + "OpenbookQA": 0.586, + "TruthfulQA": 0.194, + "MS MARCO (TREC)": 0.378, + "CNN/DailyMail": 0.127, + "XSUM": 0.126, + "IMDB": 0.933, + "CivilComments": 0.532, + "RAFT": 0.642 + } + }, + { + "model_id": "openai/gpt-3.5-turbo-0301", + "name": "gpt-3.5-turbo-0301", + "developer": "openai", + "scores": { + "Mean win rate": 0.76, + "MMLU": 0.59, + "BoolQ": 0.74, + "NarrativeQA": 0.663, + "NaturalQuestions (open-book)": 0.624, + "QuAC": 0.512, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.609, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.899, + "CivilComments": 0.674, + "RAFT": 0.768 + } + }, + { + "model_id": "openai/gpt-3.5-turbo-0613", + "name": "gpt-3.5-turbo-0613", + "developer": "openai", + "scores": { + "Mean win rate": 0.783, + "MMLU": 0.391, + "BoolQ": 0.87, + "NarrativeQA": 0.625, + "NaturalQuestions (open-book)": 0.675, + "QuAC": 0.485, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.339, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.943, + "CivilComments": 0.696, + "RAFT": 0.748 + } + }, + { + "model_id": "openai/text-ada-001", + "name": "text-ada-001", + "developer": "openai", + "scores": { + "Mean win rate": 0.107, + "MMLU": 0.238, + "BoolQ": 0.464, + "NarrativeQA": 0.238, + "NaturalQuestions (open-book)": 0.149, + "QuAC": 0.176, + "HellaSwag": 0.429, + "OpenbookQA": 0.346, + "TruthfulQA": 0.232, + "MS MARCO (TREC)": 0.302, + "CNN/DailyMail": 0.136, + "XSUM": 0.034, + "IMDB": 0.822, + "CivilComments": 0.503, + "RAFT": 0.406 + } + }, + { + "model_id": "openai/text-babbage-001", + "name": "text-babbage-001", + "developer": "openai", + "scores": { + "Mean win rate": 0.229, + "MMLU": 0.229, + "BoolQ": 0.451, + "NarrativeQA": 0.429, + "NaturalQuestions (open-book)": 0.33, + "QuAC": 0.284, + "HellaSwag": 0.561, + "OpenbookQA": 0.452, + "TruthfulQA": 0.233, + "MS MARCO (TREC)": 0.449, + "CNN/DailyMail": 0.151, + "XSUM": 0.046, + "IMDB": 0.913, + "CivilComments": 0.499, + "RAFT": 0.509 + } + }, + { + "model_id": "openai/text-curie-001", + "name": "text-curie-001", + "developer": "openai", + "scores": { + "Mean win rate": 0.36, + "MMLU": 0.237, + "BoolQ": 0.62, + "NarrativeQA": 0.582, + "NaturalQuestions (open-book)": 0.571, + "QuAC": 0.358, + "HellaSwag": 0.676, + "OpenbookQA": 0.514, + "TruthfulQA": 0.257, + "MS MARCO (TREC)": 0.507, + "CNN/DailyMail": 0.152, + "XSUM": 0.076, + "IMDB": 0.923, + "CivilComments": 0.537, + "RAFT": 0.489 + } + }, + { + "model_id": "openai/text-davinci-002", + "name": "text-davinci-002", + "developer": "openai", + "scores": { + "Mean win rate": 0.905, + "MMLU": 0.568, + "BoolQ": 0.877, + "NarrativeQA": 0.727, + "NaturalQuestions (open-book)": 0.713, + "QuAC": 0.445, + "HellaSwag": 0.815, + "OpenbookQA": 0.594, + "TruthfulQA": 0.61, + "MS MARCO (TREC)": 0.664, + "CNN/DailyMail": 0.153, + "XSUM": 0.144, + "IMDB": 0.948, + "CivilComments": 0.668, + "RAFT": 0.733 + } + }, + { + "model_id": "openai/text-davinci-003", + "name": "text-davinci-003", + "developer": "openai", + "scores": { + "Mean win rate": 0.872, + "MMLU": 0.569, + "BoolQ": 0.881, + "NarrativeQA": 0.727, + "NaturalQuestions (open-book)": 0.77, + "QuAC": 0.525, + "HellaSwag": 0.822, + "OpenbookQA": 0.646, + "TruthfulQA": 0.593, + "MS MARCO (TREC)": 0.644, + "CNN/DailyMail": 0.156, + "XSUM": 0.124, + "IMDB": 0.848, + "CivilComments": 0.684, + "RAFT": 0.759 + } + }, + { + "model_id": "stanford/Alpaca-7B", + "name": "Alpaca 7B", + "developer": "stanford", + "scores": { + "Mean win rate": 0.381, + "MMLU": 0.385, + "BoolQ": 0.778, + "NarrativeQA": 0.396, + "NaturalQuestions (open-book)": 0.592, + "QuAC": 0.27, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.243, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.738, + "CivilComments": 0.566, + "RAFT": 0.486 + } + }, + { + "model_id": "tiiuae/Falcon-40B", + "name": "Falcon 40B", + "developer": "tiiuae", + "scores": { + "Mean win rate": 0.729, + "MMLU": 0.509, + "BoolQ": 0.819, + "NarrativeQA": 0.673, + "NaturalQuestions (open-book)": 0.675, + "QuAC": 0.307, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.353, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.959, + "CivilComments": 0.552, + "RAFT": 0.661 + } + }, + { + "model_id": "tiiuae/Falcon-7B", + "name": "Falcon 7B", + "developer": "tiiuae", + "scores": { + "Mean win rate": 0.378, + "MMLU": 0.286, + "BoolQ": 0.753, + "NarrativeQA": 0.621, + "NaturalQuestions (open-book)": 0.579, + "QuAC": 0.332, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.234, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.836, + "CivilComments": 0.514, + "RAFT": 0.602 + } + }, + { + "model_id": "tiiuae/Falcon-Instruct-40B", + "name": "Falcon-Instruct 40B", + "developer": "tiiuae", + "scores": { + "Mean win rate": 0.727, + "MMLU": 0.497, + "BoolQ": 0.829, + "NarrativeQA": 0.625, + "NaturalQuestions (open-book)": 0.666, + "QuAC": 0.371, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.384, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.959, + "CivilComments": 0.603, + "RAFT": 0.586 + } + }, + { + "model_id": "tiiuae/Falcon-Instruct-7B", + "name": "Falcon-Instruct 7B", + "developer": "tiiuae", + "scores": { + "Mean win rate": 0.244, + "MMLU": 0.275, + "BoolQ": 0.72, + "NarrativeQA": 0.476, + "NaturalQuestions (open-book)": 0.449, + "QuAC": 0.311, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.213, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.852, + "CivilComments": 0.511, + "RAFT": 0.523 + } + }, + { + "model_id": "together/RedPajama-INCITE-Base-7B", + "name": "RedPajama-INCITE-Base 7B", + "developer": "together", + "scores": { + "Mean win rate": 0.378, + "MMLU": 0.302, + "BoolQ": 0.713, + "NarrativeQA": 0.617, + "NaturalQuestions (open-book)": 0.586, + "QuAC": 0.336, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.205, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.752, + "CivilComments": 0.547, + "RAFT": 0.648 + } + }, + { + "model_id": "together/RedPajama-INCITE-Base-v1-3B", + "name": "RedPajama-INCITE-Base-v1 3B", + "developer": "together", + "scores": { + "Mean win rate": 0.311, + "MMLU": 0.263, + "BoolQ": 0.685, + "NarrativeQA": 0.555, + "NaturalQuestions (open-book)": 0.52, + "QuAC": 0.309, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.277, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.907, + "CivilComments": 0.549, + "RAFT": 0.502 + } + }, + { + "model_id": "together/RedPajama-INCITE-Instruct-7B", + "name": "RedPajama-INCITE-Instruct 7B", + "developer": "together", + "scores": { + "Mean win rate": 0.524, + "MMLU": 0.363, + "BoolQ": 0.705, + "NarrativeQA": 0.638, + "NaturalQuestions (open-book)": 0.659, + "QuAC": 0.26, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.243, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.927, + "CivilComments": 0.664, + "RAFT": 0.695 + } + }, + { + "model_id": "together/RedPajama-INCITE-Instruct-v1-3B", + "name": "RedPajama-INCITE-Instruct-v1 3B", + "developer": "together", + "scores": { + "Mean win rate": 0.366, + "MMLU": 0.257, + "BoolQ": 0.677, + "NarrativeQA": 0.638, + "NaturalQuestions (open-book)": 0.637, + "QuAC": 0.259, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.208, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": -1.0, + "XSUM": -1.0, + "IMDB": 0.894, + "CivilComments": 0.549, + "RAFT": 0.661 + } + }, + { + "model_id": "writer/InstructPalmyra-30B", + "name": "InstructPalmyra 30B", + "developer": "writer", + "scores": { + "Mean win rate": 0.568, + "MMLU": 0.403, + "BoolQ": 0.751, + "NarrativeQA": 0.496, + "NaturalQuestions (open-book)": 0.682, + "QuAC": 0.433, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.185, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": 0.152, + "XSUM": 0.104, + "IMDB": 0.94, + "CivilComments": 0.555, + "RAFT": 0.652 + } + }, + { + "model_id": "yandex/YaLM-100B", + "name": "YaLM 100B", + "developer": "yandex", + "scores": { + "Mean win rate": 0.075, + "MMLU": 0.243, + "BoolQ": 0.634, + "NarrativeQA": 0.252, + "NaturalQuestions (open-book)": 0.227, + "QuAC": 0.162, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.202, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": 0.017, + "XSUM": 0.021, + "IMDB": 0.836, + "CivilComments": 0.49, + "RAFT": 0.395 + } + }, + { + "model_id": "zhipu-ai/GLM-130B", + "name": "GLM 130B", + "developer": "zhipu-ai", + "scores": { + "Mean win rate": 0.512, + "MMLU": 0.344, + "BoolQ": 0.784, + "NarrativeQA": 0.706, + "NaturalQuestions (open-book)": 0.642, + "QuAC": 0.272, + "HellaSwag": -1.0, + "OpenbookQA": -1.0, + "TruthfulQA": 0.218, + "MS MARCO (TREC)": -1.0, + "CNN/DailyMail": 0.154, + "XSUM": 0.132, + "IMDB": 0.955, + "CivilComments": 0.5, + "RAFT": 0.598 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/helm_instruct.json b/data/benchmarks/helm_instruct.json new file mode 100644 index 0000000000000000000000000000000000000000..163cbbb755dcda6a889b6d08b49142d004d37912 --- /dev/null +++ b/data/benchmarks/helm_instruct.json @@ -0,0 +1,60 @@ +{ + "models": [ + { + "model_id": "anthropic/claude-v1.3", + "name": "Anthropic Claude v1.3", + "developer": "anthropic", + "scores": { + "Mean win rate": 0.611, + "Anthropic RLHF dataset": 4.965, + "Best ChatGPT Prompts": 4.995, + "Koala test dataset": 4.981, + "Open Assistant": 4.975, + "Self Instruct": 4.992, + "Vicuna": 4.989 + } + }, + { + "model_id": "cohere/command-xlarge-beta", + "name": "Cohere Command beta 52.4B", + "developer": "cohere", + "scores": { + "Mean win rate": 0.089, + "Anthropic RLHF dataset": 4.214, + "Best ChatGPT Prompts": 4.988, + "Koala test dataset": 4.969, + "Open Assistant": 4.967, + "Self Instruct": 4.971, + "Vicuna": 4.995 + } + }, + { + "model_id": "openai/gpt-3.5-turbo-0613", + "name": "gpt-3.5-turbo-0613", + "developer": "openai", + "scores": { + "Mean win rate": 0.689, + "Anthropic RLHF dataset": 4.964, + "Best ChatGPT Prompts": 4.986, + "Koala test dataset": 4.987, + "Open Assistant": 4.987, + "Self Instruct": 4.99, + "Vicuna": 4.992 + } + }, + { + "model_id": "openai/gpt-4-0314", + "name": "GPT-4 0314", + "developer": "openai", + "scores": { + "Mean win rate": 0.611, + "Anthropic RLHF dataset": 4.934, + "Best ChatGPT Prompts": 4.973, + "Koala test dataset": 4.966, + "Open Assistant": 4.986, + "Self Instruct": 4.976, + "Vicuna": 4.995 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/helm_lite.json b/data/benchmarks/helm_lite.json new file mode 100644 index 0000000000000000000000000000000000000000..2b90cf9aeff2e9b3f601331606648af757034e40 --- /dev/null +++ b/data/benchmarks/helm_lite.json @@ -0,0 +1,1551 @@ +{ + "models": [ + { + "model_id": "01-ai/yi-34b", + "name": "Yi 34B", + "developer": "01-ai", + "scores": { + "Mean win rate": 0.57, + "NarrativeQA": 0.782, + "NaturalQuestions (closed-book)": 0.443, + "OpenbookQA": 0.92, + "MMLU": 0.65, + "MATH": 0.375, + "GSM8K": 0.648, + "LegalBench": 0.618, + "MedQA": 0.656, + "WMT 2014": 0.172 + } + }, + { + "model_id": "01-ai/yi-6b", + "name": "Yi 6B", + "developer": "01-ai", + "scores": { + "Mean win rate": 0.253, + "NarrativeQA": 0.702, + "NaturalQuestions (closed-book)": 0.31, + "OpenbookQA": 0.8, + "MMLU": 0.53, + "MATH": 0.126, + "GSM8K": 0.375, + "LegalBench": 0.519, + "MedQA": 0.497, + "WMT 2014": 0.117 + } + }, + { + "model_id": "01-ai/yi-large-preview", + "name": "Yi Large Preview", + "developer": "01-ai", + "scores": { + "Mean win rate": 0.471, + "NarrativeQA": 0.373, + "NaturalQuestions (closed-book)": 0.428, + "OpenbookQA": 0.946, + "MMLU": 0.712, + "MATH": 0.712, + "GSM8K": 0.69, + "LegalBench": 0.519, + "MedQA": 0.66, + "WMT 2014": 0.176 + } + }, + { + "model_id": "AlephAlpha/luminous-base", + "name": "Luminous Base 13B", + "developer": "AlephAlpha", + "scores": { + "Mean win rate": 0.041, + "NarrativeQA": 0.633, + "NaturalQuestions (closed-book)": 0.197, + "OpenbookQA": 0.286, + "MMLU": 0.243, + "MATH": 0.026, + "GSM8K": 0.028, + "LegalBench": 0.332, + "MedQA": 0.26, + "WMT 2014": 0.066 + } + }, + { + "model_id": "AlephAlpha/luminous-extended", + "name": "Luminous Extended 30B", + "developer": "AlephAlpha", + "scores": { + "Mean win rate": 0.078, + "NarrativeQA": 0.684, + "NaturalQuestions (closed-book)": 0.253, + "OpenbookQA": 0.272, + "MMLU": 0.248, + "MATH": 0.04, + "GSM8K": 0.075, + "LegalBench": 0.421, + "MedQA": 0.276, + "WMT 2014": 0.083 + } + }, + { + "model_id": "AlephAlpha/luminous-supreme", + "name": "Luminous Supreme 70B", + "developer": "AlephAlpha", + "scores": { + "Mean win rate": 0.145, + "NarrativeQA": 0.743, + "NaturalQuestions (closed-book)": 0.299, + "OpenbookQA": 0.284, + "MMLU": 0.316, + "MATH": 0.078, + "GSM8K": 0.137, + "LegalBench": 0.452, + "MedQA": 0.276, + "WMT 2014": 0.102 + } + }, + { + "model_id": "ai21/j2-grande", + "name": "Jurassic-2 Grande 17B", + "developer": "ai21", + "scores": { + "Mean win rate": 0.172, + "NarrativeQA": 0.744, + "NaturalQuestions (closed-book)": 0.35, + "OpenbookQA": 0.614, + "MMLU": 0.471, + "MATH": 0.064, + "GSM8K": 0.159, + "LegalBench": 0.468, + "MedQA": 0.39, + "WMT 2014": 0.102 + } + }, + { + "model_id": "ai21/j2-jumbo", + "name": "Jurassic-2 Jumbo 178B", + "developer": "ai21", + "scores": { + "Mean win rate": 0.215, + "NarrativeQA": 0.728, + "NaturalQuestions (closed-book)": 0.385, + "OpenbookQA": 0.688, + "MMLU": 0.483, + "MATH": 0.103, + "GSM8K": 0.239, + "LegalBench": 0.533, + "MedQA": 0.431, + "WMT 2014": 0.114 + } + }, + { + "model_id": "ai21/jamba-1.5-large", + "name": "Jamba 1.5 Large", + "developer": "ai21", + "scores": { + "Mean win rate": 0.637, + "NarrativeQA": 0.664, + "NaturalQuestions (closed-book)": 0.394, + "OpenbookQA": 0.948, + "MMLU": 0.683, + "MATH": 0.692, + "GSM8K": 0.846, + "LegalBench": 0.675, + "MedQA": 0.698, + "WMT 2014": 0.203 + } + }, + { + "model_id": "ai21/jamba-1.5-mini", + "name": "Jamba 1.5 Mini", + "developer": "ai21", + "scores": { + "Mean win rate": 0.414, + "NarrativeQA": 0.746, + "NaturalQuestions (closed-book)": 0.388, + "OpenbookQA": 0.89, + "MMLU": 0.582, + "MATH": 0.318, + "GSM8K": 0.691, + "LegalBench": 0.503, + "MedQA": 0.632, + "WMT 2014": 0.179 + } + }, + { + "model_id": "ai21/jamba-instruct", + "name": "Jamba Instruct", + "developer": "ai21", + "scores": { + "Mean win rate": 0.287, + "NarrativeQA": 0.658, + "NaturalQuestions (closed-book)": 0.384, + "OpenbookQA": 0.796, + "MMLU": 0.582, + "MATH": 0.38, + "GSM8K": 0.67, + "LegalBench": 0.54, + "MedQA": 0.519, + "WMT 2014": 0.164 + } + }, + { + "model_id": "allenai/olmo-7b", + "name": "OLMo 7B", + "developer": "allenai", + "scores": { + "Mean win rate": 0.052, + "NarrativeQA": 0.597, + "NaturalQuestions (closed-book)": 0.259, + "OpenbookQA": 0.222, + "MMLU": 0.305, + "MATH": 0.029, + "GSM8K": 0.044, + "LegalBench": 0.341, + "MedQA": 0.229, + "WMT 2014": 0.097 + } + }, + { + "model_id": "amazon/nova-lite-v1:0", + "name": "Amazon Nova Lite", + "developer": "amazon", + "scores": { + "Mean win rate": 0.708, + "NarrativeQA": 0.768, + "NaturalQuestions (closed-book)": 0.352, + "OpenbookQA": 0.928, + "MMLU": 0.693, + "MATH": 0.779, + "GSM8K": 0.829, + "LegalBench": 0.659, + "MedQA": 0.696, + "WMT 2014": 0.204 + } + }, + { + "model_id": "amazon/nova-micro-v1:0", + "name": "Amazon Nova Micro", + "developer": "amazon", + "scores": { + "Mean win rate": 0.524, + "NarrativeQA": 0.744, + "NaturalQuestions (closed-book)": 0.285, + "OpenbookQA": 0.888, + "MMLU": 0.64, + "MATH": 0.76, + "GSM8K": 0.794, + "LegalBench": 0.615, + "MedQA": 0.608, + "WMT 2014": 0.192 + } + }, + { + "model_id": "amazon/nova-pro-v1:0", + "name": "Amazon Nova Pro", + "developer": "amazon", + "scores": { + "Mean win rate": 0.885, + "NarrativeQA": 0.791, + "NaturalQuestions (closed-book)": 0.405, + "OpenbookQA": 0.96, + "MMLU": 0.758, + "MATH": 0.821, + "GSM8K": 0.87, + "LegalBench": 0.736, + "MedQA": 0.811, + "WMT 2014": 0.229 + } + }, + { + "model_id": "anthropic/claude-2.0", + "name": "Claude 2.0", + "developer": "anthropic", + "scores": { + "Mean win rate": 0.489, + "NarrativeQA": 0.718, + "NaturalQuestions (closed-book)": 0.428, + "OpenbookQA": 0.862, + "MMLU": 0.639, + "MATH": 0.603, + "GSM8K": 0.583, + "LegalBench": 0.643, + "MedQA": 0.652, + "WMT 2014": 0.219 + } + }, + { + "model_id": "anthropic/claude-2.1", + "name": "Claude 2.1", + "developer": "anthropic", + "scores": { + "Mean win rate": 0.437, + "NarrativeQA": 0.677, + "NaturalQuestions (closed-book)": 0.375, + "OpenbookQA": 0.872, + "MMLU": 0.643, + "MATH": 0.632, + "GSM8K": 0.604, + "LegalBench": 0.643, + "MedQA": 0.644, + "WMT 2014": 0.204 + } + }, + { + "model_id": "anthropic/claude-3-5-haiku-20241022", + "name": "claude-3-5-haiku-20241022", + "developer": "anthropic", + "scores": { + "Mean win rate": 0.531, + "NarrativeQA": 0.763, + "NaturalQuestions (closed-book)": 0.344, + "OpenbookQA": 0.854, + "MMLU": 0.671, + "MATH": 0.872, + "GSM8K": 0.815, + "LegalBench": 0.631, + "MedQA": 0.722, + "WMT 2014": 0.135 + } + }, + { + "model_id": "anthropic/claude-3-5-sonnet-20240620", + "name": "Claude 3.5 Sonnet 20240620", + "developer": "anthropic", + "scores": { + "Mean win rate": 0.885, + "NarrativeQA": 0.746, + "NaturalQuestions (closed-book)": 0.502, + "OpenbookQA": 0.972, + "MMLU": 0.799, + "MATH": 0.813, + "GSM8K": 0.949, + "LegalBench": 0.707, + "MedQA": 0.825, + "WMT 2014": 0.229 + } + }, + { + "model_id": "anthropic/claude-3-5-sonnet-20241022", + "name": "Claude 3.5 Sonnet 20241022", + "developer": "anthropic", + "scores": { + "Mean win rate": 0.846, + "NarrativeQA": 0.77, + "NaturalQuestions (closed-book)": 0.467, + "OpenbookQA": 0.966, + "MMLU": 0.809, + "MATH": 0.904, + "GSM8K": 0.956, + "LegalBench": 0.647, + "MedQA": 0.859, + "WMT 2014": 0.226 + } + }, + { + "model_id": "anthropic/claude-3-haiku-20240307", + "name": "Claude 3 Haiku 20240307", + "developer": "anthropic", + "scores": { + "Mean win rate": 0.263, + "NarrativeQA": 0.244, + "NaturalQuestions (closed-book)": 0.144, + "OpenbookQA": 0.838, + "MMLU": 0.662, + "MATH": 0.131, + "GSM8K": 0.699, + "LegalBench": 0.46, + "MedQA": 0.702, + "WMT 2014": 0.148 + } + }, + { + "model_id": "anthropic/claude-3-opus-20240229", + "name": "Claude 3 Opus 20240229", + "developer": "anthropic", + "scores": { + "Mean win rate": 0.683, + "NarrativeQA": 0.351, + "NaturalQuestions (closed-book)": 0.441, + "OpenbookQA": 0.956, + "MMLU": 0.768, + "MATH": 0.76, + "GSM8K": 0.924, + "LegalBench": 0.662, + "MedQA": 0.775, + "WMT 2014": 0.24 + } + }, + { + "model_id": "anthropic/claude-3-sonnet-20240229", + "name": "Claude 3 Sonnet 20240229", + "developer": "anthropic", + "scores": { + "Mean win rate": 0.377, + "NarrativeQA": 0.111, + "NaturalQuestions (closed-book)": 0.028, + "OpenbookQA": 0.918, + "MMLU": 0.652, + "MATH": 0.084, + "GSM8K": 0.907, + "LegalBench": 0.49, + "MedQA": 0.684, + "WMT 2014": 0.218 + } + }, + { + "model_id": "anthropic/claude-instant-1.2", + "name": "Claude Instant 1.2", + "developer": "anthropic", + "scores": { + "Mean win rate": 0.399, + "NarrativeQA": 0.616, + "NaturalQuestions (closed-book)": 0.343, + "OpenbookQA": 0.844, + "MMLU": 0.631, + "MATH": 0.499, + "GSM8K": 0.721, + "LegalBench": 0.586, + "MedQA": 0.559, + "WMT 2014": 0.194 + } + }, + { + "model_id": "anthropic/claude-v1.3", + "name": "Anthropic Claude v1.3", + "developer": "anthropic", + "scores": { + "Mean win rate": 0.518, + "NarrativeQA": 0.723, + "NaturalQuestions (closed-book)": 0.409, + "OpenbookQA": 0.908, + "MMLU": 0.631, + "MATH": 0.54, + "GSM8K": 0.784, + "LegalBench": 0.629, + "MedQA": 0.618, + "WMT 2014": 0.219 + } + }, + { + "model_id": "cohere/command", + "name": "Command", + "developer": "cohere", + "scores": { + "Mean win rate": 0.327, + "NarrativeQA": 0.749, + "NaturalQuestions (closed-book)": 0.391, + "OpenbookQA": 0.774, + "MMLU": 0.525, + "MATH": 0.236, + "GSM8K": 0.452, + "LegalBench": 0.578, + "MedQA": 0.445, + "WMT 2014": 0.088 + } + }, + { + "model_id": "cohere/command-light", + "name": "Command Light", + "developer": "cohere", + "scores": { + "Mean win rate": 0.105, + "NarrativeQA": 0.629, + "NaturalQuestions (closed-book)": 0.195, + "OpenbookQA": 0.398, + "MMLU": 0.386, + "MATH": 0.098, + "GSM8K": 0.149, + "LegalBench": 0.397, + "MedQA": 0.312, + "WMT 2014": 0.023 + } + }, + { + "model_id": "cohere/command-r", + "name": "Command R", + "developer": "cohere", + "scores": { + "Mean win rate": 0.299, + "NarrativeQA": 0.742, + "NaturalQuestions (closed-book)": 0.352, + "OpenbookQA": 0.782, + "MMLU": 0.567, + "MATH": 0.266, + "GSM8K": 0.551, + "LegalBench": 0.507, + "MedQA": 0.555, + "WMT 2014": 0.149 + } + }, + { + "model_id": "cohere/command-r-plus", + "name": "Command R Plus", + "developer": "cohere", + "scores": { + "Mean win rate": 0.441, + "NarrativeQA": 0.735, + "NaturalQuestions (closed-book)": 0.343, + "OpenbookQA": 0.828, + "MMLU": 0.59, + "MATH": 0.403, + "GSM8K": 0.738, + "LegalBench": 0.672, + "MedQA": 0.567, + "WMT 2014": 0.203 + } + }, + { + "model_id": "databricks/dbrx-instruct", + "name": "DBRX Instruct", + "developer": "databricks", + "scores": { + "Mean win rate": 0.289, + "NarrativeQA": 0.488, + "NaturalQuestions (closed-book)": 0.284, + "OpenbookQA": 0.91, + "MMLU": 0.643, + "MATH": 0.358, + "GSM8K": 0.671, + "LegalBench": 0.426, + "MedQA": 0.694, + "WMT 2014": 0.131 + } + }, + { + "model_id": "deepseek-ai/deepseek-llm-67b-chat", + "name": "DeepSeek LLM Chat 67B", + "developer": "deepseek-ai", + "scores": { + "Mean win rate": 0.488, + "NarrativeQA": 0.581, + "NaturalQuestions (closed-book)": 0.412, + "OpenbookQA": 0.88, + "MMLU": 0.641, + "MATH": 0.615, + "GSM8K": 0.795, + "LegalBench": 0.637, + "MedQA": 0.628, + "WMT 2014": 0.186 + } + }, + { + "model_id": "deepseek-ai/deepseek-v3", + "name": "DeepSeek v3", + "developer": "deepseek-ai", + "scores": { + "Mean win rate": 0.908, + "NarrativeQA": 0.796, + "NaturalQuestions (closed-book)": 0.467, + "OpenbookQA": 0.954, + "MMLU": 0.803, + "MATH": 0.912, + "GSM8K": 0.94, + "LegalBench": 0.718, + "MedQA": 0.809, + "WMT 2014": 0.209 + } + }, + { + "model_id": "google/gemini-1.0-pro-002", + "name": "Gemini 1.0 Pro 002", + "developer": "google", + "scores": { + "Mean win rate": 0.422, + "NarrativeQA": 0.751, + "NaturalQuestions (closed-book)": 0.391, + "OpenbookQA": 0.788, + "MMLU": 0.534, + "MATH": 0.665, + "GSM8K": 0.816, + "LegalBench": 0.475, + "MedQA": 0.483, + "WMT 2014": 0.194 + } + }, + { + "model_id": "google/gemini-1.5-flash-001", + "name": "Gemini 1.5 Flash 001", + "developer": "google", + "scores": { + "Mean win rate": 0.667, + "NarrativeQA": 0.783, + "NaturalQuestions (closed-book)": 0.332, + "OpenbookQA": 0.928, + "MMLU": 0.703, + "MATH": 0.753, + "GSM8K": 0.785, + "LegalBench": 0.661, + "MedQA": 0.68, + "WMT 2014": 0.225 + } + }, + { + "model_id": "google/gemini-1.5-flash-002", + "name": "Gemini 1.5 Flash 002", + "developer": "google", + "scores": { + "Mean win rate": 0.573, + "NarrativeQA": 0.746, + "NaturalQuestions (closed-book)": 0.323, + "OpenbookQA": 0.914, + "MMLU": 0.679, + "MATH": 0.908, + "GSM8K": 0.328, + "LegalBench": 0.67, + "MedQA": 0.656, + "WMT 2014": 0.212 + } + }, + { + "model_id": "google/gemini-1.5-pro-001", + "name": "Gemini 1.5 Pro 001", + "developer": "google", + "scores": { + "Mean win rate": 0.739, + "NarrativeQA": 0.783, + "NaturalQuestions (closed-book)": 0.378, + "OpenbookQA": 0.902, + "MMLU": 0.772, + "MATH": 0.825, + "GSM8K": 0.836, + "LegalBench": 0.757, + "MedQA": 0.692, + "WMT 2014": 0.189 + } + }, + { + "model_id": "google/gemini-1.5-pro-002", + "name": "Gemini 1.5 Pro 002", + "developer": "google", + "scores": { + "Mean win rate": 0.842, + "NarrativeQA": 0.756, + "NaturalQuestions (closed-book)": 0.455, + "OpenbookQA": 0.952, + "MMLU": 0.795, + "MATH": 0.92, + "GSM8K": 0.817, + "LegalBench": 0.747, + "MedQA": 0.771, + "WMT 2014": 0.231 + } + }, + { + "model_id": "google/gemini-2.0-flash-exp", + "name": "Gemini 2.0 Flash Experimental", + "developer": "google", + "scores": { + "Mean win rate": 0.813, + "NarrativeQA": 0.783, + "NaturalQuestions (closed-book)": 0.443, + "OpenbookQA": 0.946, + "MMLU": 0.717, + "MATH": 0.901, + "GSM8K": 0.946, + "LegalBench": 0.674, + "MedQA": 0.73, + "WMT 2014": 0.212 + } + }, + { + "model_id": "google/gemma-2-27b-it", + "name": "Gemma 2 Instruct 27B", + "developer": "google", + "scores": { + "Mean win rate": 0.675, + "NarrativeQA": 0.79, + "NaturalQuestions (closed-book)": 0.353, + "OpenbookQA": 0.918, + "MMLU": 0.664, + "MATH": 0.746, + "GSM8K": 0.812, + "LegalBench": 0.7, + "MedQA": 0.684, + "WMT 2014": 0.214 + } + }, + { + "model_id": "google/gemma-2-9b-it", + "name": "Gemma 2 Instruct 9B", + "developer": "google", + "scores": { + "Mean win rate": 0.562, + "NarrativeQA": 0.768, + "NaturalQuestions (closed-book)": 0.328, + "OpenbookQA": 0.91, + "MMLU": 0.645, + "MATH": 0.724, + "GSM8K": 0.762, + "LegalBench": 0.639, + "MedQA": 0.63, + "WMT 2014": 0.201 + } + }, + { + "model_id": "google/gemma-7b", + "name": "Gemma 7B", + "developer": "google", + "scores": { + "Mean win rate": 0.336, + "NarrativeQA": 0.752, + "NaturalQuestions (closed-book)": 0.336, + "OpenbookQA": 0.808, + "MMLU": 0.571, + "MATH": 0.5, + "GSM8K": 0.559, + "LegalBench": 0.581, + "MedQA": 0.513, + "WMT 2014": 0.187 + } + }, + { + "model_id": "google/text-bison@001", + "name": "PaLM-2 Bison", + "developer": "google", + "scores": { + "Mean win rate": 0.526, + "NarrativeQA": 0.718, + "NaturalQuestions (closed-book)": 0.39, + "OpenbookQA": 0.878, + "MMLU": 0.608, + "MATH": 0.421, + "GSM8K": 0.61, + "LegalBench": 0.645, + "MedQA": 0.547, + "WMT 2014": 0.241 + } + }, + { + "model_id": "google/text-unicorn@001", + "name": "PaLM-2 Unicorn", + "developer": "google", + "scores": { + "Mean win rate": 0.644, + "NarrativeQA": 0.583, + "NaturalQuestions (closed-book)": 0.435, + "OpenbookQA": 0.938, + "MMLU": 0.702, + "MATH": 0.674, + "GSM8K": 0.831, + "LegalBench": 0.677, + "MedQA": 0.684, + "WMT 2014": 0.26 + } + }, + { + "model_id": "meta/llama-2-13b", + "name": "Llama 2 13B", + "developer": "meta", + "scores": { + "Mean win rate": 0.233, + "NarrativeQA": 0.741, + "NaturalQuestions (closed-book)": 0.371, + "OpenbookQA": 0.634, + "MMLU": 0.505, + "MATH": 0.102, + "GSM8K": 0.266, + "LegalBench": 0.591, + "MedQA": 0.392, + "WMT 2014": 0.167 + } + }, + { + "model_id": "meta/llama-2-70b", + "name": "Llama 2 70B", + "developer": "meta", + "scores": { + "Mean win rate": 0.482, + "NarrativeQA": 0.763, + "NaturalQuestions (closed-book)": 0.46, + "OpenbookQA": 0.838, + "MMLU": 0.58, + "MATH": 0.323, + "GSM8K": 0.567, + "LegalBench": 0.673, + "MedQA": 0.618, + "WMT 2014": 0.196 + } + }, + { + "model_id": "meta/llama-2-7b", + "name": "Llama 2 7B", + "developer": "meta", + "scores": { + "Mean win rate": 0.152, + "NarrativeQA": 0.686, + "NaturalQuestions (closed-book)": 0.333, + "OpenbookQA": 0.544, + "MMLU": 0.425, + "MATH": 0.097, + "GSM8K": 0.154, + "LegalBench": 0.502, + "MedQA": 0.392, + "WMT 2014": 0.144 + } + }, + { + "model_id": "meta/llama-3-70b", + "name": "Llama 3 70B", + "developer": "meta", + "scores": { + "Mean win rate": 0.793, + "NarrativeQA": 0.798, + "NaturalQuestions (closed-book)": 0.475, + "OpenbookQA": 0.934, + "MMLU": 0.695, + "MATH": 0.663, + "GSM8K": 0.805, + "LegalBench": 0.733, + "MedQA": 0.777, + "WMT 2014": 0.225 + } + }, + { + "model_id": "meta/llama-3-8b", + "name": "Llama 3 8B", + "developer": "meta", + "scores": { + "Mean win rate": 0.387, + "NarrativeQA": 0.754, + "NaturalQuestions (closed-book)": 0.378, + "OpenbookQA": 0.766, + "MMLU": 0.602, + "MATH": 0.391, + "GSM8K": 0.499, + "LegalBench": 0.637, + "MedQA": 0.581, + "WMT 2014": 0.183 + } + }, + { + "model_id": "meta/llama-3.1-405b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 405B", + "developer": "meta", + "scores": { + "Mean win rate": 0.854, + "NarrativeQA": 0.749, + "NaturalQuestions (closed-book)": 0.456, + "OpenbookQA": 0.94, + "MMLU": 0.759, + "MATH": 0.827, + "GSM8K": 0.949, + "LegalBench": 0.707, + "MedQA": 0.805, + "WMT 2014": 0.238 + } + }, + { + "model_id": "meta/llama-3.1-70b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 70B", + "developer": "meta", + "scores": { + "Mean win rate": 0.808, + "NarrativeQA": 0.772, + "NaturalQuestions (closed-book)": 0.452, + "OpenbookQA": 0.938, + "MMLU": 0.709, + "MATH": 0.783, + "GSM8K": 0.938, + "LegalBench": 0.687, + "MedQA": 0.769, + "WMT 2014": 0.223 + } + }, + { + "model_id": "meta/llama-3.1-8b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 8B", + "developer": "meta", + "scores": { + "Mean win rate": 0.303, + "NarrativeQA": 0.756, + "NaturalQuestions (closed-book)": 0.209, + "OpenbookQA": 0.74, + "MMLU": 0.5, + "MATH": 0.703, + "GSM8K": 0.798, + "LegalBench": 0.342, + "MedQA": 0.245, + "WMT 2014": 0.181 + } + }, + { + "model_id": "meta/llama-3.2-11b-vision-instruct-turbo", + "name": "Llama 3.2 Vision Instruct Turbo 11B", + "developer": "meta", + "scores": { + "Mean win rate": 0.325, + "NarrativeQA": 0.756, + "NaturalQuestions (closed-book)": 0.234, + "OpenbookQA": 0.724, + "MMLU": 0.511, + "MATH": 0.739, + "GSM8K": 0.823, + "LegalBench": 0.435, + "MedQA": 0.27, + "WMT 2014": 0.179 + } + }, + { + "model_id": "meta/llama-3.2-90b-vision-instruct-turbo", + "name": "Llama 3.2 Vision Instruct Turbo 90B", + "developer": "meta", + "scores": { + "Mean win rate": 0.819, + "NarrativeQA": 0.777, + "NaturalQuestions (closed-book)": 0.457, + "OpenbookQA": 0.942, + "MMLU": 0.703, + "MATH": 0.791, + "GSM8K": 0.936, + "LegalBench": 0.68, + "MedQA": 0.769, + "WMT 2014": 0.224 + } + }, + { + "model_id": "meta/llama-3.3-70b-instruct-turbo", + "name": "Llama 3.3 Instruct Turbo 70B", + "developer": "meta", + "scores": { + "Mean win rate": 0.812, + "NarrativeQA": 0.791, + "NaturalQuestions (closed-book)": 0.431, + "OpenbookQA": 0.928, + "MMLU": 0.7, + "MATH": 0.808, + "GSM8K": 0.942, + "LegalBench": 0.725, + "MedQA": 0.761, + "WMT 2014": 0.219 + } + }, + { + "model_id": "meta/llama-65b", + "name": "LLaMA 65B", + "developer": "meta", + "scores": { + "Mean win rate": 0.345, + "NarrativeQA": 0.755, + "NaturalQuestions (closed-book)": 0.433, + "OpenbookQA": 0.754, + "MMLU": 0.584, + "MATH": 0.257, + "GSM8K": 0.489, + "LegalBench": 0.48, + "MedQA": 0.507, + "WMT 2014": 0.189 + } + }, + { + "model_id": "microsoft/phi-2", + "name": "Phi-2", + "developer": "microsoft", + "scores": { + "Mean win rate": 0.169, + "NarrativeQA": 0.703, + "NaturalQuestions (closed-book)": 0.155, + "OpenbookQA": 0.798, + "MMLU": 0.518, + "MATH": 0.255, + "GSM8K": 0.581, + "LegalBench": 0.334, + "MedQA": 0.41, + "WMT 2014": 0.038 + } + }, + { + "model_id": "microsoft/phi-3-medium-4k-instruct", + "name": "Phi-3 14B", + "developer": "microsoft", + "scores": { + "Mean win rate": 0.509, + "NarrativeQA": 0.724, + "NaturalQuestions (closed-book)": 0.278, + "OpenbookQA": 0.916, + "MMLU": 0.675, + "MATH": 0.611, + "GSM8K": 0.878, + "LegalBench": 0.593, + "MedQA": 0.696, + "WMT 2014": 0.17 + } + }, + { + "model_id": "microsoft/phi-3-small-8k-instruct", + "name": "Phi-3 7B", + "developer": "microsoft", + "scores": { + "Mean win rate": 0.473, + "NarrativeQA": 0.754, + "NaturalQuestions (closed-book)": 0.324, + "OpenbookQA": 0.912, + "MMLU": 0.659, + "MATH": 0.703, + "GSM8K": -1.0, + "LegalBench": 0.584, + "MedQA": 0.672, + "WMT 2014": 0.154 + } + }, + { + "model_id": "mistralai/mistral-7b-instruct-v0.3", + "name": "Mistral Instruct v0.3 7B", + "developer": "mistralai", + "scores": { + "Mean win rate": 0.196, + "NarrativeQA": 0.716, + "NaturalQuestions (closed-book)": 0.253, + "OpenbookQA": 0.79, + "MMLU": 0.51, + "MATH": 0.289, + "GSM8K": 0.538, + "LegalBench": 0.331, + "MedQA": 0.517, + "WMT 2014": 0.142 + } + }, + { + "model_id": "mistralai/mistral-7b-v0.1", + "name": "Mistral v0.1 7B", + "developer": "mistralai", + "scores": { + "Mean win rate": 0.292, + "NarrativeQA": 0.716, + "NaturalQuestions (closed-book)": 0.367, + "OpenbookQA": 0.776, + "MMLU": 0.584, + "MATH": 0.297, + "GSM8K": 0.377, + "LegalBench": 0.58, + "MedQA": 0.525, + "WMT 2014": 0.16 + } + }, + { + "model_id": "mistralai/mistral-large-2402", + "name": "Mistral Large 2402", + "developer": "mistralai", + "scores": { + "Mean win rate": 0.328, + "NarrativeQA": 0.454, + "NaturalQuestions (closed-book)": 0.311, + "OpenbookQA": 0.894, + "MMLU": 0.638, + "MATH": 0.75, + "GSM8K": 0.694, + "LegalBench": 0.479, + "MedQA": 0.499, + "WMT 2014": 0.182 + } + }, + { + "model_id": "mistralai/mistral-large-2407", + "name": "Mistral Large 2 2407", + "developer": "mistralai", + "scores": { + "Mean win rate": 0.744, + "NarrativeQA": 0.779, + "NaturalQuestions (closed-book)": 0.453, + "OpenbookQA": 0.932, + "MMLU": 0.725, + "MATH": 0.677, + "GSM8K": 0.912, + "LegalBench": 0.646, + "MedQA": 0.775, + "WMT 2014": 0.192 + } + }, + { + "model_id": "mistralai/mistral-medium-2312", + "name": "Mistral Medium 2312", + "developer": "mistralai", + "scores": { + "Mean win rate": 0.268, + "NarrativeQA": 0.449, + "NaturalQuestions (closed-book)": 0.29, + "OpenbookQA": 0.83, + "MMLU": 0.618, + "MATH": 0.565, + "GSM8K": 0.706, + "LegalBench": 0.452, + "MedQA": 0.61, + "WMT 2014": 0.169 + } + }, + { + "model_id": "mistralai/mistral-small-2402", + "name": "Mistral Small 2402", + "developer": "mistralai", + "scores": { + "Mean win rate": 0.288, + "NarrativeQA": 0.519, + "NaturalQuestions (closed-book)": 0.304, + "OpenbookQA": 0.862, + "MMLU": 0.593, + "MATH": 0.621, + "GSM8K": 0.734, + "LegalBench": 0.389, + "MedQA": 0.616, + "WMT 2014": 0.169 + } + }, + { + "model_id": "mistralai/mixtral-8x22b", + "name": "Mixtral 8x22B", + "developer": "mistralai", + "scores": { + "Mean win rate": 0.705, + "NarrativeQA": 0.779, + "NaturalQuestions (closed-book)": 0.478, + "OpenbookQA": 0.882, + "MMLU": 0.701, + "MATH": 0.656, + "GSM8K": 0.8, + "LegalBench": 0.708, + "MedQA": 0.704, + "WMT 2014": 0.209 + } + }, + { + "model_id": "mistralai/mixtral-8x7b-32kseqlen", + "name": "Mixtral 8x7B 32K seqlen", + "developer": "mistralai", + "scores": { + "Mean win rate": 0.51, + "NarrativeQA": 0.767, + "NaturalQuestions (closed-book)": 0.427, + "OpenbookQA": 0.868, + "MMLU": 0.649, + "MATH": 0.494, + "GSM8K": 0.622, + "LegalBench": 0.63, + "MedQA": 0.652, + "WMT 2014": 0.19 + } + }, + { + "model_id": "mistralai/open-mistral-nemo-2407", + "name": "Mistral NeMo 2402", + "developer": "mistralai", + "scores": { + "Mean win rate": 0.333, + "NarrativeQA": 0.731, + "NaturalQuestions (closed-book)": 0.265, + "OpenbookQA": 0.822, + "MMLU": 0.604, + "MATH": 0.668, + "GSM8K": 0.782, + "LegalBench": 0.415, + "MedQA": 0.59, + "WMT 2014": 0.177 + } + }, + { + "model_id": "openai/gpt-3.5-turbo-0613", + "name": "gpt-3.5-turbo-0613", + "developer": "openai", + "scores": { + "Mean win rate": 0.358, + "NarrativeQA": 0.655, + "NaturalQuestions (closed-book)": 0.335, + "OpenbookQA": 0.838, + "MMLU": 0.614, + "MATH": 0.667, + "GSM8K": 0.501, + "LegalBench": 0.528, + "MedQA": 0.622, + "WMT 2014": 0.187 + } + }, + { + "model_id": "openai/gpt-4-0613", + "name": "GPT-4 0613", + "developer": "openai", + "scores": { + "Mean win rate": 0.867, + "NarrativeQA": 0.768, + "NaturalQuestions (closed-book)": 0.457, + "OpenbookQA": 0.96, + "MMLU": 0.735, + "MATH": 0.802, + "GSM8K": 0.932, + "LegalBench": 0.713, + "MedQA": 0.815, + "WMT 2014": 0.211 + } + }, + { + "model_id": "openai/gpt-4-1106-preview", + "name": "GPT-4 Turbo 1106 preview", + "developer": "openai", + "scores": { + "Mean win rate": 0.698, + "NarrativeQA": 0.727, + "NaturalQuestions (closed-book)": 0.435, + "OpenbookQA": 0.95, + "MMLU": 0.699, + "MATH": 0.857, + "GSM8K": 0.668, + "LegalBench": 0.626, + "MedQA": 0.817, + "WMT 2014": 0.205 + } + }, + { + "model_id": "openai/gpt-4-turbo-2024-04-09", + "name": "GPT-4 Turbo 2024-04-09", + "developer": "openai", + "scores": { + "Mean win rate": 0.864, + "NarrativeQA": 0.761, + "NaturalQuestions (closed-book)": 0.482, + "OpenbookQA": 0.97, + "MMLU": 0.711, + "MATH": 0.833, + "GSM8K": 0.824, + "LegalBench": 0.727, + "MedQA": 0.783, + "WMT 2014": 0.218 + } + }, + { + "model_id": "openai/gpt-4o-2024-05-13", + "name": "GPT-4o 2024-05-13", + "developer": "openai", + "scores": { + "Mean win rate": 0.938, + "NarrativeQA": 0.804, + "NaturalQuestions (closed-book)": 0.501, + "OpenbookQA": 0.966, + "MMLU": 0.748, + "MATH": 0.829, + "GSM8K": 0.905, + "LegalBench": 0.733, + "MedQA": 0.857, + "WMT 2014": 0.231 + } + }, + { + "model_id": "openai/gpt-4o-2024-08-06", + "name": "GPT-4o 2024-08-06", + "developer": "openai", + "scores": { + "Mean win rate": 0.928, + "NarrativeQA": 0.795, + "NaturalQuestions (closed-book)": 0.496, + "OpenbookQA": 0.968, + "MMLU": 0.738, + "MATH": 0.853, + "GSM8K": 0.909, + "LegalBench": 0.721, + "MedQA": 0.863, + "WMT 2014": 0.225 + } + }, + { + "model_id": "openai/gpt-4o-mini-2024-07-18", + "name": "GPT-4o mini 2024-07-18", + "developer": "openai", + "scores": { + "Mean win rate": 0.701, + "NarrativeQA": 0.768, + "NaturalQuestions (closed-book)": 0.386, + "OpenbookQA": 0.92, + "MMLU": 0.668, + "MATH": 0.802, + "GSM8K": 0.843, + "LegalBench": 0.653, + "MedQA": 0.748, + "WMT 2014": 0.206 + } + }, + { + "model_id": "openai/text-davinci-002", + "name": "text-davinci-002", + "developer": "openai", + "scores": { + "Mean win rate": 0.336, + "NarrativeQA": 0.719, + "NaturalQuestions (closed-book)": 0.394, + "OpenbookQA": 0.796, + "MMLU": 0.568, + "MATH": 0.428, + "GSM8K": 0.479, + "LegalBench": 0.58, + "MedQA": 0.525, + "WMT 2014": 0.174 + } + }, + { + "model_id": "openai/text-davinci-003", + "name": "text-davinci-003", + "developer": "openai", + "scores": { + "Mean win rate": 0.439, + "NarrativeQA": 0.731, + "NaturalQuestions (closed-book)": 0.413, + "OpenbookQA": 0.828, + "MMLU": 0.555, + "MATH": 0.449, + "GSM8K": 0.615, + "LegalBench": 0.622, + "MedQA": 0.531, + "WMT 2014": 0.191 + } + }, + { + "model_id": "qwen/qwen1.5-110b-chat", + "name": "Qwen1.5 Chat 110B", + "developer": "qwen", + "scores": { + "Mean win rate": 0.55, + "NarrativeQA": 0.721, + "NaturalQuestions (closed-book)": 0.35, + "OpenbookQA": 0.922, + "MMLU": 0.704, + "MATH": 0.568, + "GSM8K": 0.815, + "LegalBench": 0.624, + "MedQA": 0.64, + "WMT 2014": 0.192 + } + }, + { + "model_id": "qwen/qwen1.5-14b", + "name": "Qwen1.5 14B", + "developer": "qwen", + "scores": { + "Mean win rate": 0.425, + "NarrativeQA": 0.711, + "NaturalQuestions (closed-book)": 0.3, + "OpenbookQA": 0.862, + "MMLU": 0.626, + "MATH": 0.686, + "GSM8K": 0.693, + "LegalBench": 0.593, + "MedQA": 0.515, + "WMT 2014": 0.178 + } + }, + { + "model_id": "qwen/qwen1.5-32b", + "name": "Qwen1.5 32B", + "developer": "qwen", + "scores": { + "Mean win rate": 0.546, + "NarrativeQA": 0.589, + "NaturalQuestions (closed-book)": 0.353, + "OpenbookQA": 0.932, + "MMLU": 0.628, + "MATH": 0.733, + "GSM8K": 0.773, + "LegalBench": 0.636, + "MedQA": 0.656, + "WMT 2014": 0.193 + } + }, + { + "model_id": "qwen/qwen1.5-72b", + "name": "Qwen1.5 72B", + "developer": "qwen", + "scores": { + "Mean win rate": 0.608, + "NarrativeQA": 0.601, + "NaturalQuestions (closed-book)": 0.417, + "OpenbookQA": 0.93, + "MMLU": 0.647, + "MATH": 0.683, + "GSM8K": 0.799, + "LegalBench": 0.694, + "MedQA": 0.67, + "WMT 2014": 0.201 + } + }, + { + "model_id": "qwen/qwen1.5-7b", + "name": "Qwen1.5 7B", + "developer": "qwen", + "scores": { + "Mean win rate": 0.275, + "NarrativeQA": 0.448, + "NaturalQuestions (closed-book)": 0.27, + "OpenbookQA": 0.806, + "MMLU": 0.569, + "MATH": 0.561, + "GSM8K": 0.6, + "LegalBench": 0.523, + "MedQA": 0.479, + "WMT 2014": 0.153 + } + }, + { + "model_id": "qwen/qwen2-72b-instruct", + "name": "Qwen2 Instruct 72B", + "developer": "qwen", + "scores": { + "Mean win rate": 0.77, + "NarrativeQA": 0.727, + "NaturalQuestions (closed-book)": 0.39, + "OpenbookQA": 0.954, + "MMLU": 0.769, + "MATH": 0.79, + "GSM8K": 0.92, + "LegalBench": 0.712, + "MedQA": 0.746, + "WMT 2014": 0.207 + } + }, + { + "model_id": "qwen/qwen2.5-72b-instruct-turbo", + "name": "Qwen2.5 Instruct Turbo 72B", + "developer": "qwen", + "scores": { + "Mean win rate": 0.745, + "NarrativeQA": 0.745, + "NaturalQuestions (closed-book)": 0.359, + "OpenbookQA": 0.962, + "MMLU": 0.77, + "MATH": 0.884, + "GSM8K": 0.9, + "LegalBench": 0.74, + "MedQA": 0.753, + "WMT 2014": 0.207 + } + }, + { + "model_id": "qwen/qwen2.5-7b-instruct-turbo", + "name": "Qwen2.5 Instruct Turbo 7B", + "developer": "qwen", + "scores": { + "Mean win rate": 0.488, + "NarrativeQA": 0.742, + "NaturalQuestions (closed-book)": 0.205, + "OpenbookQA": 0.862, + "MMLU": 0.658, + "MATH": 0.835, + "GSM8K": 0.83, + "LegalBench": 0.632, + "MedQA": 0.6, + "WMT 2014": 0.155 + } + }, + { + "model_id": "snowflake/snowflake-arctic-instruct", + "name": "Arctic Instruct", + "developer": "snowflake", + "scores": { + "Mean win rate": 0.338, + "NarrativeQA": 0.654, + "NaturalQuestions (closed-book)": 0.39, + "OpenbookQA": 0.828, + "MMLU": 0.575, + "MATH": 0.519, + "GSM8K": 0.768, + "LegalBench": 0.588, + "MedQA": 0.581, + "WMT 2014": 0.172 + } + }, + { + "model_id": "tiiuae/falcon-40b", + "name": "Falcon 40B", + "developer": "tiiuae", + "scores": { + "Mean win rate": 0.217, + "NarrativeQA": 0.671, + "NaturalQuestions (closed-book)": 0.392, + "OpenbookQA": 0.662, + "MMLU": 0.507, + "MATH": 0.128, + "GSM8K": 0.267, + "LegalBench": 0.442, + "MedQA": 0.419, + "WMT 2014": 0.162 + } + }, + { + "model_id": "tiiuae/falcon-7b", + "name": "Falcon 7B", + "developer": "tiiuae", + "scores": { + "Mean win rate": 0.064, + "NarrativeQA": 0.621, + "NaturalQuestions (closed-book)": 0.285, + "OpenbookQA": 0.26, + "MMLU": 0.288, + "MATH": 0.044, + "GSM8K": 0.055, + "LegalBench": 0.346, + "MedQA": 0.254, + "WMT 2014": 0.094 + } + }, + { + "model_id": "upstage/solar-pro-241126", + "name": "Solar Pro", + "developer": "upstage", + "scores": { + "Mean win rate": 0.602, + "NarrativeQA": 0.753, + "NaturalQuestions (closed-book)": 0.297, + "OpenbookQA": 0.922, + "MMLU": 0.679, + "MATH": 0.567, + "GSM8K": 0.871, + "LegalBench": 0.67, + "MedQA": 0.698, + "WMT 2014": 0.169 + } + }, + { + "model_id": "writer/palmyra-x-004", + "name": "Palmyra-X-004", + "developer": "writer", + "scores": { + "Mean win rate": 0.808, + "NarrativeQA": 0.773, + "NaturalQuestions (closed-book)": 0.457, + "OpenbookQA": 0.926, + "MMLU": 0.739, + "MATH": 0.767, + "GSM8K": 0.905, + "LegalBench": 0.73, + "MedQA": 0.775, + "WMT 2014": 0.203 + } + }, + { + "model_id": "writer/palmyra-x-v2", + "name": "Palmyra X V2 33B", + "developer": "writer", + "scores": { + "Mean win rate": 0.589, + "NarrativeQA": 0.752, + "NaturalQuestions (closed-book)": 0.428, + "OpenbookQA": 0.878, + "MMLU": 0.621, + "MATH": 0.58, + "GSM8K": 0.735, + "LegalBench": 0.644, + "MedQA": 0.598, + "WMT 2014": 0.239 + } + }, + { + "model_id": "writer/palmyra-x-v3", + "name": "Palmyra X V3 72B", + "developer": "writer", + "scores": { + "Mean win rate": 0.679, + "NarrativeQA": 0.706, + "NaturalQuestions (closed-book)": 0.407, + "OpenbookQA": 0.938, + "MMLU": 0.702, + "MATH": 0.723, + "GSM8K": 0.831, + "LegalBench": 0.709, + "MedQA": 0.684, + "WMT 2014": 0.262 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/helm_mmlu.json b/data/benchmarks/helm_mmlu.json new file mode 100644 index 0000000000000000000000000000000000000000..d02fd683e129eb8ae137f45f82465cf48a4fd5ce --- /dev/null +++ b/data/benchmarks/helm_mmlu.json @@ -0,0 +1,3401 @@ +{ + "models": [ + { + "model_id": "01-ai/yi-34b", + "name": "Yi 34B", + "developer": "01-ai", + "scores": { + "MMLU All Subjects": 0.762, + "Abstract Algebra": 0.4, + "Anatomy": 0.748, + "College Physics": 0.5, + "Computer Security": 0.83, + "Econometrics": 0.588, + "Global Facts": 0.53, + "Jurisprudence": 0.898, + "Philosophy": 0.82, + "Professional Psychology": 0.835, + "Us Foreign Policy": 0.91, + "Astronomy": 0.901, + "Business Ethics": 0.75, + "Clinical Knowledge": 0.8, + "Conceptual Physics": 0.77, + "Electrical Engineering": 0.779, + "Elementary Mathematics": 0.656, + "Formal Logic": 0.548, + "High School World History": 0.907, + "Human Sexuality": 0.87, + "International Law": 0.909, + "Logical Fallacies": 0.883, + "Machine Learning": 0.58, + "Management": 0.893, + "Marketing": 0.936, + "Medical Genetics": 0.87, + "Miscellaneous": 0.902, + "Moral Scenarios": 0.606, + "Nutrition": 0.869, + "Prehistory": 0.877, + "Public Relations": 0.745, + "Security Studies": 0.833, + "Sociology": 0.9, + "Virology": 0.572, + "World Religions": 0.877, + "Mean win rate": 0.315 + } + }, + { + "model_id": "01-ai/yi-6b", + "name": "Yi 6B", + "developer": "01-ai", + "scores": { + "MMLU All Subjects": 0.64, + "Abstract Algebra": 0.3, + "Anatomy": 0.6, + "College Physics": 0.422, + "Computer Security": 0.73, + "Econometrics": 0.351, + "Global Facts": 0.43, + "Jurisprudence": 0.796, + "Philosophy": 0.678, + "Professional Psychology": 0.668, + "Us Foreign Policy": 0.87, + "Astronomy": 0.684, + "Business Ethics": 0.67, + "Clinical Knowledge": 0.66, + "Conceptual Physics": 0.621, + "Electrical Engineering": 0.662, + "Elementary Mathematics": 0.452, + "Formal Logic": 0.452, + "High School World History": 0.785, + "Human Sexuality": 0.763, + "International Law": 0.769, + "Logical Fallacies": 0.779, + "Machine Learning": 0.411, + "Management": 0.806, + "Marketing": 0.893, + "Medical Genetics": 0.77, + "Miscellaneous": 0.796, + "Moral Scenarios": 0.335, + "Nutrition": 0.739, + "Prehistory": 0.713, + "Public Relations": 0.718, + "Security Studies": 0.735, + "Sociology": 0.831, + "Virology": 0.452, + "World Religions": 0.836, + "Mean win rate": 0.651 + } + }, + { + "model_id": "01-ai/yi-large-preview", + "name": "Yi Large Preview", + "developer": "01-ai", + "scores": { + "MMLU All Subjects": 0.793, + "Abstract Algebra": 0.6, + "Anatomy": 0.83, + "College Physics": 0.569, + "Computer Security": 0.86, + "Econometrics": 0.728, + "Global Facts": 0.52, + "Jurisprudence": 0.852, + "Philosophy": 0.842, + "Professional Psychology": 0.853, + "Us Foreign Policy": 0.85, + "Astronomy": 0.914, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.857, + "Conceptual Physics": 0.864, + "Electrical Engineering": 0.779, + "Elementary Mathematics": 0.685, + "Formal Logic": 0.603, + "High School World History": 0.928, + "Human Sexuality": 0.901, + "International Law": 0.917, + "Logical Fallacies": 0.865, + "Machine Learning": 0.616, + "Management": 0.903, + "Marketing": 0.927, + "Medical Genetics": 0.83, + "Miscellaneous": 0.916, + "Moral Scenarios": 0.831, + "Nutrition": 0.846, + "Prehistory": 0.892, + "Public Relations": 0.827, + "Security Studies": 0.82, + "Sociology": 0.881, + "Virology": 0.59, + "World Religions": 0.871, + "Mean win rate": 0.258 + } + }, + { + "model_id": "ai21/jamba-1.5-large", + "name": "Jamba 1.5 Large", + "developer": "ai21", + "scores": { + "MMLU All Subjects": 0.782, + "Abstract Algebra": 0.53, + "Anatomy": 0.793, + "College Physics": 0.51, + "Computer Security": 0.8, + "Econometrics": 0.614, + "Global Facts": 0.54, + "Jurisprudence": 0.87, + "Philosophy": 0.849, + "Professional Psychology": 0.842, + "Us Foreign Policy": 0.92, + "Astronomy": 0.882, + "Business Ethics": 0.77, + "Clinical Knowledge": 0.849, + "Conceptual Physics": 0.779, + "Electrical Engineering": 0.793, + "Elementary Mathematics": 0.656, + "Formal Logic": 0.619, + "High School World History": 0.911, + "Human Sexuality": 0.832, + "International Law": 0.884, + "Logical Fallacies": 0.859, + "Machine Learning": 0.688, + "Management": 0.864, + "Marketing": 0.94, + "Medical Genetics": 0.89, + "Miscellaneous": 0.931, + "Moral Scenarios": 0.686, + "Nutrition": 0.869, + "Prehistory": 0.892, + "Public Relations": 0.755, + "Security Studies": 0.771, + "Sociology": 0.93, + "Virology": 0.554, + "World Religions": 0.865, + "Mean win rate": 0.147 + } + }, + { + "model_id": "ai21/jamba-1.5-mini", + "name": "Jamba 1.5 Mini", + "developer": "ai21", + "scores": { + "MMLU All Subjects": 0.699, + "Abstract Algebra": 0.33, + "Anatomy": 0.711, + "College Physics": 0.48, + "Computer Security": 0.73, + "Econometrics": 0.491, + "Global Facts": 0.43, + "Jurisprudence": 0.88, + "Philosophy": 0.752, + "Professional Psychology": 0.76, + "Us Foreign Policy": 0.9, + "Astronomy": 0.822, + "Business Ethics": 0.76, + "Clinical Knowledge": 0.74, + "Conceptual Physics": 0.677, + "Electrical Engineering": 0.683, + "Elementary Mathematics": 0.553, + "Formal Logic": 0.452, + "High School World History": 0.84, + "Human Sexuality": 0.809, + "International Law": 0.893, + "Logical Fallacies": 0.81, + "Machine Learning": 0.509, + "Management": 0.825, + "Marketing": 0.915, + "Medical Genetics": 0.69, + "Miscellaneous": 0.902, + "Moral Scenarios": 0.269, + "Nutrition": 0.801, + "Prehistory": 0.824, + "Public Relations": 0.727, + "Security Studies": 0.755, + "Sociology": 0.876, + "Virology": 0.578, + "World Religions": 0.842, + "Mean win rate": 0.206 + } + }, + { + "model_id": "ai21/jamba-instruct", + "name": "Jamba Instruct", + "developer": "ai21", + "scores": { + "MMLU All Subjects": 0.659, + "Abstract Algebra": 0.36, + "Anatomy": 0.615, + "College Physics": 0.422, + "Computer Security": 0.76, + "Econometrics": 0.439, + "Global Facts": 0.4, + "Jurisprudence": 0.796, + "Philosophy": 0.749, + "Professional Psychology": 0.716, + "Us Foreign Policy": 0.91, + "Astronomy": 0.73, + "Business Ethics": 0.6, + "Clinical Knowledge": 0.702, + "Conceptual Physics": 0.677, + "Electrical Engineering": 0.621, + "Elementary Mathematics": 0.497, + "Formal Logic": 0.444, + "High School World History": 0.797, + "Human Sexuality": 0.794, + "International Law": 0.835, + "Logical Fallacies": 0.706, + "Machine Learning": 0.536, + "Management": 0.786, + "Marketing": 0.885, + "Medical Genetics": 0.67, + "Miscellaneous": 0.865, + "Moral Scenarios": 0.465, + "Nutrition": 0.745, + "Prehistory": 0.796, + "Public Relations": 0.682, + "Security Studies": 0.743, + "Sociology": 0.891, + "Virology": 0.53, + "World Religions": 0.813, + "Mean win rate": 0.887 + } + }, + { + "model_id": "allenai/olmo-1.7-7b", + "name": "OLMo 1.7 7B", + "developer": "allenai", + "scores": { + "MMLU All Subjects": 0.538, + "Abstract Algebra": 0.33, + "Anatomy": 0.496, + "College Physics": 0.333, + "Computer Security": 0.65, + "Econometrics": 0.404, + "Global Facts": 0.34, + "Jurisprudence": 0.565, + "Philosophy": 0.592, + "Professional Psychology": 0.526, + "Us Foreign Policy": 0.76, + "Astronomy": 0.526, + "Business Ethics": 0.59, + "Clinical Knowledge": 0.57, + "Conceptual Physics": 0.434, + "Electrical Engineering": 0.517, + "Elementary Mathematics": 0.307, + "Formal Logic": 0.325, + "High School World History": 0.713, + "Human Sexuality": 0.595, + "International Law": 0.612, + "Logical Fallacies": 0.607, + "Machine Learning": 0.375, + "Management": 0.689, + "Marketing": 0.769, + "Medical Genetics": 0.56, + "Miscellaneous": 0.734, + "Moral Scenarios": 0.335, + "Nutrition": 0.608, + "Prehistory": 0.593, + "Public Relations": 0.6, + "Security Studies": 0.522, + "Sociology": 0.751, + "Virology": 0.452, + "World Religions": 0.731, + "Mean win rate": 0.196 + } + }, + { + "model_id": "allenai/olmo-7b", + "name": "OLMo 7B", + "developer": "allenai", + "scores": { + "MMLU All Subjects": 0.295, + "Abstract Algebra": 0.26, + "Anatomy": 0.222, + "College Physics": 0.294, + "Computer Security": 0.3, + "Econometrics": 0.325, + "Global Facts": 0.32, + "Jurisprudence": 0.25, + "Philosophy": 0.325, + "Professional Psychology": 0.232, + "Us Foreign Policy": 0.26, + "Astronomy": 0.342, + "Business Ethics": 0.24, + "Clinical Knowledge": 0.26, + "Conceptual Physics": 0.319, + "Electrical Engineering": 0.29, + "Elementary Mathematics": 0.254, + "Formal Logic": 0.278, + "High School World History": 0.253, + "Human Sexuality": 0.267, + "International Law": 0.306, + "Logical Fallacies": 0.264, + "Machine Learning": 0.286, + "Management": 0.272, + "Marketing": 0.269, + "Medical Genetics": 0.28, + "Miscellaneous": 0.292, + "Moral Scenarios": 0.265, + "Nutrition": 0.34, + "Prehistory": 0.318, + "Public Relations": 0.345, + "Security Studies": 0.408, + "Sociology": 0.383, + "Virology": 0.416, + "World Religions": 0.234, + "Mean win rate": 0.68 + } + }, + { + "model_id": "amazon/nova-lite-v1:0", + "name": "Amazon Nova Lite", + "developer": "amazon", + "scores": { + "MMLU All Subjects": 0.77, + "Abstract Algebra": 0.52, + "Anatomy": 0.719, + "College Physics": 0.608, + "Computer Security": 0.79, + "Econometrics": 0.675, + "Global Facts": 0.55, + "Jurisprudence": 0.852, + "Philosophy": 0.817, + "Professional Psychology": 0.812, + "Us Foreign Policy": 0.92, + "Astronomy": 0.862, + "Business Ethics": 0.73, + "Clinical Knowledge": 0.8, + "Conceptual Physics": 0.796, + "Electrical Engineering": 0.779, + "Elementary Mathematics": 0.757, + "Formal Logic": 0.643, + "High School World History": 0.886, + "Human Sexuality": 0.84, + "International Law": 0.843, + "Logical Fallacies": 0.81, + "Machine Learning": 0.509, + "Management": 0.864, + "Marketing": 0.889, + "Medical Genetics": 0.9, + "Miscellaneous": 0.872, + "Moral Scenarios": 0.694, + "Nutrition": 0.788, + "Prehistory": 0.849, + "Public Relations": 0.682, + "Security Studies": 0.788, + "Sociology": 0.896, + "Virology": 0.542, + "World Religions": 0.871, + "Mean win rate": 0.987 + } + }, + { + "model_id": "amazon/nova-micro-v1:0", + "name": "Amazon Nova Micro", + "developer": "amazon", + "scores": { + "MMLU All Subjects": 0.708, + "Abstract Algebra": 0.42, + "Anatomy": 0.726, + "College Physics": 0.5, + "Computer Security": 0.77, + "Econometrics": 0.57, + "Global Facts": 0.44, + "Jurisprudence": 0.815, + "Philosophy": 0.733, + "Professional Psychology": 0.739, + "Us Foreign Policy": 0.9, + "Astronomy": 0.822, + "Business Ethics": 0.71, + "Clinical Knowledge": 0.751, + "Conceptual Physics": 0.706, + "Electrical Engineering": 0.683, + "Elementary Mathematics": 0.55, + "Formal Logic": 0.508, + "High School World History": 0.84, + "Human Sexuality": 0.824, + "International Law": 0.843, + "Logical Fallacies": 0.798, + "Machine Learning": 0.562, + "Management": 0.816, + "Marketing": 0.91, + "Medical Genetics": 0.82, + "Miscellaneous": 0.83, + "Moral Scenarios": 0.464, + "Nutrition": 0.778, + "Prehistory": 0.787, + "Public Relations": 0.673, + "Security Studies": 0.718, + "Sociology": 0.846, + "Virology": 0.524, + "World Religions": 0.825, + "Mean win rate": 1.0 + } + }, + { + "model_id": "amazon/nova-pro-v1:0", + "name": "Amazon Nova Pro", + "developer": "amazon", + "scores": { + "MMLU All Subjects": 0.82, + "Abstract Algebra": 0.69, + "Anatomy": 0.807, + "College Physics": 0.647, + "Computer Security": 0.84, + "Econometrics": 0.702, + "Global Facts": 0.54, + "Jurisprudence": 0.861, + "Philosophy": 0.826, + "Professional Psychology": 0.864, + "Us Foreign Policy": 0.93, + "Astronomy": 0.895, + "Business Ethics": 0.81, + "Clinical Knowledge": 0.875, + "Conceptual Physics": 0.851, + "Electrical Engineering": 0.8, + "Elementary Mathematics": 0.831, + "Formal Logic": 0.714, + "High School World History": 0.928, + "Human Sexuality": 0.885, + "International Law": 0.901, + "Logical Fallacies": 0.871, + "Machine Learning": 0.625, + "Management": 0.922, + "Marketing": 0.923, + "Medical Genetics": 0.87, + "Miscellaneous": 0.912, + "Moral Scenarios": 0.76, + "Nutrition": 0.866, + "Prehistory": 0.926, + "Public Relations": 0.8, + "Security Studies": 0.849, + "Sociology": 0.905, + "Virology": 0.59, + "World Religions": 0.877, + "Mean win rate": 0.975 + } + }, + { + "model_id": "anthropic/claude-2.1", + "name": "Claude 2.1", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.735, + "Abstract Algebra": 0.4, + "Anatomy": 0.726, + "College Physics": 0.5, + "Computer Security": 0.81, + "Econometrics": 0.596, + "Global Facts": 0.55, + "Jurisprudence": 0.87, + "Philosophy": 0.794, + "Professional Psychology": 0.797, + "Us Foreign Policy": 0.92, + "Astronomy": 0.855, + "Business Ethics": 0.73, + "Clinical Knowledge": 0.785, + "Conceptual Physics": 0.766, + "Electrical Engineering": 0.724, + "Elementary Mathematics": 0.521, + "Formal Logic": 0.5, + "High School World History": 0.903, + "Human Sexuality": 0.847, + "International Law": 0.901, + "Logical Fallacies": 0.834, + "Machine Learning": 0.482, + "Management": 0.825, + "Marketing": 0.923, + "Medical Genetics": 0.81, + "Miscellaneous": 0.88, + "Moral Scenarios": 0.52, + "Nutrition": 0.781, + "Prehistory": 0.821, + "Public Relations": 0.773, + "Security Studies": 0.812, + "Sociology": 0.886, + "Virology": 0.554, + "World Religions": 0.854, + "Mean win rate": 0.048 + } + }, + { + "model_id": "anthropic/claude-3-5-haiku-20241022", + "name": "claude-3-5-haiku-20241022", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.743, + "Abstract Algebra": 0.47, + "Anatomy": 0.793, + "College Physics": 0.52, + "Computer Security": 0.84, + "Econometrics": 0.596, + "Global Facts": 0.5, + "Jurisprudence": 0.861, + "Philosophy": 0.823, + "Professional Psychology": 0.825, + "Us Foreign Policy": 0.94, + "Astronomy": 0.829, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.823, + "Conceptual Physics": 0.723, + "Electrical Engineering": 0.717, + "Elementary Mathematics": 0.561, + "Formal Logic": 0.619, + "High School World History": 0.882, + "Human Sexuality": 0.885, + "International Law": 0.884, + "Logical Fallacies": 0.822, + "Machine Learning": 0.518, + "Management": 0.845, + "Marketing": 0.897, + "Medical Genetics": 0.83, + "Miscellaneous": 0.905, + "Moral Scenarios": 0.476, + "Nutrition": 0.846, + "Prehistory": 0.877, + "Public Relations": 0.727, + "Security Studies": 0.792, + "Sociology": 0.905, + "Virology": 0.566, + "World Religions": 0.865, + "Mean win rate": 0.128 + } + }, + { + "model_id": "anthropic/claude-3-5-sonnet-20240620", + "name": "Claude 3.5 Sonnet 20240620", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.865, + "Abstract Algebra": 0.75, + "Anatomy": 0.844, + "College Physics": 0.696, + "Computer Security": 0.89, + "Econometrics": 0.807, + "Global Facts": 0.72, + "Jurisprudence": 0.889, + "Philosophy": 0.891, + "Professional Psychology": 0.922, + "Us Foreign Policy": 0.96, + "Astronomy": 0.961, + "Business Ethics": 0.85, + "Clinical Knowledge": 0.913, + "Conceptual Physics": 0.885, + "Electrical Engineering": 0.828, + "Elementary Mathematics": 0.892, + "Formal Logic": 0.698, + "High School World History": 0.954, + "Human Sexuality": 0.939, + "International Law": 0.959, + "Logical Fallacies": 0.926, + "Machine Learning": 0.786, + "Management": 0.942, + "Marketing": 0.949, + "Medical Genetics": 0.98, + "Miscellaneous": 0.962, + "Moral Scenarios": 0.882, + "Nutrition": 0.912, + "Prehistory": 0.951, + "Public Relations": 0.855, + "Security Studies": 0.878, + "Sociology": 0.96, + "Virology": 0.602, + "World Religions": 0.924, + "Mean win rate": 0.17 + } + }, + { + "model_id": "anthropic/claude-3-5-sonnet-20241022", + "name": "Claude 3.5 Sonnet 20241022", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.873, + "Abstract Algebra": 0.78, + "Anatomy": 0.859, + "College Physics": 0.775, + "Computer Security": 0.87, + "Econometrics": 0.807, + "Global Facts": 0.8, + "Jurisprudence": 0.898, + "Philosophy": 0.891, + "Professional Psychology": 0.922, + "Us Foreign Policy": 0.96, + "Astronomy": 0.974, + "Business Ethics": 0.83, + "Clinical Knowledge": 0.928, + "Conceptual Physics": 0.906, + "Electrical Engineering": 0.848, + "Elementary Mathematics": 0.918, + "Formal Logic": 0.786, + "High School World History": 0.958, + "Human Sexuality": 0.939, + "International Law": 0.959, + "Logical Fallacies": 0.914, + "Machine Learning": 0.839, + "Management": 0.932, + "Marketing": 0.953, + "Medical Genetics": 0.96, + "Miscellaneous": 0.964, + "Moral Scenarios": 0.888, + "Nutrition": 0.922, + "Prehistory": 0.941, + "Public Relations": 0.8, + "Security Studies": 0.882, + "Sociology": 0.955, + "Virology": 0.584, + "World Religions": 0.901, + "Mean win rate": 0.311 + } + }, + { + "model_id": "anthropic/claude-3-haiku-20240307", + "name": "Claude 3 Haiku 20240307", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.738, + "Abstract Algebra": 0.42, + "Anatomy": 0.711, + "College Physics": 0.48, + "Computer Security": 0.79, + "Econometrics": 0.632, + "Global Facts": 0.47, + "Jurisprudence": 0.861, + "Philosophy": 0.814, + "Professional Psychology": 0.802, + "Us Foreign Policy": 0.95, + "Astronomy": 0.901, + "Business Ethics": 0.78, + "Clinical Knowledge": 0.789, + "Conceptual Physics": 0.715, + "Electrical Engineering": 0.69, + "Elementary Mathematics": 0.558, + "Formal Logic": 0.579, + "High School World History": 0.878, + "Human Sexuality": 0.824, + "International Law": 0.901, + "Logical Fallacies": 0.791, + "Machine Learning": 0.589, + "Management": 0.874, + "Marketing": 0.91, + "Medical Genetics": 0.8, + "Miscellaneous": 0.893, + "Moral Scenarios": 0.502, + "Nutrition": 0.83, + "Prehistory": 0.824, + "Public Relations": 0.755, + "Security Studies": 0.808, + "Sociology": 0.9, + "Virology": 0.542, + "World Religions": 0.871, + "Mean win rate": 0.28 + } + }, + { + "model_id": "anthropic/claude-3-opus-20240229", + "name": "Claude 3 Opus 20240229", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.846, + "Abstract Algebra": 0.64, + "Anatomy": 0.8, + "College Physics": 0.716, + "Computer Security": 0.85, + "Econometrics": 0.789, + "Global Facts": 0.66, + "Jurisprudence": 0.88, + "Philosophy": 0.9, + "Professional Psychology": 0.904, + "Us Foreign Policy": 0.96, + "Astronomy": 0.967, + "Business Ethics": 0.86, + "Clinical Knowledge": 0.879, + "Conceptual Physics": 0.881, + "Electrical Engineering": 0.814, + "Elementary Mathematics": 0.862, + "Formal Logic": 0.698, + "High School World History": 0.941, + "Human Sexuality": 0.908, + "International Law": 0.901, + "Logical Fallacies": 0.896, + "Machine Learning": 0.741, + "Management": 0.942, + "Marketing": 0.944, + "Medical Genetics": 0.93, + "Miscellaneous": 0.951, + "Moral Scenarios": 0.826, + "Nutrition": 0.925, + "Prehistory": 0.941, + "Public Relations": 0.827, + "Security Studies": 0.886, + "Sociology": 0.94, + "Virology": 0.578, + "World Religions": 0.901, + "Mean win rate": 0.014 + } + }, + { + "model_id": "anthropic/claude-3-sonnet-20240229", + "name": "Claude 3 Sonnet 20240229", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.759, + "Abstract Algebra": 0.39, + "Anatomy": 0.711, + "College Physics": 0.559, + "Computer Security": 0.79, + "Econometrics": 0.64, + "Global Facts": 0.53, + "Jurisprudence": 0.861, + "Philosophy": 0.852, + "Professional Psychology": 0.814, + "Us Foreign Policy": 0.94, + "Astronomy": 0.855, + "Business Ethics": 0.82, + "Clinical Knowledge": 0.804, + "Conceptual Physics": 0.774, + "Electrical Engineering": 0.703, + "Elementary Mathematics": 0.635, + "Formal Logic": 0.579, + "High School World History": 0.895, + "Human Sexuality": 0.809, + "International Law": 0.909, + "Logical Fallacies": 0.853, + "Machine Learning": 0.643, + "Management": 0.922, + "Marketing": 0.85, + "Medical Genetics": 0.79, + "Miscellaneous": 0.872, + "Moral Scenarios": 0.626, + "Nutrition": 0.82, + "Prehistory": 0.864, + "Public Relations": 0.782, + "Security Studies": 0.865, + "Sociology": 0.905, + "Virology": 0.578, + "World Religions": 0.871, + "Mean win rate": 0.082 + } + }, + { + "model_id": "anthropic/claude-instant-1.2", + "name": "Claude Instant 1.2", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.688, + "Abstract Algebra": 0.37, + "Anatomy": 0.637, + "College Physics": 0.49, + "Computer Security": 0.76, + "Econometrics": 0.614, + "Global Facts": 0.38, + "Jurisprudence": 0.833, + "Philosophy": 0.756, + "Professional Psychology": 0.724, + "Us Foreign Policy": 0.9, + "Astronomy": 0.743, + "Business Ethics": 0.7, + "Clinical Knowledge": 0.709, + "Conceptual Physics": 0.613, + "Electrical Engineering": 0.641, + "Elementary Mathematics": 0.45, + "Formal Logic": 0.444, + "High School World History": 0.878, + "Human Sexuality": 0.794, + "International Law": 0.851, + "Logical Fallacies": 0.81, + "Machine Learning": 0.67, + "Management": 0.835, + "Marketing": 0.885, + "Medical Genetics": 0.71, + "Miscellaneous": 0.828, + "Moral Scenarios": 0.488, + "Nutrition": 0.735, + "Prehistory": 0.762, + "Public Relations": 0.627, + "Security Studies": 0.784, + "Sociology": 0.841, + "Virology": 0.548, + "World Religions": 0.784, + "Mean win rate": 0.186 + } + }, + { + "model_id": "cohere/command-r", + "name": "Command R", + "developer": "cohere", + "scores": { + "MMLU All Subjects": 0.652, + "Abstract Algebra": 0.33, + "Anatomy": 0.615, + "College Physics": 0.382, + "Computer Security": 0.78, + "Econometrics": 0.456, + "Global Facts": 0.42, + "Jurisprudence": 0.796, + "Philosophy": 0.685, + "Professional Psychology": 0.681, + "Us Foreign Policy": 0.82, + "Astronomy": 0.743, + "Business Ethics": 0.63, + "Clinical Knowledge": 0.751, + "Conceptual Physics": 0.528, + "Electrical Engineering": 0.593, + "Elementary Mathematics": 0.437, + "Formal Logic": 0.405, + "High School World History": 0.84, + "Human Sexuality": 0.763, + "International Law": 0.802, + "Logical Fallacies": 0.798, + "Machine Learning": 0.446, + "Management": 0.796, + "Marketing": 0.872, + "Medical Genetics": 0.81, + "Miscellaneous": 0.848, + "Moral Scenarios": 0.451, + "Nutrition": 0.703, + "Prehistory": 0.728, + "Public Relations": 0.7, + "Security Studies": 0.714, + "Sociology": 0.866, + "Virology": 0.542, + "World Religions": 0.813, + "Mean win rate": 0.959 + } + }, + { + "model_id": "cohere/command-r-plus", + "name": "Command R Plus", + "developer": "cohere", + "scores": { + "MMLU All Subjects": 0.694, + "Abstract Algebra": 0.21, + "Anatomy": 0.644, + "College Physics": 0.52, + "Computer Security": 0.74, + "Econometrics": 0.561, + "Global Facts": 0.5, + "Jurisprudence": 0.806, + "Philosophy": 0.695, + "Professional Psychology": 0.735, + "Us Foreign Policy": 0.89, + "Astronomy": 0.783, + "Business Ethics": 0.77, + "Clinical Knowledge": 0.743, + "Conceptual Physics": 0.591, + "Electrical Engineering": 0.71, + "Elementary Mathematics": 0.474, + "Formal Logic": 0.484, + "High School World History": 0.827, + "Human Sexuality": 0.786, + "International Law": 0.835, + "Logical Fallacies": 0.791, + "Machine Learning": 0.518, + "Management": 0.835, + "Marketing": 0.927, + "Medical Genetics": 0.77, + "Miscellaneous": 0.844, + "Moral Scenarios": 0.585, + "Nutrition": 0.742, + "Prehistory": 0.821, + "Public Relations": 0.709, + "Security Studies": 0.751, + "Sociology": 0.876, + "Virology": 0.56, + "World Religions": 0.842, + "Mean win rate": 0.825 + } + }, + { + "model_id": "databricks/dbrx-instruct", + "name": "DBRX Instruct", + "developer": "databricks", + "scores": { + "MMLU All Subjects": 0.741, + "Abstract Algebra": 0.34, + "Anatomy": 0.667, + "College Physics": 0.539, + "Computer Security": 0.83, + "Econometrics": 0.605, + "Global Facts": 0.46, + "Jurisprudence": 0.843, + "Philosophy": 0.804, + "Professional Psychology": 0.801, + "Us Foreign Policy": 0.93, + "Astronomy": 0.836, + "Business Ethics": 0.78, + "Clinical Knowledge": 0.789, + "Conceptual Physics": 0.74, + "Electrical Engineering": 0.71, + "Elementary Mathematics": 0.563, + "Formal Logic": 0.563, + "High School World History": 0.903, + "Human Sexuality": 0.878, + "International Law": 0.884, + "Logical Fallacies": 0.847, + "Machine Learning": 0.625, + "Management": 0.854, + "Marketing": 0.94, + "Medical Genetics": 0.85, + "Miscellaneous": 0.911, + "Moral Scenarios": 0.465, + "Nutrition": 0.814, + "Prehistory": 0.84, + "Public Relations": 0.691, + "Security Studies": 0.804, + "Sociology": 0.896, + "Virology": 0.566, + "World Religions": 0.871, + "Mean win rate": 0.537 + } + }, + { + "model_id": "deepseek-ai/deepseek-llm-67b-chat", + "name": "DeepSeek LLM Chat 67B", + "developer": "deepseek-ai", + "scores": { + "MMLU All Subjects": 0.725, + "Abstract Algebra": 0.44, + "Anatomy": 0.667, + "College Physics": 0.363, + "Computer Security": 0.79, + "Econometrics": 0.553, + "Global Facts": 0.46, + "Jurisprudence": 0.852, + "Philosophy": 0.801, + "Professional Psychology": 0.809, + "Us Foreign Policy": 0.91, + "Astronomy": 0.822, + "Business Ethics": 0.86, + "Clinical Knowledge": 0.785, + "Conceptual Physics": 0.723, + "Electrical Engineering": 0.669, + "Elementary Mathematics": 0.548, + "Formal Logic": 0.548, + "High School World History": 0.911, + "Human Sexuality": 0.84, + "International Law": 0.851, + "Logical Fallacies": 0.847, + "Machine Learning": 0.562, + "Management": 0.903, + "Marketing": 0.923, + "Medical Genetics": 0.73, + "Miscellaneous": 0.904, + "Moral Scenarios": 0.544, + "Nutrition": 0.781, + "Prehistory": 0.858, + "Public Relations": 0.7, + "Security Studies": 0.796, + "Sociology": 0.876, + "Virology": 0.554, + "World Religions": 0.865, + "Mean win rate": 0.387 + } + }, + { + "model_id": "deepseek-ai/deepseek-v3", + "name": "DeepSeek v3", + "developer": "deepseek-ai", + "scores": { + "MMLU All Subjects": 0.872, + "Abstract Algebra": 0.84, + "Anatomy": 0.867, + "College Physics": 0.814, + "Computer Security": 0.86, + "Econometrics": 0.746, + "Global Facts": 0.68, + "Jurisprudence": 0.898, + "Philosophy": 0.9, + "Professional Psychology": 0.887, + "Us Foreign Policy": 0.92, + "Astronomy": 0.921, + "Business Ethics": 0.89, + "Clinical Knowledge": 0.913, + "Conceptual Physics": 0.94, + "Electrical Engineering": 0.869, + "Elementary Mathematics": 0.942, + "Formal Logic": 0.77, + "High School World History": 0.928, + "Human Sexuality": 0.924, + "International Law": 0.95, + "Logical Fallacies": 0.914, + "Machine Learning": 0.786, + "Management": 0.903, + "Marketing": 0.949, + "Medical Genetics": 0.96, + "Miscellaneous": 0.949, + "Moral Scenarios": 0.808, + "Nutrition": 0.918, + "Prehistory": 0.923, + "Public Relations": 0.809, + "Security Studies": 0.837, + "Sociology": 0.955, + "Virology": 0.596, + "World Religions": 0.912, + "Mean win rate": 0.215 + } + }, + { + "model_id": "google/gemini-1.0-pro-001", + "name": "Gemini 1.0 Pro 001", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.7, + "Abstract Algebra": 0.34, + "Anatomy": 0.652, + "College Physics": 0.333, + "Computer Security": 0.84, + "Econometrics": 0.553, + "Global Facts": 0.49, + "Jurisprudence": 0.861, + "Philosophy": 0.762, + "Professional Psychology": 0.752, + "Us Foreign Policy": 0.89, + "Astronomy": 0.796, + "Business Ethics": 0.69, + "Clinical Knowledge": 0.758, + "Conceptual Physics": 0.706, + "Electrical Engineering": 0.69, + "Elementary Mathematics": 0.476, + "Formal Logic": 0.468, + "High School World History": 0.865, + "Human Sexuality": 0.618, + "International Law": 0.876, + "Logical Fallacies": 0.804, + "Machine Learning": 0.527, + "Management": 0.845, + "Marketing": 0.91, + "Medical Genetics": 0.8, + "Miscellaneous": 0.851, + "Moral Scenarios": 0.46, + "Nutrition": 0.788, + "Prehistory": 0.802, + "Public Relations": 0.691, + "Security Studies": 0.804, + "Sociology": 0.9, + "Virology": 0.536, + "World Religions": 0.86, + "Mean win rate": 0.677 + } + }, + { + "model_id": "google/gemini-1.5-flash-001", + "name": "Gemini 1.5 Flash 001", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.779, + "Abstract Algebra": 0.58, + "Anatomy": 0.8, + "College Physics": 0.696, + "Computer Security": 0.79, + "Econometrics": 0.614, + "Global Facts": 0.53, + "Jurisprudence": 0.889, + "Philosophy": 0.791, + "Professional Psychology": 0.828, + "Us Foreign Policy": 0.93, + "Astronomy": 0.882, + "Business Ethics": 0.81, + "Clinical Knowledge": 0.834, + "Conceptual Physics": 0.851, + "Electrical Engineering": 0.8, + "Elementary Mathematics": 0.754, + "Formal Logic": 0.627, + "High School World History": 0.907, + "Human Sexuality": 0.374, + "International Law": 0.901, + "Logical Fallacies": 0.853, + "Machine Learning": 0.571, + "Management": 0.864, + "Marketing": 0.94, + "Medical Genetics": 0.86, + "Miscellaneous": 0.886, + "Moral Scenarios": 0.637, + "Nutrition": 0.82, + "Prehistory": 0.867, + "Public Relations": 0.764, + "Security Studies": 0.808, + "Sociology": 0.915, + "Virology": 0.566, + "World Religions": 0.883, + "Mean win rate": 0.47 + } + }, + { + "model_id": "google/gemini-1.5-flash-002", + "name": "Gemini 1.5 Flash 002", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.739, + "Abstract Algebra": 0.63, + "Anatomy": 0.793, + "College Physics": 0.637, + "Computer Security": 0.72, + "Econometrics": 0.675, + "Global Facts": 0.47, + "Jurisprudence": 0.852, + "Philosophy": 0.797, + "Professional Psychology": 0.806, + "Us Foreign Policy": 0.81, + "Astronomy": 0.895, + "Business Ethics": 0.27, + "Clinical Knowledge": 0.792, + "Conceptual Physics": 0.851, + "Electrical Engineering": 0.772, + "Elementary Mathematics": 0.704, + "Formal Logic": 0.595, + "High School World History": 0.869, + "Human Sexuality": 0.847, + "International Law": 0.752, + "Logical Fallacies": 0.859, + "Machine Learning": 0.616, + "Management": 0.893, + "Marketing": 0.953, + "Medical Genetics": 0.89, + "Miscellaneous": 0.9, + "Moral Scenarios": 0.676, + "Nutrition": 0.588, + "Prehistory": 0.762, + "Public Relations": 0.7, + "Security Studies": 0.547, + "Sociology": 0.851, + "Virology": 0.524, + "World Religions": 0.865, + "Mean win rate": 0.817 + } + }, + { + "model_id": "google/gemini-1.5-flash-preview-0514", + "name": "Gemini 1.5 Flash 0514 preview", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.778, + "Abstract Algebra": 0.56, + "Anatomy": 0.807, + "College Physics": 0.667, + "Computer Security": 0.77, + "Econometrics": 0.64, + "Global Facts": 0.55, + "Jurisprudence": 0.889, + "Philosophy": 0.807, + "Professional Psychology": 0.825, + "Us Foreign Policy": 0.93, + "Astronomy": 0.868, + "Business Ethics": 0.82, + "Clinical Knowledge": 0.838, + "Conceptual Physics": 0.855, + "Electrical Engineering": 0.814, + "Elementary Mathematics": 0.778, + "Formal Logic": 0.611, + "High School World History": 0.907, + "Human Sexuality": 0.374, + "International Law": 0.876, + "Logical Fallacies": 0.853, + "Machine Learning": 0.562, + "Management": 0.854, + "Marketing": 0.936, + "Medical Genetics": 0.86, + "Miscellaneous": 0.884, + "Moral Scenarios": 0.631, + "Nutrition": 0.801, + "Prehistory": 0.867, + "Public Relations": 0.773, + "Security Studies": 0.812, + "Sociology": 0.9, + "Virology": 0.566, + "World Religions": 0.871, + "Mean win rate": 0.713 + } + }, + { + "model_id": "google/gemini-1.5-pro-001", + "name": "Gemini 1.5 Pro 001", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.827, + "Abstract Algebra": 0.75, + "Anatomy": 0.83, + "College Physics": 0.745, + "Computer Security": 0.83, + "Econometrics": 0.728, + "Global Facts": 0.66, + "Jurisprudence": 0.889, + "Philosophy": 0.871, + "Professional Psychology": 0.894, + "Us Foreign Policy": 0.93, + "Astronomy": 0.914, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.853, + "Conceptual Physics": 0.949, + "Electrical Engineering": 0.745, + "Elementary Mathematics": 0.939, + "Formal Logic": 0.706, + "High School World History": 0.924, + "Human Sexuality": 0.374, + "International Law": 0.917, + "Logical Fallacies": 0.896, + "Machine Learning": 0.652, + "Management": 0.922, + "Marketing": 0.932, + "Medical Genetics": 0.91, + "Miscellaneous": 0.958, + "Moral Scenarios": 0.739, + "Nutrition": 0.879, + "Prehistory": 0.87, + "Public Relations": 0.818, + "Security Studies": 0.873, + "Sociology": 0.92, + "Virology": 0.554, + "World Religions": 0.854, + "Mean win rate": 0.349 + } + }, + { + "model_id": "google/gemini-1.5-pro-002", + "name": "Gemini 1.5 Pro 002", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.869, + "Abstract Algebra": 0.82, + "Anatomy": 0.83, + "College Physics": 0.863, + "Computer Security": 0.85, + "Econometrics": 0.693, + "Global Facts": 0.77, + "Jurisprudence": 0.898, + "Philosophy": 0.887, + "Professional Psychology": 0.912, + "Us Foreign Policy": 0.94, + "Astronomy": 0.934, + "Business Ethics": 0.84, + "Clinical Knowledge": 0.906, + "Conceptual Physics": 0.945, + "Electrical Engineering": 0.855, + "Elementary Mathematics": 0.942, + "Formal Logic": 0.754, + "High School World History": 0.937, + "Human Sexuality": 0.878, + "International Law": 0.917, + "Logical Fallacies": 0.902, + "Machine Learning": 0.83, + "Management": 0.903, + "Marketing": 0.962, + "Medical Genetics": 0.92, + "Miscellaneous": 0.959, + "Moral Scenarios": 0.792, + "Nutrition": 0.886, + "Prehistory": 0.926, + "Public Relations": 0.809, + "Security Studies": 0.857, + "Sociology": 0.95, + "Virology": 0.566, + "World Religions": 0.889, + "Mean win rate": 0.334 + } + }, + { + "model_id": "google/gemini-1.5-pro-preview-0409", + "name": "Gemini 1.5 Pro 0409 preview", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.81, + "Abstract Algebra": 0.6, + "Anatomy": 0.77, + "College Physics": 0.804, + "Computer Security": 0.81, + "Econometrics": 0.737, + "Global Facts": 0.66, + "Jurisprudence": 0.87, + "Philosophy": 0.846, + "Professional Psychology": 0.866, + "Us Foreign Policy": 0.94, + "Astronomy": 0.914, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.868, + "Conceptual Physics": 0.915, + "Electrical Engineering": 0.772, + "Elementary Mathematics": 0.884, + "Formal Logic": 0.643, + "High School World History": 0.924, + "Human Sexuality": 0.397, + "International Law": 0.917, + "Logical Fallacies": 0.859, + "Machine Learning": 0.67, + "Management": 0.874, + "Marketing": 0.953, + "Medical Genetics": 0.91, + "Miscellaneous": 0.928, + "Moral Scenarios": 0.696, + "Nutrition": 0.846, + "Prehistory": 0.886, + "Public Relations": 0.755, + "Security Studies": 0.849, + "Sociology": 0.925, + "Virology": 0.584, + "World Religions": 0.877, + "Mean win rate": 0.118 + } + }, + { + "model_id": "google/gemini-2.0-flash-exp", + "name": "Gemini 2.0 Flash Experimental", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.797, + "Abstract Algebra": 0.72, + "Anatomy": 0.807, + "College Physics": 0.696, + "Computer Security": 0.83, + "Econometrics": 0.693, + "Global Facts": 0.66, + "Jurisprudence": 0.898, + "Philosophy": 0.887, + "Professional Psychology": 0.876, + "Us Foreign Policy": 0.78, + "Astronomy": 0.928, + "Business Ethics": 0.73, + "Clinical Knowledge": 0.879, + "Conceptual Physics": 0.813, + "Electrical Engineering": 0.834, + "Elementary Mathematics": 0.857, + "Formal Logic": 0.571, + "High School World History": 0.743, + "Human Sexuality": 0.901, + "International Law": 0.645, + "Logical Fallacies": 0.914, + "Machine Learning": 0.759, + "Management": 0.718, + "Marketing": 0.944, + "Medical Genetics": 0.89, + "Miscellaneous": 0.939, + "Moral Scenarios": 0.815, + "Nutrition": 0.856, + "Prehistory": 0.898, + "Public Relations": 0.791, + "Security Studies": 0.69, + "Sociology": 0.786, + "Virology": 0.554, + "World Religions": 0.731, + "Mean win rate": 0.567 + } + }, + { + "model_id": "google/gemma-2-27b", + "name": "Gemma 2 27B", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.757, + "Abstract Algebra": 0.4, + "Anatomy": 0.77, + "College Physics": 0.5, + "Computer Security": 0.84, + "Econometrics": 0.667, + "Global Facts": 0.43, + "Jurisprudence": 0.861, + "Philosophy": 0.849, + "Professional Psychology": 0.84, + "Us Foreign Policy": 0.95, + "Astronomy": 0.829, + "Business Ethics": 0.78, + "Clinical Knowledge": 0.808, + "Conceptual Physics": 0.834, + "Electrical Engineering": 0.738, + "Elementary Mathematics": 0.558, + "Formal Logic": 0.516, + "High School World History": 0.89, + "Human Sexuality": 0.84, + "International Law": 0.843, + "Logical Fallacies": 0.865, + "Machine Learning": 0.625, + "Management": 0.864, + "Marketing": 0.94, + "Medical Genetics": 0.87, + "Miscellaneous": 0.885, + "Moral Scenarios": 0.394, + "Nutrition": 0.824, + "Prehistory": 0.877, + "Public Relations": 0.745, + "Security Studies": 0.808, + "Sociology": 0.9, + "Virology": 0.56, + "World Religions": 0.924, + "Mean win rate": 0.05 + } + }, + { + "model_id": "google/gemma-2-9b", + "name": "Gemma 2 9B", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.721, + "Abstract Algebra": 0.4, + "Anatomy": 0.704, + "College Physics": 0.5, + "Computer Security": 0.81, + "Econometrics": 0.579, + "Global Facts": 0.53, + "Jurisprudence": 0.833, + "Philosophy": 0.772, + "Professional Psychology": 0.788, + "Us Foreign Policy": 0.9, + "Astronomy": 0.789, + "Business Ethics": 0.77, + "Clinical Knowledge": 0.777, + "Conceptual Physics": 0.732, + "Electrical Engineering": 0.724, + "Elementary Mathematics": 0.577, + "Formal Logic": 0.492, + "High School World History": 0.865, + "Human Sexuality": 0.809, + "International Law": 0.835, + "Logical Fallacies": 0.816, + "Machine Learning": 0.509, + "Management": 0.874, + "Marketing": 0.919, + "Medical Genetics": 0.84, + "Miscellaneous": 0.844, + "Moral Scenarios": 0.295, + "Nutrition": 0.775, + "Prehistory": 0.812, + "Public Relations": 0.736, + "Security Studies": 0.78, + "Sociology": 0.9, + "Virology": 0.53, + "World Religions": 0.86, + "Mean win rate": 0.265 + } + }, + { + "model_id": "google/gemma-7b", + "name": "Gemma 7B", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.661, + "Abstract Algebra": 0.28, + "Anatomy": 0.563, + "College Physics": 0.412, + "Computer Security": 0.75, + "Econometrics": 0.474, + "Global Facts": 0.42, + "Jurisprudence": 0.769, + "Philosophy": 0.727, + "Professional Psychology": 0.712, + "Us Foreign Policy": 0.87, + "Astronomy": 0.717, + "Business Ethics": 0.65, + "Clinical Knowledge": 0.698, + "Conceptual Physics": 0.621, + "Electrical Engineering": 0.628, + "Elementary Mathematics": 0.516, + "Formal Logic": 0.508, + "High School World History": 0.857, + "Human Sexuality": 0.733, + "International Law": 0.835, + "Logical Fallacies": 0.742, + "Machine Learning": 0.554, + "Management": 0.864, + "Marketing": 0.885, + "Medical Genetics": 0.7, + "Miscellaneous": 0.838, + "Moral Scenarios": 0.377, + "Nutrition": 0.778, + "Prehistory": 0.756, + "Public Relations": 0.682, + "Security Studies": 0.735, + "Sociology": 0.841, + "Virology": 0.548, + "World Religions": 0.842, + "Mean win rate": 0.824 + } + }, + { + "model_id": "google/text-bison@001", + "name": "PaLM-2 Bison", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.692, + "Abstract Algebra": 0.39, + "Anatomy": 0.644, + "College Physics": 0.51, + "Computer Security": 0.74, + "Econometrics": 0.518, + "Global Facts": 0.38, + "Jurisprudence": 0.769, + "Philosophy": 0.736, + "Professional Psychology": 0.761, + "Us Foreign Policy": 0.87, + "Astronomy": 0.803, + "Business Ethics": 0.76, + "Clinical Knowledge": 0.725, + "Conceptual Physics": 0.694, + "Electrical Engineering": 0.69, + "Elementary Mathematics": 0.487, + "Formal Logic": 0.5, + "High School World History": 0.869, + "Human Sexuality": 0.84, + "International Law": 0.835, + "Logical Fallacies": 0.853, + "Machine Learning": 0.562, + "Management": 0.893, + "Marketing": 0.893, + "Medical Genetics": 0.75, + "Miscellaneous": 0.866, + "Moral Scenarios": 0.369, + "Nutrition": 0.709, + "Prehistory": 0.812, + "Public Relations": 0.691, + "Security Studies": 0.812, + "Sociology": 0.92, + "Virology": 0.494, + "World Religions": 0.883, + "Mean win rate": 0.192 + } + }, + { + "model_id": "google/text-unicorn@001", + "name": "PaLM-2 Unicorn", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.786, + "Abstract Algebra": 0.51, + "Anatomy": 0.733, + "College Physics": 0.549, + "Computer Security": 0.77, + "Econometrics": 0.649, + "Global Facts": 0.53, + "Jurisprudence": 0.88, + "Philosophy": 0.836, + "Professional Psychology": 0.858, + "Us Foreign Policy": 0.96, + "Astronomy": 0.862, + "Business Ethics": 0.83, + "Clinical Knowledge": 0.804, + "Conceptual Physics": 0.809, + "Electrical Engineering": 0.772, + "Elementary Mathematics": 0.661, + "Formal Logic": 0.659, + "High School World History": 0.911, + "Human Sexuality": 0.924, + "International Law": 0.909, + "Logical Fallacies": 0.877, + "Machine Learning": 0.625, + "Management": 0.903, + "Marketing": 0.94, + "Medical Genetics": 0.83, + "Miscellaneous": 0.894, + "Moral Scenarios": 0.562, + "Nutrition": 0.856, + "Prehistory": 0.87, + "Public Relations": 0.773, + "Security Studies": 0.829, + "Sociology": 0.91, + "Virology": 0.572, + "World Religions": 0.877, + "Mean win rate": 0.142 + } + }, + { + "model_id": "meta/llama-2-13b", + "name": "Llama 2 13B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.554, + "Abstract Algebra": 0.27, + "Anatomy": 0.496, + "College Physics": 0.235, + "Computer Security": 0.69, + "Econometrics": 0.307, + "Global Facts": 0.38, + "Jurisprudence": 0.704, + "Philosophy": 0.672, + "Professional Psychology": 0.567, + "Us Foreign Policy": 0.83, + "Astronomy": 0.546, + "Business Ethics": 0.55, + "Clinical Knowledge": 0.592, + "Conceptual Physics": 0.413, + "Electrical Engineering": 0.49, + "Elementary Mathematics": 0.307, + "Formal Logic": 0.381, + "High School World History": 0.705, + "Human Sexuality": 0.618, + "International Law": 0.752, + "Logical Fallacies": 0.687, + "Machine Learning": 0.286, + "Management": 0.738, + "Marketing": 0.786, + "Medical Genetics": 0.57, + "Miscellaneous": 0.748, + "Moral Scenarios": 0.407, + "Nutrition": 0.627, + "Prehistory": 0.654, + "Public Relations": 0.6, + "Security Studies": 0.608, + "Sociology": 0.761, + "Virology": 0.476, + "World Religions": 0.76, + "Mean win rate": 0.502 + } + }, + { + "model_id": "meta/llama-2-70b", + "name": "Llama 2 70B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.695, + "Abstract Algebra": 0.31, + "Anatomy": 0.607, + "College Physics": 0.363, + "Computer Security": 0.77, + "Econometrics": 0.43, + "Global Facts": 0.47, + "Jurisprudence": 0.824, + "Philosophy": 0.791, + "Professional Psychology": 0.76, + "Us Foreign Policy": 0.92, + "Astronomy": 0.829, + "Business Ethics": 0.73, + "Clinical Knowledge": 0.717, + "Conceptual Physics": 0.668, + "Electrical Engineering": 0.634, + "Elementary Mathematics": 0.421, + "Formal Logic": 0.468, + "High School World History": 0.882, + "Human Sexuality": 0.84, + "International Law": 0.868, + "Logical Fallacies": 0.791, + "Machine Learning": 0.491, + "Management": 0.845, + "Marketing": 0.889, + "Medical Genetics": 0.72, + "Miscellaneous": 0.857, + "Moral Scenarios": 0.45, + "Nutrition": 0.758, + "Prehistory": 0.84, + "Public Relations": 0.745, + "Security Studies": 0.796, + "Sociology": 0.9, + "Virology": 0.53, + "World Religions": 0.854, + "Mean win rate": 0.508 + } + }, + { + "model_id": "meta/llama-2-7b", + "name": "Llama 2 7B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.458, + "Abstract Algebra": 0.29, + "Anatomy": 0.452, + "College Physics": 0.196, + "Computer Security": 0.59, + "Econometrics": 0.316, + "Global Facts": 0.29, + "Jurisprudence": 0.519, + "Philosophy": 0.592, + "Professional Psychology": 0.459, + "Us Foreign Policy": 0.64, + "Astronomy": 0.408, + "Business Ethics": 0.48, + "Clinical Knowledge": 0.453, + "Conceptual Physics": 0.434, + "Electrical Engineering": 0.407, + "Elementary Mathematics": 0.254, + "Formal Logic": 0.27, + "High School World History": 0.662, + "Human Sexuality": 0.557, + "International Law": 0.628, + "Logical Fallacies": 0.466, + "Machine Learning": 0.402, + "Management": 0.563, + "Marketing": 0.697, + "Medical Genetics": 0.53, + "Miscellaneous": 0.632, + "Moral Scenarios": 0.238, + "Nutrition": 0.497, + "Prehistory": 0.503, + "Public Relations": 0.509, + "Security Studies": 0.433, + "Sociology": 0.617, + "Virology": 0.392, + "World Religions": 0.713, + "Mean win rate": 0.681 + } + }, + { + "model_id": "meta/llama-3-70b", + "name": "Llama 3 70B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.793, + "Abstract Algebra": 0.43, + "Anatomy": 0.785, + "College Physics": 0.529, + "Computer Security": 0.85, + "Econometrics": 0.693, + "Global Facts": 0.49, + "Jurisprudence": 0.861, + "Philosophy": 0.865, + "Professional Psychology": 0.871, + "Us Foreign Policy": 0.94, + "Astronomy": 0.921, + "Business Ethics": 0.83, + "Clinical Knowledge": 0.845, + "Conceptual Physics": 0.838, + "Electrical Engineering": 0.766, + "Elementary Mathematics": 0.632, + "Formal Logic": 0.651, + "High School World History": 0.941, + "Human Sexuality": 0.878, + "International Law": 0.901, + "Logical Fallacies": 0.865, + "Machine Learning": 0.714, + "Management": 0.913, + "Marketing": 0.94, + "Medical Genetics": 0.89, + "Miscellaneous": 0.917, + "Moral Scenarios": 0.598, + "Nutrition": 0.876, + "Prehistory": 0.91, + "Public Relations": 0.727, + "Security Studies": 0.833, + "Sociology": 0.93, + "Virology": 0.59, + "World Religions": 0.906, + "Mean win rate": 0.524 + } + }, + { + "model_id": "meta/llama-3-8b", + "name": "Llama 3 8B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.668, + "Abstract Algebra": 0.33, + "Anatomy": 0.696, + "College Physics": 0.451, + "Computer Security": 0.8, + "Econometrics": 0.518, + "Global Facts": 0.34, + "Jurisprudence": 0.741, + "Philosophy": 0.743, + "Professional Psychology": 0.711, + "Us Foreign Policy": 0.88, + "Astronomy": 0.711, + "Business Ethics": 0.65, + "Clinical Knowledge": 0.751, + "Conceptual Physics": 0.557, + "Electrical Engineering": 0.669, + "Elementary Mathematics": 0.426, + "Formal Logic": 0.468, + "High School World History": 0.823, + "Human Sexuality": 0.748, + "International Law": 0.843, + "Logical Fallacies": 0.755, + "Machine Learning": 0.545, + "Management": 0.874, + "Marketing": 0.885, + "Medical Genetics": 0.83, + "Miscellaneous": 0.831, + "Moral Scenarios": 0.416, + "Nutrition": 0.761, + "Prehistory": 0.738, + "Public Relations": 0.736, + "Security Studies": 0.771, + "Sociology": 0.866, + "Virology": 0.566, + "World Religions": 0.819, + "Mean win rate": 0.733 + } + }, + { + "model_id": "meta/llama-3.1-405b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 405B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.845, + "Abstract Algebra": 0.7, + "Anatomy": 0.822, + "College Physics": 0.696, + "Computer Security": 0.81, + "Econometrics": 0.746, + "Global Facts": 0.71, + "Jurisprudence": 0.87, + "Philosophy": 0.878, + "Professional Psychology": 0.861, + "Us Foreign Policy": 0.94, + "Astronomy": 0.921, + "Business Ethics": 0.81, + "Clinical Knowledge": 0.879, + "Conceptual Physics": 0.877, + "Electrical Engineering": 0.821, + "Elementary Mathematics": 0.828, + "Formal Logic": 0.698, + "High School World History": 0.941, + "Human Sexuality": 0.855, + "International Law": 0.95, + "Logical Fallacies": 0.92, + "Machine Learning": 0.795, + "Management": 0.893, + "Marketing": 0.962, + "Medical Genetics": 0.93, + "Miscellaneous": 0.939, + "Moral Scenarios": 0.876, + "Nutrition": 0.928, + "Prehistory": 0.929, + "Public Relations": 0.818, + "Security Studies": 0.857, + "Sociology": 0.94, + "Virology": 0.572, + "World Religions": 0.906, + "Mean win rate": 0.33 + } + }, + { + "model_id": "meta/llama-3.1-70b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 70B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.801, + "Abstract Algebra": 0.55, + "Anatomy": 0.8, + "College Physics": 0.559, + "Computer Security": 0.8, + "Econometrics": 0.675, + "Global Facts": 0.61, + "Jurisprudence": 0.889, + "Philosophy": 0.833, + "Professional Psychology": 0.846, + "Us Foreign Policy": 0.93, + "Astronomy": 0.908, + "Business Ethics": 0.72, + "Clinical Knowledge": 0.845, + "Conceptual Physics": 0.834, + "Electrical Engineering": 0.745, + "Elementary Mathematics": 0.701, + "Formal Logic": 0.675, + "High School World History": 0.937, + "Human Sexuality": 0.855, + "International Law": 0.926, + "Logical Fallacies": 0.84, + "Machine Learning": 0.696, + "Management": 0.913, + "Marketing": 0.936, + "Medical Genetics": 0.93, + "Miscellaneous": 0.913, + "Moral Scenarios": 0.834, + "Nutrition": 0.889, + "Prehistory": 0.88, + "Public Relations": 0.709, + "Security Studies": 0.849, + "Sociology": 0.92, + "Virology": 0.578, + "World Religions": 0.895, + "Mean win rate": 0.021 + } + }, + { + "model_id": "meta/llama-3.1-8b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 8B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.561, + "Abstract Algebra": 0.26, + "Anatomy": 0.459, + "College Physics": 0.363, + "Computer Security": 0.71, + "Econometrics": 0.351, + "Global Facts": 0.26, + "Jurisprudence": 0.731, + "Philosophy": 0.64, + "Professional Psychology": 0.649, + "Us Foreign Policy": 0.79, + "Astronomy": 0.645, + "Business Ethics": 0.65, + "Clinical Knowledge": 0.615, + "Conceptual Physics": 0.528, + "Electrical Engineering": 0.441, + "Elementary Mathematics": 0.429, + "Formal Logic": 0.444, + "High School World History": 0.515, + "Human Sexuality": 0.733, + "International Law": 0.694, + "Logical Fallacies": 0.742, + "Machine Learning": 0.384, + "Management": 0.709, + "Marketing": 0.833, + "Medical Genetics": 0.66, + "Miscellaneous": 0.653, + "Moral Scenarios": 0.368, + "Nutrition": 0.712, + "Prehistory": 0.728, + "Public Relations": 0.664, + "Security Studies": 0.576, + "Sociology": 0.701, + "Virology": 0.446, + "World Religions": 0.789, + "Mean win rate": 0.475 + } + }, + { + "model_id": "meta/llama-3.2-11b-vision-instruct-turbo", + "name": "Llama 3.2 Vision Instruct Turbo 11B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.565, + "Abstract Algebra": 0.28, + "Anatomy": 0.533, + "College Physics": 0.333, + "Computer Security": 0.71, + "Econometrics": 0.395, + "Global Facts": 0.25, + "Jurisprudence": 0.722, + "Philosophy": 0.646, + "Professional Psychology": 0.649, + "Us Foreign Policy": 0.78, + "Astronomy": 0.671, + "Business Ethics": 0.64, + "Clinical Knowledge": 0.638, + "Conceptual Physics": 0.536, + "Electrical Engineering": 0.51, + "Elementary Mathematics": 0.458, + "Formal Logic": 0.46, + "High School World History": 0.502, + "Human Sexuality": 0.763, + "International Law": 0.711, + "Logical Fallacies": 0.742, + "Machine Learning": 0.375, + "Management": 0.728, + "Marketing": 0.838, + "Medical Genetics": 0.7, + "Miscellaneous": 0.644, + "Moral Scenarios": 0.328, + "Nutrition": 0.752, + "Prehistory": 0.744, + "Public Relations": 0.645, + "Security Studies": 0.567, + "Sociology": 0.627, + "Virology": 0.446, + "World Religions": 0.696, + "Mean win rate": 0.897 + } + }, + { + "model_id": "meta/llama-3.2-90b-vision-instruct-turbo", + "name": "Llama 3.2 Vision Instruct Turbo 90B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.803, + "Abstract Algebra": 0.52, + "Anatomy": 0.8, + "College Physics": 0.539, + "Computer Security": 0.81, + "Econometrics": 0.684, + "Global Facts": 0.6, + "Jurisprudence": 0.88, + "Philosophy": 0.839, + "Professional Psychology": 0.843, + "Us Foreign Policy": 0.93, + "Astronomy": 0.921, + "Business Ethics": 0.76, + "Clinical Knowledge": 0.845, + "Conceptual Physics": 0.826, + "Electrical Engineering": 0.759, + "Elementary Mathematics": 0.688, + "Formal Logic": 0.683, + "High School World History": 0.941, + "Human Sexuality": 0.87, + "International Law": 0.934, + "Logical Fallacies": 0.834, + "Machine Learning": 0.688, + "Management": 0.913, + "Marketing": 0.944, + "Medical Genetics": 0.92, + "Miscellaneous": 0.913, + "Moral Scenarios": 0.841, + "Nutrition": 0.889, + "Prehistory": 0.886, + "Public Relations": 0.718, + "Security Studies": 0.853, + "Sociology": 0.92, + "Virology": 0.584, + "World Religions": 0.901, + "Mean win rate": 0.773 + } + }, + { + "model_id": "meta/llama-3.3-70b-instruct-turbo", + "name": "Llama 3.3 Instruct Turbo 70B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.791, + "Abstract Algebra": 0.5, + "Anatomy": 0.778, + "College Physics": 0.52, + "Computer Security": 0.8, + "Econometrics": 0.719, + "Global Facts": 0.58, + "Jurisprudence": 0.87, + "Philosophy": 0.83, + "Professional Psychology": 0.845, + "Us Foreign Policy": 0.93, + "Astronomy": 0.888, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.83, + "Conceptual Physics": 0.821, + "Electrical Engineering": 0.745, + "Elementary Mathematics": 0.672, + "Formal Logic": 0.675, + "High School World History": 0.907, + "Human Sexuality": 0.855, + "International Law": 0.884, + "Logical Fallacies": 0.816, + "Machine Learning": 0.714, + "Management": 0.903, + "Marketing": 0.927, + "Medical Genetics": 0.9, + "Miscellaneous": 0.914, + "Moral Scenarios": 0.698, + "Nutrition": 0.882, + "Prehistory": 0.895, + "Public Relations": 0.727, + "Security Studies": 0.845, + "Sociology": 0.92, + "Virology": 0.566, + "World Religions": 0.883, + "Mean win rate": 0.722 + } + }, + { + "model_id": "microsoft/phi-2", + "name": "Phi-2", + "developer": "microsoft", + "scores": { + "MMLU All Subjects": 0.584, + "Abstract Algebra": 0.31, + "Anatomy": 0.437, + "College Physics": 0.382, + "Computer Security": 0.73, + "Econometrics": 0.342, + "Global Facts": 0.35, + "Jurisprudence": 0.694, + "Philosophy": 0.598, + "Professional Psychology": 0.572, + "Us Foreign Policy": 0.78, + "Astronomy": 0.605, + "Business Ethics": 0.59, + "Clinical Knowledge": 0.619, + "Conceptual Physics": 0.519, + "Electrical Engineering": 0.545, + "Elementary Mathematics": 0.463, + "Formal Logic": 0.389, + "High School World History": 0.73, + "Human Sexuality": 0.733, + "International Law": 0.752, + "Logical Fallacies": 0.767, + "Machine Learning": 0.5, + "Management": 0.748, + "Marketing": 0.833, + "Medical Genetics": 0.62, + "Miscellaneous": 0.688, + "Moral Scenarios": 0.231, + "Nutrition": 0.627, + "Prehistory": 0.605, + "Public Relations": 0.673, + "Security Studies": 0.702, + "Sociology": 0.816, + "Virology": 0.47, + "World Religions": 0.702, + "Mean win rate": 0.824 + } + }, + { + "model_id": "microsoft/phi-3-medium-4k-instruct", + "name": "Phi-3 14B", + "developer": "microsoft", + "scores": { + "MMLU All Subjects": 0.775, + "Abstract Algebra": 0.5, + "Anatomy": 0.719, + "College Physics": 0.529, + "Computer Security": 0.79, + "Econometrics": 0.614, + "Global Facts": 0.5, + "Jurisprudence": 0.88, + "Philosophy": 0.804, + "Professional Psychology": 0.835, + "Us Foreign Policy": 0.95, + "Astronomy": 0.849, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.826, + "Conceptual Physics": 0.809, + "Electrical Engineering": 0.683, + "Elementary Mathematics": 0.709, + "Formal Logic": 0.587, + "High School World History": 0.903, + "Human Sexuality": 0.863, + "International Law": 0.934, + "Logical Fallacies": 0.828, + "Machine Learning": 0.696, + "Management": 0.864, + "Marketing": 0.919, + "Medical Genetics": 0.91, + "Miscellaneous": 0.894, + "Moral Scenarios": 0.639, + "Nutrition": 0.837, + "Prehistory": 0.867, + "Public Relations": 0.755, + "Security Studies": 0.829, + "Sociology": 0.891, + "Virology": 0.554, + "World Religions": 0.865, + "Mean win rate": 0.015 + } + }, + { + "model_id": "microsoft/phi-3-small-8k-instruct", + "name": "Phi-3 7B", + "developer": "microsoft", + "scores": { + "MMLU All Subjects": 0.757, + "Abstract Algebra": 0.44, + "Anatomy": 0.726, + "College Physics": 0.559, + "Computer Security": 0.77, + "Econometrics": 0.596, + "Global Facts": 0.52, + "Jurisprudence": 0.843, + "Philosophy": 0.82, + "Professional Psychology": 0.835, + "Us Foreign Policy": 0.95, + "Astronomy": 0.849, + "Business Ethics": 0.77, + "Clinical Knowledge": 0.83, + "Conceptual Physics": 0.779, + "Electrical Engineering": 0.69, + "Elementary Mathematics": 0.619, + "Formal Logic": 0.595, + "High School World History": 0.848, + "Human Sexuality": 0.817, + "International Law": 0.851, + "Logical Fallacies": 0.81, + "Machine Learning": 0.652, + "Management": 0.903, + "Marketing": 0.897, + "Medical Genetics": 0.84, + "Miscellaneous": 0.871, + "Moral Scenarios": 0.711, + "Nutrition": 0.833, + "Prehistory": 0.858, + "Public Relations": 0.727, + "Security Studies": 0.804, + "Sociology": 0.886, + "Virology": 0.548, + "World Religions": 0.825, + "Mean win rate": 0.708 + } + }, + { + "model_id": "mistralai/mistral-7b-instruct-v0.3", + "name": "Mistral Instruct v0.3 7B", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.599, + "Abstract Algebra": 0.27, + "Anatomy": 0.585, + "College Physics": 0.343, + "Computer Security": 0.7, + "Econometrics": 0.421, + "Global Facts": 0.33, + "Jurisprudence": 0.713, + "Philosophy": 0.659, + "Professional Psychology": 0.641, + "Us Foreign Policy": 0.79, + "Astronomy": 0.638, + "Business Ethics": 0.57, + "Clinical Knowledge": 0.687, + "Conceptual Physics": 0.549, + "Electrical Engineering": 0.572, + "Elementary Mathematics": 0.402, + "Formal Logic": 0.397, + "High School World History": 0.759, + "Human Sexuality": 0.702, + "International Law": 0.76, + "Logical Fallacies": 0.712, + "Machine Learning": 0.455, + "Management": 0.767, + "Marketing": 0.842, + "Medical Genetics": 0.75, + "Miscellaneous": 0.785, + "Moral Scenarios": 0.393, + "Nutrition": 0.676, + "Prehistory": 0.673, + "Public Relations": 0.636, + "Security Studies": 0.682, + "Sociology": 0.806, + "Virology": 0.47, + "World Religions": 0.825, + "Mean win rate": 0.509 + } + }, + { + "model_id": "mistralai/mistral-7b-v0.1", + "name": "Mistral v0.1 7B", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.566, + "Abstract Algebra": 0.25, + "Anatomy": 0.467, + "College Physics": 0.314, + "Computer Security": 0.69, + "Econometrics": 0.351, + "Global Facts": 0.29, + "Jurisprudence": 0.667, + "Philosophy": 0.63, + "Professional Psychology": 0.578, + "Us Foreign Policy": 0.79, + "Astronomy": 0.599, + "Business Ethics": 0.56, + "Clinical Knowledge": 0.653, + "Conceptual Physics": 0.451, + "Electrical Engineering": 0.538, + "Elementary Mathematics": 0.32, + "Formal Logic": 0.365, + "High School World History": 0.726, + "Human Sexuality": 0.702, + "International Law": 0.76, + "Logical Fallacies": 0.693, + "Machine Learning": 0.438, + "Management": 0.709, + "Marketing": 0.833, + "Medical Genetics": 0.68, + "Miscellaneous": 0.72, + "Moral Scenarios": 0.33, + "Nutrition": 0.657, + "Prehistory": 0.642, + "Public Relations": 0.6, + "Security Studies": 0.731, + "Sociology": 0.831, + "Virology": 0.44, + "World Religions": 0.789, + "Mean win rate": 0.213 + } + }, + { + "model_id": "mistralai/mistral-large-2402", + "name": "Mistral Large 2402", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.688, + "Abstract Algebra": 0.45, + "Anatomy": 0.674, + "College Physics": 0.373, + "Computer Security": 0.8, + "Econometrics": 0.64, + "Global Facts": 0.34, + "Jurisprudence": 0.815, + "Philosophy": 0.794, + "Professional Psychology": 0.809, + "Us Foreign Policy": 0.92, + "Astronomy": 0.842, + "Business Ethics": 0.67, + "Clinical Knowledge": 0.751, + "Conceptual Physics": 0.574, + "Electrical Engineering": 0.545, + "Elementary Mathematics": 0.508, + "Formal Logic": 0.532, + "High School World History": 0.886, + "Human Sexuality": 0.847, + "International Law": 0.868, + "Logical Fallacies": 0.81, + "Machine Learning": 0.562, + "Management": 0.854, + "Marketing": 0.897, + "Medical Genetics": 0.74, + "Miscellaneous": 0.9, + "Moral Scenarios": 0.579, + "Nutrition": 0.791, + "Prehistory": 0.904, + "Public Relations": 0.709, + "Security Studies": 0.824, + "Sociology": 0.93, + "Virology": 0.554, + "World Religions": 0.883, + "Mean win rate": 0.464 + } + }, + { + "model_id": "mistralai/mistral-large-2407", + "name": "Mistral Large 2 2407", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.8, + "Abstract Algebra": 0.7, + "Anatomy": 0.785, + "College Physics": 0.559, + "Computer Security": 0.81, + "Econometrics": 0.693, + "Global Facts": 0.56, + "Jurisprudence": 0.861, + "Philosophy": 0.826, + "Professional Psychology": 0.861, + "Us Foreign Policy": 0.9, + "Astronomy": 0.921, + "Business Ethics": 0.79, + "Clinical Knowledge": 0.864, + "Conceptual Physics": 0.864, + "Electrical Engineering": 0.793, + "Elementary Mathematics": 0.799, + "Formal Logic": 0.579, + "High School World History": 0.92, + "Human Sexuality": 0.924, + "International Law": 0.926, + "Logical Fallacies": 0.847, + "Machine Learning": 0.661, + "Management": 0.883, + "Marketing": 0.94, + "Medical Genetics": 0.9, + "Miscellaneous": 0.936, + "Moral Scenarios": 0.839, + "Nutrition": 0.827, + "Prehistory": 0.92, + "Public Relations": 0.764, + "Security Studies": 0.865, + "Sociology": 0.91, + "Virology": 0.59, + "World Religions": 0.865, + "Mean win rate": 0.24 + } + }, + { + "model_id": "mistralai/mistral-small-2402", + "name": "Mistral Small 2402", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.687, + "Abstract Algebra": 0.26, + "Anatomy": 0.674, + "College Physics": 0.402, + "Computer Security": 0.77, + "Econometrics": 0.614, + "Global Facts": 0.45, + "Jurisprudence": 0.833, + "Philosophy": 0.765, + "Professional Psychology": 0.768, + "Us Foreign Policy": 0.89, + "Astronomy": 0.77, + "Business Ethics": 0.71, + "Clinical Knowledge": 0.766, + "Conceptual Physics": 0.685, + "Electrical Engineering": 0.628, + "Elementary Mathematics": 0.415, + "Formal Logic": 0.516, + "High School World History": 0.857, + "Human Sexuality": 0.824, + "International Law": 0.826, + "Logical Fallacies": 0.804, + "Machine Learning": 0.562, + "Management": 0.786, + "Marketing": 0.906, + "Medical Genetics": 0.75, + "Miscellaneous": 0.844, + "Moral Scenarios": 0.575, + "Nutrition": 0.761, + "Prehistory": 0.802, + "Public Relations": 0.773, + "Security Studies": 0.788, + "Sociology": 0.871, + "Virology": 0.542, + "World Religions": 0.848, + "Mean win rate": 0.54 + } + }, + { + "model_id": "mistralai/mixtral-8x22b", + "name": "Mixtral 8x22B", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.778, + "Abstract Algebra": 0.48, + "Anatomy": 0.741, + "College Physics": 0.569, + "Computer Security": 0.84, + "Econometrics": 0.667, + "Global Facts": 0.56, + "Jurisprudence": 0.852, + "Philosophy": 0.842, + "Professional Psychology": 0.845, + "Us Foreign Policy": 0.95, + "Astronomy": 0.882, + "Business Ethics": 0.74, + "Clinical Knowledge": 0.819, + "Conceptual Physics": 0.796, + "Electrical Engineering": 0.766, + "Elementary Mathematics": 0.622, + "Formal Logic": 0.627, + "High School World History": 0.895, + "Human Sexuality": 0.885, + "International Law": 0.917, + "Logical Fallacies": 0.877, + "Machine Learning": 0.661, + "Management": 0.883, + "Marketing": 0.915, + "Medical Genetics": 0.85, + "Miscellaneous": 0.899, + "Moral Scenarios": 0.646, + "Nutrition": 0.866, + "Prehistory": 0.87, + "Public Relations": 0.755, + "Security Studies": 0.865, + "Sociology": 0.92, + "Virology": 0.596, + "World Religions": 0.901, + "Mean win rate": 0.598 + } + }, + { + "model_id": "mistralai/mixtral-8x7b-32kseqlen", + "name": "Mixtral 8x7B 32K seqlen", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.717, + "Abstract Algebra": 0.38, + "Anatomy": 0.696, + "College Physics": 0.51, + "Computer Security": 0.81, + "Econometrics": 0.605, + "Global Facts": 0.46, + "Jurisprudence": 0.833, + "Philosophy": 0.797, + "Professional Psychology": 0.779, + "Us Foreign Policy": 0.93, + "Astronomy": 0.829, + "Business Ethics": 0.72, + "Clinical Knowledge": 0.785, + "Conceptual Physics": 0.681, + "Electrical Engineering": 0.676, + "Elementary Mathematics": 0.476, + "Formal Logic": 0.532, + "High School World History": 0.886, + "Human Sexuality": 0.87, + "International Law": 0.86, + "Logical Fallacies": 0.767, + "Machine Learning": 0.509, + "Management": 0.845, + "Marketing": 0.923, + "Medical Genetics": 0.76, + "Miscellaneous": 0.881, + "Moral Scenarios": 0.444, + "Nutrition": 0.83, + "Prehistory": 0.849, + "Public Relations": 0.682, + "Security Studies": 0.792, + "Sociology": 0.871, + "Virology": 0.506, + "World Religions": 0.871, + "Mean win rate": 0.689 + } + }, + { + "model_id": "mistralai/open-mistral-nemo-2407", + "name": "Mistral NeMo 2402", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.653, + "Abstract Algebra": 0.29, + "Anatomy": 0.607, + "College Physics": 0.373, + "Computer Security": 0.81, + "Econometrics": 0.561, + "Global Facts": 0.4, + "Jurisprudence": 0.796, + "Philosophy": 0.733, + "Professional Psychology": 0.588, + "Us Foreign Policy": 0.89, + "Astronomy": 0.691, + "Business Ethics": 0.49, + "Clinical Knowledge": 0.736, + "Conceptual Physics": 0.647, + "Electrical Engineering": 0.531, + "Elementary Mathematics": 0.439, + "Formal Logic": 0.405, + "High School World History": 0.848, + "Human Sexuality": 0.702, + "International Law": 0.769, + "Logical Fallacies": 0.791, + "Machine Learning": 0.402, + "Management": 0.796, + "Marketing": 0.889, + "Medical Genetics": 0.78, + "Miscellaneous": 0.861, + "Moral Scenarios": 0.381, + "Nutrition": 0.709, + "Prehistory": 0.765, + "Public Relations": 0.718, + "Security Studies": 0.771, + "Sociology": 0.726, + "Virology": 0.56, + "World Religions": 0.789, + "Mean win rate": 0.215 + } + }, + { + "model_id": "openai/gpt-3.5-turbo-0125", + "name": "GPT-3.5 Turbo 0125", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.673, + "Abstract Algebra": 0.31, + "Anatomy": 0.696, + "College Physics": 0.471, + "Computer Security": 0.78, + "Econometrics": 0.474, + "Global Facts": 0.39, + "Jurisprudence": 0.806, + "Philosophy": 0.746, + "Professional Psychology": 0.722, + "Us Foreign Policy": 0.89, + "Astronomy": 0.75, + "Business Ethics": 0.75, + "Clinical Knowledge": 0.755, + "Conceptual Physics": 0.634, + "Electrical Engineering": 0.669, + "Elementary Mathematics": 0.534, + "Formal Logic": 0.444, + "High School World History": 0.819, + "Human Sexuality": 0.779, + "International Law": 0.81, + "Logical Fallacies": 0.779, + "Machine Learning": 0.455, + "Management": 0.835, + "Marketing": 0.91, + "Medical Genetics": 0.73, + "Miscellaneous": 0.89, + "Moral Scenarios": 0.355, + "Nutrition": 0.748, + "Prehistory": 0.735, + "Public Relations": 0.727, + "Security Studies": 0.751, + "Sociology": 0.861, + "Virology": 0.536, + "World Religions": 0.842, + "Mean win rate": 0.493 + } + }, + { + "model_id": "openai/gpt-3.5-turbo-0613", + "name": "gpt-3.5-turbo-0613", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.689, + "Abstract Algebra": 0.38, + "Anatomy": 0.659, + "College Physics": 0.461, + "Computer Security": 0.81, + "Econometrics": 0.5, + "Global Facts": 0.37, + "Jurisprudence": 0.806, + "Philosophy": 0.759, + "Professional Psychology": 0.732, + "Us Foreign Policy": 0.88, + "Astronomy": 0.763, + "Business Ethics": 0.75, + "Clinical Knowledge": 0.777, + "Conceptual Physics": 0.613, + "Electrical Engineering": 0.648, + "Elementary Mathematics": 0.5, + "Formal Logic": 0.397, + "High School World History": 0.857, + "Human Sexuality": 0.786, + "International Law": 0.843, + "Logical Fallacies": 0.791, + "Machine Learning": 0.455, + "Management": 0.845, + "Marketing": 0.91, + "Medical Genetics": 0.8, + "Miscellaneous": 0.893, + "Moral Scenarios": 0.404, + "Nutrition": 0.758, + "Prehistory": 0.787, + "Public Relations": 0.745, + "Security Studies": 0.8, + "Sociology": 0.871, + "Virology": 0.542, + "World Religions": 0.836, + "Mean win rate": 0.589 + } + }, + { + "model_id": "openai/gpt-4-0613", + "name": "GPT-4 0613", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.824, + "Abstract Algebra": 0.63, + "Anatomy": 0.8, + "College Physics": 0.627, + "Computer Security": 0.86, + "Econometrics": 0.684, + "Global Facts": 0.62, + "Jurisprudence": 0.889, + "Philosophy": 0.859, + "Professional Psychology": 0.891, + "Us Foreign Policy": 0.95, + "Astronomy": 0.934, + "Business Ethics": 0.79, + "Clinical Knowledge": 0.845, + "Conceptual Physics": 0.868, + "Electrical Engineering": 0.786, + "Elementary Mathematics": 0.807, + "Formal Logic": 0.643, + "High School World History": 0.945, + "Human Sexuality": 0.908, + "International Law": 0.917, + "Logical Fallacies": 0.871, + "Machine Learning": 0.759, + "Management": 0.932, + "Marketing": 0.962, + "Medical Genetics": 0.94, + "Miscellaneous": 0.949, + "Moral Scenarios": 0.902, + "Nutrition": 0.892, + "Prehistory": 0.926, + "Public Relations": 0.745, + "Security Studies": 0.861, + "Sociology": 0.93, + "Virology": 0.596, + "World Religions": 0.877, + "Mean win rate": 0.517 + } + }, + { + "model_id": "openai/gpt-4-1106-preview", + "name": "GPT-4 Turbo 1106 preview", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.796, + "Abstract Algebra": 0.53, + "Anatomy": 0.807, + "College Physics": 0.402, + "Computer Security": 0.86, + "Econometrics": 0.675, + "Global Facts": 0.58, + "Jurisprudence": 0.889, + "Philosophy": 0.852, + "Professional Psychology": 0.887, + "Us Foreign Policy": 0.96, + "Astronomy": 0.941, + "Business Ethics": 0.78, + "Clinical Knowledge": 0.864, + "Conceptual Physics": 0.894, + "Electrical Engineering": 0.772, + "Elementary Mathematics": 0.638, + "Formal Logic": 0.651, + "High School World History": 0.958, + "Human Sexuality": 0.908, + "International Law": 0.926, + "Logical Fallacies": 0.865, + "Machine Learning": 0.723, + "Management": 0.913, + "Marketing": 0.932, + "Medical Genetics": 0.93, + "Miscellaneous": 0.946, + "Moral Scenarios": 0.816, + "Nutrition": 0.879, + "Prehistory": 0.917, + "Public Relations": 0.782, + "Security Studies": 0.841, + "Sociology": 0.925, + "Virology": 0.59, + "World Religions": 0.854, + "Mean win rate": 0.416 + } + }, + { + "model_id": "openai/gpt-4-turbo-2024-04-09", + "name": "GPT-4 Turbo 2024-04-09", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.813, + "Abstract Algebra": 0.56, + "Anatomy": 0.822, + "College Physics": 0.539, + "Computer Security": 0.83, + "Econometrics": 0.675, + "Global Facts": 0.58, + "Jurisprudence": 0.88, + "Philosophy": 0.868, + "Professional Psychology": 0.873, + "Us Foreign Policy": 0.96, + "Astronomy": 0.941, + "Business Ethics": 0.82, + "Clinical Knowledge": 0.83, + "Conceptual Physics": 0.894, + "Electrical Engineering": 0.752, + "Elementary Mathematics": 0.72, + "Formal Logic": 0.706, + "High School World History": 0.941, + "Human Sexuality": 0.901, + "International Law": 0.942, + "Logical Fallacies": 0.871, + "Machine Learning": 0.741, + "Management": 0.883, + "Marketing": 0.949, + "Medical Genetics": 0.92, + "Miscellaneous": 0.945, + "Moral Scenarios": 0.803, + "Nutrition": 0.892, + "Prehistory": 0.92, + "Public Relations": 0.755, + "Security Studies": 0.8, + "Sociology": 0.915, + "Virology": 0.602, + "World Religions": 0.848, + "Mean win rate": 0.351 + } + }, + { + "model_id": "openai/gpt-4o-2024-05-13", + "name": "GPT-4o 2024-05-13", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.842, + "Abstract Algebra": 0.66, + "Anatomy": 0.911, + "College Physics": 0.686, + "Computer Security": 0.85, + "Econometrics": 0.693, + "Global Facts": 0.64, + "Jurisprudence": 0.898, + "Philosophy": 0.9, + "Professional Psychology": 0.905, + "Us Foreign Policy": 0.96, + "Astronomy": 0.941, + "Business Ethics": 0.85, + "Clinical Knowledge": 0.894, + "Conceptual Physics": 0.911, + "Electrical Engineering": 0.807, + "Elementary Mathematics": 0.741, + "Formal Logic": 0.683, + "High School World History": 0.945, + "Human Sexuality": 0.908, + "International Law": 0.934, + "Logical Fallacies": 0.883, + "Machine Learning": 0.768, + "Management": 0.942, + "Marketing": 0.936, + "Medical Genetics": 0.96, + "Miscellaneous": 0.954, + "Moral Scenarios": 0.841, + "Nutrition": 0.899, + "Prehistory": 0.938, + "Public Relations": 0.809, + "Security Studies": 0.837, + "Sociology": 0.94, + "Virology": 0.596, + "World Religions": 0.889, + "Mean win rate": 0.671 + } + }, + { + "model_id": "openai/gpt-4o-2024-08-06", + "name": "GPT-4o 2024-08-06", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.843, + "Abstract Algebra": 0.58, + "Anatomy": 0.911, + "College Physics": 0.686, + "Computer Security": 0.85, + "Econometrics": 0.711, + "Global Facts": 0.69, + "Jurisprudence": 0.907, + "Philosophy": 0.894, + "Professional Psychology": 0.899, + "Us Foreign Policy": 0.95, + "Astronomy": 0.947, + "Business Ethics": 0.89, + "Clinical Knowledge": 0.894, + "Conceptual Physics": 0.923, + "Electrical Engineering": 0.793, + "Elementary Mathematics": 0.775, + "Formal Logic": 0.675, + "High School World History": 0.941, + "Human Sexuality": 0.901, + "International Law": 0.942, + "Logical Fallacies": 0.902, + "Machine Learning": 0.777, + "Management": 0.913, + "Marketing": 0.94, + "Medical Genetics": 0.98, + "Miscellaneous": 0.958, + "Moral Scenarios": 0.802, + "Nutrition": 0.905, + "Prehistory": 0.935, + "Public Relations": 0.782, + "Security Studies": 0.833, + "Sociology": 0.945, + "Virology": 0.578, + "World Religions": 0.883, + "Mean win rate": 0.52 + } + }, + { + "model_id": "openai/gpt-4o-mini-2024-07-18", + "name": "GPT-4o mini 2024-07-18", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.767, + "Abstract Algebra": 0.42, + "Anatomy": 0.77, + "College Physics": 0.559, + "Computer Security": 0.85, + "Econometrics": 0.649, + "Global Facts": 0.45, + "Jurisprudence": 0.87, + "Philosophy": 0.772, + "Professional Psychology": 0.833, + "Us Foreign Policy": 0.91, + "Astronomy": 0.849, + "Business Ethics": 0.79, + "Clinical Knowledge": 0.845, + "Conceptual Physics": 0.791, + "Electrical Engineering": 0.731, + "Elementary Mathematics": 0.651, + "Formal Logic": 0.556, + "High School World History": 0.903, + "Human Sexuality": 0.863, + "International Law": 0.926, + "Logical Fallacies": 0.871, + "Machine Learning": 0.616, + "Management": 0.845, + "Marketing": 0.927, + "Medical Genetics": 0.89, + "Miscellaneous": 0.913, + "Moral Scenarios": 0.485, + "Nutrition": 0.827, + "Prehistory": 0.833, + "Public Relations": 0.791, + "Security Studies": 0.788, + "Sociology": 0.9, + "Virology": 0.536, + "World Religions": 0.86, + "Mean win rate": 0.774 + } + }, + { + "model_id": "qwen/qwen1.5-110b-chat", + "name": "Qwen1.5 Chat 110B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.768, + "Abstract Algebra": 0.57, + "Anatomy": 0.696, + "College Physics": 0.51, + "Computer Security": 0.82, + "Econometrics": 0.64, + "Global Facts": 0.51, + "Jurisprudence": 0.833, + "Philosophy": 0.823, + "Professional Psychology": 0.82, + "Us Foreign Policy": 0.87, + "Astronomy": 0.901, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.766, + "Conceptual Physics": 0.838, + "Electrical Engineering": 0.752, + "Elementary Mathematics": 0.669, + "Formal Logic": 0.643, + "High School World History": 0.903, + "Human Sexuality": 0.855, + "International Law": 0.876, + "Logical Fallacies": 0.828, + "Machine Learning": 0.634, + "Management": 0.835, + "Marketing": 0.919, + "Medical Genetics": 0.85, + "Miscellaneous": 0.934, + "Moral Scenarios": 0.783, + "Nutrition": 0.804, + "Prehistory": 0.867, + "Public Relations": 0.773, + "Security Studies": 0.735, + "Sociology": 0.866, + "Virology": 0.542, + "World Religions": 0.871, + "Mean win rate": 0.875 + } + }, + { + "model_id": "qwen/qwen1.5-14b", + "name": "Qwen1.5 14B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.686, + "Abstract Algebra": 0.4, + "Anatomy": 0.637, + "College Physics": 0.48, + "Computer Security": 0.84, + "Econometrics": 0.561, + "Global Facts": 0.49, + "Jurisprudence": 0.769, + "Philosophy": 0.717, + "Professional Psychology": 0.699, + "Us Foreign Policy": 0.87, + "Astronomy": 0.724, + "Business Ethics": 0.75, + "Clinical Knowledge": 0.736, + "Conceptual Physics": 0.694, + "Electrical Engineering": 0.683, + "Elementary Mathematics": 0.603, + "Formal Logic": 0.492, + "High School World History": 0.84, + "Human Sexuality": 0.756, + "International Law": 0.826, + "Logical Fallacies": 0.736, + "Machine Learning": 0.509, + "Management": 0.816, + "Marketing": 0.893, + "Medical Genetics": 0.76, + "Miscellaneous": 0.835, + "Moral Scenarios": 0.368, + "Nutrition": 0.742, + "Prehistory": 0.71, + "Public Relations": 0.655, + "Security Studies": 0.8, + "Sociology": 0.841, + "Virology": 0.458, + "World Religions": 0.842, + "Mean win rate": 0.796 + } + }, + { + "model_id": "qwen/qwen1.5-32b", + "name": "Qwen1.5 32B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.744, + "Abstract Algebra": 0.4, + "Anatomy": 0.644, + "College Physics": 0.51, + "Computer Security": 0.77, + "Econometrics": 0.561, + "Global Facts": 0.47, + "Jurisprudence": 0.843, + "Philosophy": 0.826, + "Professional Psychology": 0.75, + "Us Foreign Policy": 0.91, + "Astronomy": 0.855, + "Business Ethics": 0.77, + "Clinical Knowledge": 0.781, + "Conceptual Physics": 0.766, + "Electrical Engineering": 0.731, + "Elementary Mathematics": 0.685, + "Formal Logic": 0.524, + "High School World History": 0.869, + "Human Sexuality": 0.847, + "International Law": 0.884, + "Logical Fallacies": 0.822, + "Machine Learning": 0.616, + "Management": 0.874, + "Marketing": 0.936, + "Medical Genetics": 0.85, + "Miscellaneous": 0.884, + "Moral Scenarios": 0.545, + "Nutrition": 0.81, + "Prehistory": 0.83, + "Public Relations": 0.664, + "Security Studies": 0.829, + "Sociology": 0.881, + "Virology": 0.578, + "World Religions": 0.854, + "Mean win rate": 0.624 + } + }, + { + "model_id": "qwen/qwen1.5-72b", + "name": "Qwen1.5 72B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.774, + "Abstract Algebra": 0.44, + "Anatomy": 0.733, + "College Physics": 0.559, + "Computer Security": 0.81, + "Econometrics": 0.544, + "Global Facts": 0.56, + "Jurisprudence": 0.824, + "Philosophy": 0.83, + "Professional Psychology": 0.809, + "Us Foreign Policy": 0.94, + "Astronomy": 0.868, + "Business Ethics": 0.79, + "Clinical Knowledge": 0.834, + "Conceptual Physics": 0.821, + "Electrical Engineering": 0.779, + "Elementary Mathematics": 0.696, + "Formal Logic": 0.556, + "High School World History": 0.899, + "Human Sexuality": 0.878, + "International Law": 0.909, + "Logical Fallacies": 0.853, + "Machine Learning": 0.67, + "Management": 0.854, + "Marketing": 0.949, + "Medical Genetics": 0.87, + "Miscellaneous": 0.921, + "Moral Scenarios": 0.669, + "Nutrition": 0.859, + "Prehistory": 0.88, + "Public Relations": 0.755, + "Security Studies": 0.824, + "Sociology": 0.9, + "Virology": 0.584, + "World Religions": 0.883, + "Mean win rate": 0.65 + } + }, + { + "model_id": "qwen/qwen1.5-7b", + "name": "Qwen1.5 7B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.626, + "Abstract Algebra": 0.39, + "Anatomy": 0.526, + "College Physics": 0.471, + "Computer Security": 0.76, + "Econometrics": 0.447, + "Global Facts": 0.4, + "Jurisprudence": 0.778, + "Philosophy": 0.691, + "Professional Psychology": 0.603, + "Us Foreign Policy": 0.84, + "Astronomy": 0.671, + "Business Ethics": 0.69, + "Clinical Knowledge": 0.691, + "Conceptual Physics": 0.579, + "Electrical Engineering": 0.572, + "Elementary Mathematics": 0.5, + "Formal Logic": 0.397, + "High School World History": 0.789, + "Human Sexuality": 0.695, + "International Law": 0.76, + "Logical Fallacies": 0.706, + "Machine Learning": 0.411, + "Management": 0.816, + "Marketing": 0.863, + "Medical Genetics": 0.69, + "Miscellaneous": 0.765, + "Moral Scenarios": 0.372, + "Nutrition": 0.696, + "Prehistory": 0.688, + "Public Relations": 0.627, + "Security Studies": 0.727, + "Sociology": 0.836, + "Virology": 0.488, + "World Religions": 0.778, + "Mean win rate": 0.843 + } + }, + { + "model_id": "qwen/qwen2-72b-instruct", + "name": "Qwen2 Instruct 72B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.824, + "Abstract Algebra": 0.67, + "Anatomy": 0.793, + "College Physics": 0.598, + "Computer Security": 0.85, + "Econometrics": 0.737, + "Global Facts": 0.58, + "Jurisprudence": 0.87, + "Philosophy": 0.859, + "Professional Psychology": 0.886, + "Us Foreign Policy": 0.94, + "Astronomy": 0.934, + "Business Ethics": 0.82, + "Clinical Knowledge": 0.868, + "Conceptual Physics": 0.872, + "Electrical Engineering": 0.793, + "Elementary Mathematics": 0.825, + "Formal Logic": 0.667, + "High School World History": 0.932, + "Human Sexuality": 0.893, + "International Law": 0.893, + "Logical Fallacies": 0.914, + "Machine Learning": 0.768, + "Management": 0.903, + "Marketing": 0.953, + "Medical Genetics": 0.9, + "Miscellaneous": 0.943, + "Moral Scenarios": 0.815, + "Nutrition": 0.902, + "Prehistory": 0.914, + "Public Relations": 0.745, + "Security Studies": 0.837, + "Sociology": 0.935, + "Virology": 0.56, + "World Religions": 0.848, + "Mean win rate": 0.826 + } + }, + { + "model_id": "qwen/qwen2.5-72b-instruct-turbo", + "name": "Qwen2.5 Instruct Turbo 72B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.834, + "Abstract Algebra": 0.68, + "Anatomy": 0.822, + "College Physics": 0.588, + "Computer Security": 0.86, + "Econometrics": 0.728, + "Global Facts": 0.61, + "Jurisprudence": 0.87, + "Philosophy": 0.839, + "Professional Psychology": 0.864, + "Us Foreign Policy": 0.96, + "Astronomy": 0.934, + "Business Ethics": 0.85, + "Clinical Knowledge": 0.872, + "Conceptual Physics": 0.885, + "Electrical Engineering": 0.8, + "Elementary Mathematics": 0.87, + "Formal Logic": 0.73, + "High School World History": 0.92, + "Human Sexuality": 0.878, + "International Law": 0.893, + "Logical Fallacies": 0.89, + "Machine Learning": 0.777, + "Management": 0.913, + "Marketing": 0.953, + "Medical Genetics": 0.92, + "Miscellaneous": 0.932, + "Moral Scenarios": 0.787, + "Nutrition": 0.886, + "Prehistory": 0.91, + "Public Relations": 0.782, + "Security Studies": 0.849, + "Sociology": 0.925, + "Virology": 0.584, + "World Religions": 0.901, + "Mean win rate": 0.548 + } + }, + { + "model_id": "qwen/qwen2.5-7b-instruct-turbo", + "name": "Qwen2.5 Instruct Turbo 7B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.729, + "Abstract Algebra": 0.49, + "Anatomy": 0.689, + "College Physics": 0.51, + "Computer Security": 0.79, + "Econometrics": 0.64, + "Global Facts": 0.42, + "Jurisprudence": 0.796, + "Philosophy": 0.746, + "Professional Psychology": 0.757, + "Us Foreign Policy": 0.86, + "Astronomy": 0.836, + "Business Ethics": 0.82, + "Clinical Knowledge": 0.785, + "Conceptual Physics": 0.736, + "Electrical Engineering": 0.717, + "Elementary Mathematics": 0.643, + "Formal Logic": 0.587, + "High School World History": 0.878, + "Human Sexuality": 0.794, + "International Law": 0.86, + "Logical Fallacies": 0.773, + "Machine Learning": 0.554, + "Management": 0.845, + "Marketing": 0.919, + "Medical Genetics": 0.85, + "Miscellaneous": 0.852, + "Moral Scenarios": 0.511, + "Nutrition": 0.778, + "Prehistory": 0.836, + "Public Relations": 0.709, + "Security Studies": 0.682, + "Sociology": 0.861, + "Virology": 0.578, + "World Religions": 0.83, + "Mean win rate": 0.887 + } + }, + { + "model_id": "snowflake/snowflake-arctic-instruct", + "name": "Arctic Instruct", + "developer": "snowflake", + "scores": { + "MMLU All Subjects": 0.677, + "Abstract Algebra": 0.35, + "Anatomy": 0.652, + "College Physics": 0.461, + "Computer Security": 0.84, + "Econometrics": 0.5, + "Global Facts": 0.39, + "Jurisprudence": 0.741, + "Philosophy": 0.752, + "Professional Psychology": 0.724, + "Us Foreign Policy": 0.88, + "Astronomy": 0.763, + "Business Ethics": 0.69, + "Clinical Knowledge": 0.781, + "Conceptual Physics": 0.634, + "Electrical Engineering": 0.662, + "Elementary Mathematics": 0.481, + "Formal Logic": 0.444, + "High School World History": 0.827, + "Human Sexuality": 0.847, + "International Law": 0.826, + "Logical Fallacies": 0.779, + "Machine Learning": 0.473, + "Management": 0.796, + "Marketing": 0.902, + "Medical Genetics": 0.76, + "Miscellaneous": 0.875, + "Moral Scenarios": 0.28, + "Nutrition": 0.725, + "Prehistory": 0.79, + "Public Relations": 0.664, + "Security Studies": 0.78, + "Sociology": 0.891, + "Virology": 0.536, + "World Religions": 0.854, + "Mean win rate": 0.565 + } + }, + { + "model_id": "upstage/solar-pro-241126", + "name": "Solar Pro", + "developer": "upstage", + "scores": { + "MMLU All Subjects": 0.776, + "Abstract Algebra": 0.46, + "Anatomy": 0.719, + "College Physics": 0.559, + "Computer Security": 0.82, + "Econometrics": 0.605, + "Global Facts": 0.5, + "Jurisprudence": 0.898, + "Philosophy": 0.817, + "Professional Psychology": 0.85, + "Us Foreign Policy": 0.97, + "Astronomy": 0.868, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.808, + "Conceptual Physics": 0.826, + "Electrical Engineering": 0.697, + "Elementary Mathematics": 0.611, + "Formal Logic": 0.579, + "High School World History": 0.907, + "Human Sexuality": 0.847, + "International Law": 0.901, + "Logical Fallacies": 0.865, + "Machine Learning": 0.616, + "Management": 0.864, + "Marketing": 0.953, + "Medical Genetics": 0.91, + "Miscellaneous": 0.888, + "Moral Scenarios": 0.811, + "Nutrition": 0.859, + "Prehistory": 0.867, + "Public Relations": 0.764, + "Security Studies": 0.82, + "Sociology": 0.886, + "Virology": 0.572, + "World Religions": 0.883, + "Mean win rate": 0.462 + } + }, + { + "model_id": "writer/palmyra-x-004", + "name": "Palmyra-X-004", + "developer": "writer", + "scores": { + "MMLU All Subjects": 0.813, + "Abstract Algebra": 0.75, + "Anatomy": 0.822, + "College Physics": 0.647, + "Computer Security": 0.82, + "Econometrics": 0.684, + "Global Facts": 0.62, + "Jurisprudence": 0.843, + "Philosophy": 0.83, + "Professional Psychology": 0.845, + "Us Foreign Policy": 0.92, + "Astronomy": 0.928, + "Business Ethics": 0.76, + "Clinical Knowledge": 0.879, + "Conceptual Physics": 0.885, + "Electrical Engineering": 0.793, + "Elementary Mathematics": 0.841, + "Formal Logic": 0.579, + "High School World History": 0.911, + "Human Sexuality": 0.924, + "International Law": 0.901, + "Logical Fallacies": 0.877, + "Machine Learning": 0.679, + "Management": 0.903, + "Marketing": 0.932, + "Medical Genetics": 0.87, + "Miscellaneous": 0.934, + "Moral Scenarios": 0.825, + "Nutrition": 0.869, + "Prehistory": 0.917, + "Public Relations": 0.791, + "Security Studies": 0.849, + "Sociology": 0.915, + "Virology": 0.584, + "World Religions": 0.842, + "Mean win rate": 0.629 + } + }, + { + "model_id": "writer/palmyra-x-v3", + "name": "Palmyra X V3 72B", + "developer": "writer", + "scores": { + "MMLU All Subjects": 0.786, + "Abstract Algebra": 0.53, + "Anatomy": 0.733, + "College Physics": 0.549, + "Computer Security": 0.78, + "Econometrics": 0.649, + "Global Facts": 0.53, + "Jurisprudence": 0.88, + "Philosophy": 0.836, + "Professional Psychology": 0.858, + "Us Foreign Policy": 0.96, + "Astronomy": 0.862, + "Business Ethics": 0.83, + "Clinical Knowledge": 0.804, + "Conceptual Physics": 0.809, + "Electrical Engineering": 0.772, + "Elementary Mathematics": 0.661, + "Formal Logic": 0.659, + "High School World History": 0.911, + "Human Sexuality": 0.924, + "International Law": 0.909, + "Logical Fallacies": 0.877, + "Machine Learning": 0.625, + "Management": 0.903, + "Marketing": 0.94, + "Medical Genetics": 0.83, + "Miscellaneous": 0.894, + "Moral Scenarios": 0.562, + "Nutrition": 0.856, + "Prehistory": 0.87, + "Public Relations": 0.773, + "Security Studies": 0.833, + "Sociology": 0.91, + "Virology": 0.572, + "World Religions": 0.877, + "Mean win rate": 0.325 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/hfopenllm_v2.json b/data/benchmarks/hfopenllm_v2.json new file mode 100644 index 0000000000000000000000000000000000000000..14cadd1c78c83808c8c6786afa0b5f2a1daac5c9 --- /dev/null +++ b/data/benchmarks/hfopenllm_v2.json @@ -0,0 +1,58452 @@ +{ + "models": [ + { + "model_id": "0-hero/Matter-0.2-7B-DPO", + "name": "Matter-0.2-7B-DPO", + "developer": "0-hero", + "scores": { + "IFEval": 0.3303, + "BBH": 0.3596, + "MATH Level 5": 0.0144, + "GPQA": 0.2592, + "MUSR": 0.3814, + "MMLU-PRO": 0.1164 + } + }, + { + "model_id": "01-ai/Yi-1.5-34B", + "name": "Yi-1.5-34B", + "developer": "01-ai", + "scores": { + "IFEval": 0.2841, + "BBH": 0.5976, + "MATH Level 5": 0.1533, + "GPQA": 0.3658, + "MUSR": 0.4236, + "MMLU-PRO": 0.4666 + } + }, + { + "model_id": "01-ai/Yi-1.5-34B-32K", + "name": "Yi-1.5-34B-32K", + "developer": "01-ai", + "scores": { + "IFEval": 0.3119, + "BBH": 0.6016, + "MATH Level 5": 0.1541, + "GPQA": 0.3633, + "MUSR": 0.4398, + "MMLU-PRO": 0.4709 + } + }, + { + "model_id": "01-ai/Yi-1.5-34B-Chat", + "name": "Yi-1.5-34B-Chat", + "developer": "01-ai", + "scores": { + "IFEval": 0.6067, + "BBH": 0.6084, + "MATH Level 5": 0.2772, + "GPQA": 0.3649, + "MUSR": 0.4282, + "MMLU-PRO": 0.452 + } + }, + { + "model_id": "01-ai/Yi-1.5-34B-Chat-16K", + "name": "Yi-1.5-34B-Chat-16K", + "developer": "01-ai", + "scores": { + "IFEval": 0.4564, + "BBH": 0.61, + "MATH Level 5": 0.2137, + "GPQA": 0.3381, + "MUSR": 0.4398, + "MMLU-PRO": 0.4545 + } + }, + { + "model_id": "01-ai/Yi-1.5-6B", + "name": "Yi-1.5-6B", + "developer": "01-ai", + "scores": { + "IFEval": 0.2617, + "BBH": 0.4493, + "MATH Level 5": 0.0665, + "GPQA": 0.3138, + "MUSR": 0.4374, + "MMLU-PRO": 0.3144 + } + }, + { + "model_id": "01-ai/Yi-1.5-6B-Chat", + "name": "Yi-1.5-6B-Chat", + "developer": "01-ai", + "scores": { + "IFEval": 0.5145, + "BBH": 0.4571, + "MATH Level 5": 0.1624, + "GPQA": 0.302, + "MUSR": 0.4392, + "MMLU-PRO": 0.3193 + } + }, + { + "model_id": "01-ai/Yi-1.5-9B", + "name": "Yi-1.5-9B", + "developer": "01-ai", + "scores": { + "IFEval": 0.2936, + "BBH": 0.5143, + "MATH Level 5": 0.114, + "GPQA": 0.3792, + "MUSR": 0.4328, + "MMLU-PRO": 0.3916 + } + }, + { + "model_id": "01-ai/Yi-1.5-9B-32K", + "name": "Yi-1.5-9B-32K", + "developer": "01-ai", + "scores": { + "IFEval": 0.2303, + "BBH": 0.4963, + "MATH Level 5": 0.108, + "GPQA": 0.3591, + "MUSR": 0.4186, + "MMLU-PRO": 0.3765 + } + }, + { + "model_id": "01-ai/Yi-1.5-9B-Chat", + "name": "Yi-1.5-9B-Chat", + "developer": "01-ai", + "scores": { + "IFEval": 0.6046, + "BBH": 0.5559, + "MATH Level 5": 0.2258, + "GPQA": 0.3347, + "MUSR": 0.4259, + "MMLU-PRO": 0.3975 + } + }, + { + "model_id": "01-ai/Yi-1.5-9B-Chat-16K", + "name": "Yi-1.5-9B-Chat-16K", + "developer": "01-ai", + "scores": { + "IFEval": 0.4214, + "BBH": 0.5153, + "MATH Level 5": 0.1782, + "GPQA": 0.3087, + "MUSR": 0.4099, + "MMLU-PRO": 0.3994 + } + }, + { + "model_id": "01-ai/Yi-34B", + "name": "Yi-34B", + "developer": "01-ai", + "scores": { + "IFEval": 0.3046, + "BBH": 0.5457, + "MATH Level 5": 0.0514, + "GPQA": 0.3666, + "MUSR": 0.4119, + "MMLU-PRO": 0.4412 + } + }, + { + "model_id": "01-ai/Yi-34B-200K", + "name": "Yi-34B-200K", + "developer": "01-ai", + "scores": { + "IFEval": 0.1542, + "BBH": 0.5442, + "MATH Level 5": 0.0574, + "GPQA": 0.3565, + "MUSR": 0.3817, + "MMLU-PRO": 0.4535 + } + }, + { + "model_id": "01-ai/Yi-34B-Chat", + "name": "Yi-34B-Chat", + "developer": "01-ai", + "scores": { + "IFEval": 0.4699, + "BBH": 0.5561, + "MATH Level 5": 0.0627, + "GPQA": 0.3381, + "MUSR": 0.3978, + "MMLU-PRO": 0.4093 + } + }, + { + "model_id": "01-ai/Yi-6B", + "name": "Yi-6B", + "developer": "01-ai", + "scores": { + "IFEval": 0.2893, + "BBH": 0.4309, + "MATH Level 5": 0.0159, + "GPQA": 0.2693, + "MUSR": 0.3937, + "MMLU-PRO": 0.2991 + } + }, + { + "model_id": "01-ai/Yi-6B-200K", + "name": "Yi-6B-200K", + "developer": "01-ai", + "scores": { + "IFEval": 0.0843, + "BBH": 0.4289, + "MATH Level 5": 0.0181, + "GPQA": 0.2819, + "MUSR": 0.4587, + "MMLU-PRO": 0.2844 + } + }, + { + "model_id": "01-ai/Yi-6B-Chat", + "name": "Yi-6B-Chat", + "developer": "01-ai", + "scores": { + "IFEval": 0.3395, + "BBH": 0.4133, + "MATH Level 5": 0.0136, + "GPQA": 0.2945, + "MUSR": 0.3688, + "MMLU-PRO": 0.3061 + } + }, + { + "model_id": "01-ai/Yi-9B", + "name": "Yi-9B", + "developer": "01-ai", + "scores": { + "IFEval": 0.2709, + "BBH": 0.494, + "MATH Level 5": 0.0559, + "GPQA": 0.318, + "MUSR": 0.4054, + "MMLU-PRO": 0.3574 + } + }, + { + "model_id": "01-ai/Yi-9B-200K", + "name": "Yi-9B-200K", + "developer": "01-ai", + "scores": { + "IFEval": 0.2327, + "BBH": 0.4793, + "MATH Level 5": 0.0665, + "GPQA": 0.3154, + "MUSR": 0.4294, + "MMLU-PRO": 0.3622 + } + }, + { + "model_id": "01-ai/Yi-Coder-9B-Chat", + "name": "Yi-Coder-9B-Chat", + "developer": "01-ai", + "scores": { + "IFEval": 0.4817, + "BBH": 0.4814, + "MATH Level 5": 0.04, + "GPQA": 0.2475, + "MUSR": 0.3992, + "MMLU-PRO": 0.2425 + } + }, + { + "model_id": "1-800-LLMs/Qwen-2.5-14B-Hindi", + "name": "Qwen-2.5-14B-Hindi", + "developer": "1-800-LLMs", + "scores": { + "IFEval": 0.5826, + "BBH": 0.6524, + "MATH Level 5": 0.3331, + "GPQA": 0.3624, + "MUSR": 0.4489, + "MMLU-PRO": 0.5263 + } + }, + { + "model_id": "1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct", + "name": "Qwen-2.5-14B-Hindi-Custom-Instruct", + "developer": "1-800-LLMs", + "scores": { + "IFEval": 0.3077, + "BBH": 0.6284, + "MATH Level 5": 0.3112, + "GPQA": 0.37, + "MUSR": 0.4491, + "MMLU-PRO": 0.5164 + } + }, + { + "model_id": "1024m/PHI-4-Hindi", + "name": "PHI-4-Hindi", + "developer": "1024m", + "scores": { + "IFEval": 0.0082, + "BBH": 0.671, + "MATH Level 5": 0.2334, + "GPQA": 0.3977, + "MUSR": 0.4914, + "MMLU-PRO": 0.5239 + } + }, + { + "model_id": "1024m/QWEN-14B-B100", + "name": "QWEN-14B-B100", + "developer": "1024m", + "scores": { + "IFEval": 0.7762, + "BBH": 0.6533, + "MATH Level 5": 0.5438, + "GPQA": 0.3507, + "MUSR": 0.41, + "MMLU-PRO": 0.5179 + } + }, + { + "model_id": "152334H/miqu-1-70b-sf", + "name": "miqu-1-70b-sf", + "developer": "152334H", + "scores": { + "IFEval": 0.5182, + "BBH": 0.6102, + "MATH Level 5": 0.1246, + "GPQA": 0.3507, + "MUSR": 0.4582, + "MMLU-PRO": 0.4228 + } + }, + { + "model_id": "1TuanPham/T-VisStar-7B-v0.1", + "name": "T-VisStar-7B-v0.1", + "developer": "1TuanPham", + "scores": { + "IFEval": 0.3607, + "BBH": 0.5052, + "MATH Level 5": 0.0574, + "GPQA": 0.2852, + "MUSR": 0.4375, + "MMLU-PRO": 0.3211 + } + }, + { + "model_id": "1TuanPham/T-VisStar-v0.1", + "name": "T-VisStar-v0.1", + "developer": "1TuanPham", + "scores": { + "IFEval": 0.3607, + "BBH": 0.5052, + "MATH Level 5": 0.0574, + "GPQA": 0.2852, + "MUSR": 0.4375, + "MMLU-PRO": 0.3211 + } + }, + { + "model_id": "3rd-Degree-Burn/L-3.1-Science-Writer-8B", + "name": "L-3.1-Science-Writer-8B", + "developer": "3rd-Degree-Burn", + "scores": { + "IFEval": 0.4263, + "BBH": 0.5041, + "MATH Level 5": 0.1035, + "GPQA": 0.2743, + "MUSR": 0.3959, + "MMLU-PRO": 0.3649 + } + }, + { + "model_id": "3rd-Degree-Burn/Llama-3.1-8B-Squareroot", + "name": "Llama-3.1-8B-Squareroot", + "developer": "3rd-Degree-Burn", + "scores": { + "IFEval": 0.2213, + "BBH": 0.3461, + "MATH Level 5": 0.2659, + "GPQA": 0.2567, + "MUSR": 0.3089, + "MMLU-PRO": 0.175 + } + }, + { + "model_id": "3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1", + "name": "Llama-3.1-8B-Squareroot-v1", + "developer": "3rd-Degree-Burn", + "scores": { + "IFEval": 0.2892, + "BBH": 0.3343, + "MATH Level 5": 0.0884, + "GPQA": 0.2559, + "MUSR": 0.3341, + "MMLU-PRO": 0.1127 + } + }, + { + "model_id": "3rd-Degree-Burn/Llama-Squared-8B", + "name": "Llama-Squared-8B", + "developer": "3rd-Degree-Burn", + "scores": { + "IFEval": 0.2755, + "BBH": 0.4431, + "MATH Level 5": 0.0574, + "GPQA": 0.2718, + "MUSR": 0.3089, + "MMLU-PRO": 0.2366 + } + }, + { + "model_id": "4season/final_model_test_v2", + "name": "final_model_test_v2", + "developer": "4season", + "scores": { + "IFEval": 0.3191, + "BBH": 0.6342, + "MATH Level 5": 0.0838, + "GPQA": 0.3272, + "MUSR": 0.4314, + "MMLU-PRO": 0.3528 + } + }, + { + "model_id": "AALF/FuseChat-Llama-3.1-8B-Instruct-preview", + "name": "FuseChat-Llama-3.1-8B-Instruct-preview", + "developer": "AALF", + "scores": { + "IFEval": 0.719, + "BBH": 0.512, + "MATH Level 5": 0.2477, + "GPQA": 0.3054, + "MUSR": 0.382, + "MMLU-PRO": 0.3733 + } + }, + { + "model_id": "AALF/FuseChat-Llama-3.1-8B-SFT-preview", + "name": "FuseChat-Llama-3.1-8B-SFT-preview", + "developer": "AALF", + "scores": { + "IFEval": 0.7281, + "BBH": 0.524, + "MATH Level 5": 0.2251, + "GPQA": 0.3045, + "MUSR": 0.402, + "MMLU-PRO": 0.3743 + } + }, + { + "model_id": "AALF/gemma-2-27b-it-SimPO-37K", + "name": "gemma-2-27b-it-SimPO-37K", + "developer": "AALF", + "scores": { + "IFEval": 0.2407, + "BBH": 0.3911, + "MATH Level 5": 0.0128, + "GPQA": 0.2802, + "MUSR": 0.3488, + "MMLU-PRO": 0.1971 + } + }, + { + "model_id": "AALF/gemma-2-27b-it-SimPO-37K-100steps", + "name": "gemma-2-27b-it-SimPO-37K-100steps", + "developer": "AALF", + "scores": { + "IFEval": 0.2568, + "BBH": 0.3931, + "MATH Level 5": 0.0211, + "GPQA": 0.2886, + "MUSR": 0.3329, + "MMLU-PRO": 0.2125 + } + }, + { + "model_id": "AELLM/gemma-2-aeria-infinity-9b", + "name": "gemma-2-aeria-infinity-9b", + "developer": "AELLM", + "scores": { + "IFEval": 0.7594, + "BBH": 0.5983, + "MATH Level 5": 0.2145, + "GPQA": 0.3339, + "MUSR": 0.402, + "MMLU-PRO": 0.3862 + } + }, + { + "model_id": "AELLM/gemma-2-lyco-infinity-9b", + "name": "gemma-2-lyco-infinity-9b", + "developer": "AELLM", + "scores": { + "IFEval": 0.7316, + "BBH": 0.584, + "MATH Level 5": 0.1707, + "GPQA": 0.328, + "MUSR": 0.4006, + "MMLU-PRO": 0.3787 + } + }, + { + "model_id": "AGI-0/Art-v0-3B", + "name": "Art-v0-3B", + "developer": "AGI-0", + "scores": { + "IFEval": 0.3192, + "BBH": 0.3401, + "MATH Level 5": 0.2462, + "GPQA": 0.2592, + "MUSR": 0.3768, + "MMLU-PRO": 0.1179 + } + }, + { + "model_id": "AGI-0/Artificium-llama3.1-8B-001", + "name": "Artificium-llama3.1-8B-001", + "developer": "AGI-0", + "scores": { + "IFEval": 0.5248, + "BBH": 0.4256, + "MATH Level 5": 0.136, + "GPQA": 0.2659, + "MUSR": 0.3795, + "MMLU-PRO": 0.3182 + } + }, + { + "model_id": "AGI-0/smartllama3.1-8B-001", + "name": "smartllama3.1-8B-001", + "developer": "AGI-0", + "scores": { + "IFEval": 0.3518, + "BBH": 0.467, + "MATH Level 5": 0.1299, + "GPQA": 0.3062, + "MUSR": 0.4386, + "MMLU-PRO": 0.3487 + } + }, + { + "model_id": "AI-MO/NuminaMath-7B-CoT", + "name": "NuminaMath-7B-CoT", + "developer": "AI-MO", + "scores": { + "IFEval": 0.2689, + "BBH": 0.4314, + "MATH Level 5": 0.2696, + "GPQA": 0.2659, + "MUSR": 0.3303, + "MMLU-PRO": 0.2868 + } + }, + { + "model_id": "AI-MO/NuminaMath-7B-TIR", + "name": "NuminaMath-7B-TIR", + "developer": "AI-MO", + "scores": { + "IFEval": 0.2756, + "BBH": 0.4144, + "MATH Level 5": 0.1609, + "GPQA": 0.2584, + "MUSR": 0.3509, + "MMLU-PRO": 0.2733 + } + }, + { + "model_id": "AI-Sweden-Models/Llama-3-8B-instruct", + "name": "Llama-3-8B-instruct", + "developer": "AI-Sweden-Models", + "scores": { + "IFEval": 0.2401, + "BBH": 0.4173, + "MATH Level 5": 0.0385, + "GPQA": 0.2659, + "MUSR": 0.4771, + "MMLU-PRO": 0.2597 + } + }, + { + "model_id": "AI-Sweden-Models/gpt-sw3-40b", + "name": "gpt-sw3-40b", + "developer": "AI-Sweden-Models", + "scores": { + "IFEval": 0.147, + "BBH": 0.3268, + "MATH Level 5": 0.0174, + "GPQA": 0.2349, + "MUSR": 0.3632, + "MMLU-PRO": 0.1276 + } + }, + { + "model_id": "AI4free/Dhanishtha", + "name": "Dhanishtha", + "developer": "AI4free", + "scores": { + "IFEval": 0.2451, + "BBH": 0.3404, + "MATH Level 5": 0.256, + "GPQA": 0.2525, + "MUSR": 0.3569, + "MMLU-PRO": 0.1643 + } + }, + { + "model_id": "AI4free/t2", + "name": "t2", + "developer": "AI4free", + "scores": { + "IFEval": 0.3867, + "BBH": 0.291, + "MATH Level 5": 0.1896, + "GPQA": 0.2576, + "MUSR": 0.3846, + "MMLU-PRO": 0.1144 + } + }, + { + "model_id": "AIDC-AI/Marco-o1", + "name": "Marco-o1", + "developer": "AIDC-AI", + "scores": { + "IFEval": 0.4771, + "BBH": 0.5364, + "MATH Level 5": 0.3746, + "GPQA": 0.2592, + "MUSR": 0.4138, + "MMLU-PRO": 0.4117 + } + }, + { + "model_id": "Aashraf995/Creative-7B-nerd", + "name": "Creative-7B-nerd", + "developer": "Aashraf995", + "scores": { + "IFEval": 0.4722, + "BBH": 0.5607, + "MATH Level 5": 0.3165, + "GPQA": 0.3263, + "MUSR": 0.4515, + "MMLU-PRO": 0.4492 + } + }, + { + "model_id": "Aashraf995/Gemma-Evo-10B", + "name": "Gemma-Evo-10B", + "developer": "Aashraf995", + "scores": { + "IFEval": 0.7332, + "BBH": 0.6044, + "MATH Level 5": 0.2228, + "GPQA": 0.354, + "MUSR": 0.4595, + "MMLU-PRO": 0.4275 + } + }, + { + "model_id": "Aashraf995/Qwen-Evo-7B", + "name": "Qwen-Evo-7B", + "developer": "Aashraf995", + "scores": { + "IFEval": 0.4757, + "BBH": 0.5709, + "MATH Level 5": 0.3142, + "GPQA": 0.3255, + "MUSR": 0.4541, + "MMLU-PRO": 0.4462 + } + }, + { + "model_id": "Aashraf995/QwenStock-14B", + "name": "QwenStock-14B", + "developer": "Aashraf995", + "scores": { + "IFEval": 0.5009, + "BBH": 0.655, + "MATH Level 5": 0.3573, + "GPQA": 0.3893, + "MUSR": 0.4793, + "MMLU-PRO": 0.5382 + } + }, + { + "model_id": "AbacusResearch/Jallabi-34B", + "name": "Jallabi-34B", + "developer": "AbacusResearch", + "scores": { + "IFEval": 0.3529, + "BBH": 0.6023, + "MATH Level 5": 0.0521, + "GPQA": 0.3389, + "MUSR": 0.4822, + "MMLU-PRO": 0.4682 + } + }, + { + "model_id": "Ahdoot/StructuredThinker-v0.3-MoreStructure", + "name": "StructuredThinker-v0.3-MoreStructure", + "developer": "Ahdoot", + "scores": { + "IFEval": 0.4193, + "BBH": 0.4838, + "MATH Level 5": 0.2908, + "GPQA": 0.297, + "MUSR": 0.4158, + "MMLU-PRO": 0.361 + } + }, + { + "model_id": "Ahdoot/Test_StealthThinker", + "name": "Test_StealthThinker", + "developer": "Ahdoot", + "scores": { + "IFEval": 0.422, + "BBH": 0.4647, + "MATH Level 5": 0.179, + "GPQA": 0.2961, + "MUSR": 0.428, + "MMLU-PRO": 0.3597 + } + }, + { + "model_id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0", + "name": "Cybernet-Sec-3B-R1-V0", + "developer": "AicoresSecurity", + "scores": { + "IFEval": 0.6358, + "BBH": 0.4497, + "MATH Level 5": 0.1156, + "GPQA": 0.2634, + "MUSR": 0.3314, + "MMLU-PRO": 0.301 + } + }, + { + "model_id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder", + "name": "Cybernet-Sec-3B-R1-V0-Coder", + "developer": "AicoresSecurity", + "scores": { + "IFEval": 0.7098, + "BBH": 0.4478, + "MATH Level 5": 0.1488, + "GPQA": 0.2718, + "MUSR": 0.3408, + "MMLU-PRO": 0.3178 + } + }, + { + "model_id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1", + "name": "Cybernet-Sec-3B-R1-V1", + "developer": "AicoresSecurity", + "scores": { + "IFEval": 0.6146, + "BBH": 0.4282, + "MATH Level 5": 0.1518, + "GPQA": 0.2609, + "MUSR": 0.3287, + "MMLU-PRO": 0.2876 + } + }, + { + "model_id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1.1", + "name": "Cybernet-Sec-3B-R1-V1.1", + "developer": "AicoresSecurity", + "scores": { + "IFEval": 0.673, + "BBH": 0.4392, + "MATH Level 5": 0.176, + "GPQA": 0.271, + "MUSR": 0.3541, + "MMLU-PRO": 0.3088 + } + }, + { + "model_id": "Alepach/notHumpback-M0", + "name": "notHumpback-M0", + "developer": "Alepach", + "scores": { + "IFEval": 0.235, + "BBH": 0.2785, + "MATH Level 5": 0.0189, + "GPQA": 0.2492, + "MUSR": 0.3552, + "MMLU-PRO": 0.1119 + } + }, + { + "model_id": "Alepach/notHumpback-M1", + "name": "notHumpback-M1", + "developer": "Alepach", + "scores": { + "IFEval": 0.2207, + "BBH": 0.2882, + "MATH Level 5": 0.0159, + "GPQA": 0.2374, + "MUSR": 0.342, + "MMLU-PRO": 0.1091 + } + }, + { + "model_id": "Alepach/notHumpback-M1-v2", + "name": "notHumpback-M1-v2", + "developer": "Alepach", + "scores": { + "IFEval": 0.2277, + "BBH": 0.2776, + "MATH Level 5": 0.0219, + "GPQA": 0.2601, + "MUSR": 0.3473, + "MMLU-PRO": 0.1119 + } + }, + { + "model_id": "Alibaba-NLP/gte-Qwen2-7B-instruct", + "name": "gte-Qwen2-7B-instruct", + "developer": "Alibaba-NLP", + "scores": { + "IFEval": 0.2255, + "BBH": 0.4495, + "MATH Level 5": 0.0642, + "GPQA": 0.245, + "MUSR": 0.3559, + "MMLU-PRO": 0.3321 + } + }, + { + "model_id": "Alsebay/Qwen2.5-7B-test-novelist", + "name": "Qwen2.5-7B-test-novelist", + "developer": "Alsebay", + "scores": { + "IFEval": 0.5352, + "BBH": 0.5151, + "MATH Level 5": 0.2349, + "GPQA": 0.2911, + "MUSR": 0.4749, + "MMLU-PRO": 0.3866 + } + }, + { + "model_id": "Amaorynho/BBAI2006", + "name": "BBAI2006", + "developer": "Amaorynho", + "scores": { + "IFEval": 0.1467, + "BBH": 0.2704, + "MATH Level 5": 0.0, + "GPQA": 0.2525, + "MUSR": 0.3605, + "MMLU-PRO": 0.1123 + } + }, + { + "model_id": "Amaorynho/BBAI270V4", + "name": "BBAI270V4", + "developer": "Amaorynho", + "scores": { + "IFEval": 0.199, + "BBH": 0.3071, + "MATH Level 5": 0.0083, + "GPQA": 0.2458, + "MUSR": 0.3314, + "MMLU-PRO": 0.1114 + } + }, + { + "model_id": "Amaorynho/BBAIIFEV1", + "name": "BBAIIFEV1", + "developer": "Amaorynho", + "scores": { + "IFEval": 0.8047, + "BBH": 0.5292, + "MATH Level 5": 0.1934, + "GPQA": 0.3104, + "MUSR": 0.4185, + "MMLU-PRO": 0.3857 + } + }, + { + "model_id": "Amaorynho/BBAI_375", + "name": "BBAI_375", + "developer": "Amaorynho", + "scores": { + "IFEval": 0.1467, + "BBH": 0.2704, + "MATH Level 5": 0.0, + "GPQA": 0.2525, + "MUSR": 0.3605, + "MMLU-PRO": 0.1123 + } + }, + { + "model_id": "Amu/t1-1.5B", + "name": "t1-1.5B", + "developer": "Amu", + "scores": { + "IFEval": 0.3394, + "BBH": 0.4008, + "MATH Level 5": 0.0514, + "GPQA": 0.2433, + "MUSR": 0.3517, + "MMLU-PRO": 0.2566 + } + }, + { + "model_id": "Amu/t1-3B", + "name": "t1-3B", + "developer": "Amu", + "scores": { + "IFEval": 0.3328, + "BBH": 0.3999, + "MATH Level 5": 0.1375, + "GPQA": 0.2408, + "MUSR": 0.3435, + "MMLU-PRO": 0.1284 + } + }, + { + "model_id": "ArliAI/ArliAI-RPMax-12B-v1.1", + "name": "ArliAI-RPMax-12B-v1.1", + "developer": "ArliAI", + "scores": { + "IFEval": 0.5349, + "BBH": 0.4752, + "MATH Level 5": 0.1125, + "GPQA": 0.2819, + "MUSR": 0.3618, + "MMLU-PRO": 0.3384 + } + }, + { + "model_id": "ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1", + "name": "Llama-3.1-8B-ArliAI-RPMax-v1.1", + "developer": "ArliAI", + "scores": { + "IFEval": 0.6359, + "BBH": 0.5016, + "MATH Level 5": 0.1314, + "GPQA": 0.2836, + "MUSR": 0.3577, + "MMLU-PRO": 0.3551 + } + }, + { + "model_id": "Arthur-LAGACHERIE/Precis-1B-Instruct", + "name": "Precis-1B-Instruct", + "developer": "Arthur-LAGACHERIE", + "scores": { + "IFEval": 0.3671, + "BBH": 0.3224, + "MATH Level 5": 0.0038, + "GPQA": 0.2659, + "MUSR": 0.3436, + "MMLU-PRO": 0.1426 + } + }, + { + "model_id": "Artples/L-MChat-7b", + "name": "L-MChat-7b", + "developer": "Artples", + "scores": { + "IFEval": 0.5297, + "BBH": 0.46, + "MATH Level 5": 0.0921, + "GPQA": 0.3054, + "MUSR": 0.4029, + "MMLU-PRO": 0.3299 + } + }, + { + "model_id": "Artples/L-MChat-Small", + "name": "L-MChat-Small", + "developer": "Artples", + "scores": { + "IFEval": 0.3287, + "BBH": 0.4823, + "MATH Level 5": 0.0378, + "GPQA": 0.2676, + "MUSR": 0.3696, + "MMLU-PRO": 0.2464 + } + }, + { + "model_id": "Aryanne/QwentileSwap", + "name": "QwentileSwap", + "developer": "Aryanne", + "scores": { + "IFEval": 0.7378, + "BBH": 0.7008, + "MATH Level 5": 0.4222, + "GPQA": 0.3674, + "MUSR": 0.464, + "MMLU-PRO": 0.5946 + } + }, + { + "model_id": "Aryanne/SHBA", + "name": "SHBA", + "developer": "Aryanne", + "scores": { + "IFEval": 0.7817, + "BBH": 0.5233, + "MATH Level 5": 0.1798, + "GPQA": 0.3054, + "MUSR": 0.4161, + "MMLU-PRO": 0.3892 + } + }, + { + "model_id": "Aryanne/SuperHeart", + "name": "SuperHeart", + "developer": "Aryanne", + "scores": { + "IFEval": 0.5192, + "BBH": 0.5215, + "MATH Level 5": 0.1563, + "GPQA": 0.3012, + "MUSR": 0.4436, + "MMLU-PRO": 0.3912 + } + }, + { + "model_id": "AtAndDev/Qwen2.5-1.5B-continuous-learnt", + "name": "Qwen2.5-1.5B-continuous-learnt", + "developer": "AtAndDev", + "scores": { + "IFEval": 0.4605, + "BBH": 0.4258, + "MATH Level 5": 0.0748, + "GPQA": 0.2659, + "MUSR": 0.3636, + "MMLU-PRO": 0.2812 + } + }, + { + "model_id": "Ateron/Glowing-Forest-12B", + "name": "Glowing-Forest-12B", + "developer": "Ateron", + "scores": { + "IFEval": 0.3592, + "BBH": 0.5492, + "MATH Level 5": 0.0778, + "GPQA": 0.3331, + "MUSR": 0.4449, + "MMLU-PRO": 0.3718 + } + }, + { + "model_id": "Ateron/Lotus-Magpic", + "name": "Lotus-Magpic", + "developer": "Ateron", + "scores": { + "IFEval": 0.6286, + "BBH": 0.5254, + "MATH Level 5": 0.0997, + "GPQA": 0.3029, + "MUSR": 0.4332, + "MMLU-PRO": 0.3491 + } + }, + { + "model_id": "Ateron/Way_of_MagPicaro", + "name": "Way_of_MagPicaro", + "developer": "Ateron", + "scores": { + "IFEval": 0.2637, + "BBH": 0.5427, + "MATH Level 5": 0.0589, + "GPQA": 0.3339, + "MUSR": 0.4649, + "MMLU-PRO": 0.3536 + } + }, + { + "model_id": "AuraIndustries/Aura-4B", + "name": "Aura-4B", + "developer": "AuraIndustries", + "scores": { + "IFEval": 0.3816, + "BBH": 0.449, + "MATH Level 5": 0.0423, + "GPQA": 0.2878, + "MUSR": 0.3938, + "MMLU-PRO": 0.2706 + } + }, + { + "model_id": "AuraIndustries/Aura-8B", + "name": "Aura-8B", + "developer": "AuraIndustries", + "scores": { + "IFEval": 0.7205, + "BBH": 0.5131, + "MATH Level 5": 0.1518, + "GPQA": 0.2861, + "MUSR": 0.4004, + "MMLU-PRO": 0.3874 + } + }, + { + "model_id": "AuraIndustries/Aura-MoE-2x4B", + "name": "Aura-MoE-2x4B", + "developer": "AuraIndustries", + "scores": { + "IFEval": 0.4601, + "BBH": 0.4339, + "MATH Level 5": 0.031, + "GPQA": 0.2718, + "MUSR": 0.4085, + "MMLU-PRO": 0.265 + } + }, + { + "model_id": "AuraIndustries/Aura-MoE-2x4B-v2", + "name": "Aura-MoE-2x4B-v2", + "developer": "AuraIndustries", + "scores": { + "IFEval": 0.4778, + "BBH": 0.4315, + "MATH Level 5": 0.0317, + "GPQA": 0.2878, + "MUSR": 0.4101, + "MMLU-PRO": 0.261 + } + }, + { + "model_id": "Aurel9/testmerge-7b", + "name": "testmerge-7b", + "developer": "Aurel9", + "scores": { + "IFEval": 0.398, + "BBH": 0.519, + "MATH Level 5": 0.0657, + "GPQA": 0.3003, + "MUSR": 0.4659, + "MMLU-PRO": 0.3053 + } + }, + { + "model_id": "Ayush-Singh/Llama1B-sft-2", + "name": "Llama1B-sft-2", + "developer": "Ayush-Singh", + "scores": { + "IFEval": 0.1374, + "BBH": 0.2834, + "MATH Level 5": 0.0, + "GPQA": 0.2458, + "MUSR": 0.3552, + "MMLU-PRO": 0.1117 + } + }, + { + "model_id": "Azure99/Blossom-V6-14B", + "name": "Blossom-V6-14B", + "developer": "Azure99", + "scores": { + "IFEval": 0.6395, + "BBH": 0.5069, + "MATH Level 5": 0.5257, + "GPQA": 0.2626, + "MUSR": 0.4035, + "MMLU-PRO": 0.4544 + } + }, + { + "model_id": "Azure99/Blossom-V6-7B", + "name": "Blossom-V6-7B", + "developer": "Azure99", + "scores": { + "IFEval": 0.5538, + "BBH": 0.4974, + "MATH Level 5": 0.4585, + "GPQA": 0.3045, + "MUSR": 0.4301, + "MMLU-PRO": 0.4144 + } + }, + { + "model_id": "Azure99/blossom-v5-32b", + "name": "blossom-v5-32b", + "developer": "Azure99", + "scores": { + "IFEval": 0.5235, + "BBH": 0.5955, + "MATH Level 5": 0.1866, + "GPQA": 0.3112, + "MUSR": 0.402, + "MMLU-PRO": 0.4235 + } + }, + { + "model_id": "Azure99/blossom-v5-llama3-8b", + "name": "blossom-v5-llama3-8b", + "developer": "Azure99", + "scores": { + "IFEval": 0.4343, + "BBH": 0.4185, + "MATH Level 5": 0.0514, + "GPQA": 0.2651, + "MUSR": 0.367, + "MMLU-PRO": 0.2206 + } + }, + { + "model_id": "Azure99/blossom-v5.1-34b", + "name": "blossom-v5.1-34b", + "developer": "Azure99", + "scores": { + "IFEval": 0.5697, + "BBH": 0.6109, + "MATH Level 5": 0.2591, + "GPQA": 0.3096, + "MUSR": 0.3928, + "MMLU-PRO": 0.4558 + } + }, + { + "model_id": "Azure99/blossom-v5.1-9b", + "name": "blossom-v5.1-9b", + "developer": "Azure99", + "scores": { + "IFEval": 0.5086, + "BBH": 0.5343, + "MATH Level 5": 0.2122, + "GPQA": 0.3356, + "MUSR": 0.3994, + "MMLU-PRO": 0.3979 + } + }, + { + "model_id": "BAAI/Gemma2-9B-IT-Simpo-Infinity-Preference", + "name": "Gemma2-9B-IT-Simpo-Infinity-Preference", + "developer": "BAAI", + "scores": { + "IFEval": 0.3176, + "BBH": 0.5979, + "MATH Level 5": 0.0974, + "GPQA": 0.3398, + "MUSR": 0.3966, + "MMLU-PRO": 0.3869 + } + }, + { + "model_id": "BAAI/Infinity-Instruct-3M-0613-Llama3-70B", + "name": "Infinity-Instruct-3M-0613-Llama3-70B", + "developer": "BAAI", + "scores": { + "IFEval": 0.6821, + "BBH": 0.6642, + "MATH Level 5": 0.2153, + "GPQA": 0.3582, + "MUSR": 0.4523, + "MMLU-PRO": 0.473 + } + }, + { + "model_id": "BAAI/Infinity-Instruct-3M-0613-Mistral-7B", + "name": "Infinity-Instruct-3M-0613-Mistral-7B", + "developer": "BAAI", + "scores": { + "IFEval": 0.532, + "BBH": 0.4958, + "MATH Level 5": 0.0816, + "GPQA": 0.2961, + "MUSR": 0.4351, + "MMLU-PRO": 0.3161 + } + }, + { + "model_id": "BAAI/Infinity-Instruct-3M-0625-Llama3-70B", + "name": "Infinity-Instruct-3M-0625-Llama3-70B", + "developer": "BAAI", + "scores": { + "IFEval": 0.7442, + "BBH": 0.667, + "MATH Level 5": 0.2251, + "GPQA": 0.3574, + "MUSR": 0.4617, + "MMLU-PRO": 0.4586 + } + }, + { + "model_id": "BAAI/Infinity-Instruct-3M-0625-Llama3-8B", + "name": "Infinity-Instruct-3M-0625-Llama3-8B", + "developer": "BAAI", + "scores": { + "IFEval": 0.605, + "BBH": 0.4955, + "MATH Level 5": 0.0884, + "GPQA": 0.2752, + "MUSR": 0.3712, + "MMLU-PRO": 0.3252 + } + }, + { + "model_id": "BAAI/Infinity-Instruct-3M-0625-Mistral-7B", + "name": "Infinity-Instruct-3M-0625-Mistral-7B", + "developer": "BAAI", + "scores": { + "IFEval": 0.5867, + "BBH": 0.494, + "MATH Level 5": 0.0763, + "GPQA": 0.2869, + "MUSR": 0.4272, + "MMLU-PRO": 0.323 + } + }, + { + "model_id": "BAAI/Infinity-Instruct-3M-0625-Qwen2-7B", + "name": "Infinity-Instruct-3M-0625-Qwen2-7B", + "developer": "BAAI", + "scores": { + "IFEval": 0.5554, + "BBH": 0.5346, + "MATH Level 5": 0.1926, + "GPQA": 0.3129, + "MUSR": 0.3888, + "MMLU-PRO": 0.396 + } + }, + { + "model_id": "BAAI/Infinity-Instruct-3M-0625-Yi-1.5-9B", + "name": "Infinity-Instruct-3M-0625-Yi-1.5-9B", + "developer": "BAAI", + "scores": { + "IFEval": 0.5186, + "BBH": 0.5509, + "MATH Level 5": 0.1639, + "GPQA": 0.354, + "MUSR": 0.4575, + "MMLU-PRO": 0.4118 + } + }, + { + "model_id": "BAAI/Infinity-Instruct-7M-0729-Llama3_1-8B", + "name": "Infinity-Instruct-7M-0729-Llama3_1-8B", + "developer": "BAAI", + "scores": { + "IFEval": 0.6132, + "BBH": 0.5077, + "MATH Level 5": 0.1276, + "GPQA": 0.2928, + "MUSR": 0.3578, + "MMLU-PRO": 0.3224 + } + }, + { + "model_id": "BAAI/Infinity-Instruct-7M-0729-mistral-7B", + "name": "Infinity-Instruct-7M-0729-mistral-7B", + "developer": "BAAI", + "scores": { + "IFEval": 0.6162, + "BBH": 0.4964, + "MATH Level 5": 0.0831, + "GPQA": 0.2903, + "MUSR": 0.4062, + "MMLU-PRO": 0.3274 + } + }, + { + "model_id": "BAAI/Infinity-Instruct-7M-Gen-Llama3_1-70B", + "name": "Infinity-Instruct-7M-Gen-Llama3_1-70B", + "developer": "BAAI", + "scores": { + "IFEval": 0.7335, + "BBH": 0.6695, + "MATH Level 5": 0.2523, + "GPQA": 0.3758, + "MUSR": 0.4539, + "MMLU-PRO": 0.4607 + } + }, + { + "model_id": "BAAI/Infinity-Instruct-7M-Gen-Llama3_1-8B", + "name": "Infinity-Instruct-7M-Gen-Llama3_1-8B", + "developer": "BAAI", + "scores": { + "IFEval": 0.6132, + "BBH": 0.5077, + "MATH Level 5": 0.1276, + "GPQA": 0.2928, + "MUSR": 0.3578, + "MMLU-PRO": 0.3224 + } + }, + { + "model_id": "BAAI/Infinity-Instruct-7M-Gen-mistral-7B", + "name": "Infinity-Instruct-7M-Gen-mistral-7B", + "developer": "BAAI", + "scores": { + "IFEval": 0.6147, + "BBH": 0.4964, + "MATH Level 5": 0.0831, + "GPQA": 0.2903, + "MUSR": 0.4062, + "MMLU-PRO": 0.3274 + } + }, + { + "model_id": "BAAI/OPI-Llama-3.1-8B-Instruct", + "name": "OPI-Llama-3.1-8B-Instruct", + "developer": "BAAI", + "scores": { + "IFEval": 0.2075, + "BBH": 0.3551, + "MATH Level 5": 0.0136, + "GPQA": 0.2743, + "MUSR": 0.3233, + "MMLU-PRO": 0.2124 + } + }, + { + "model_id": "BEE-spoke-data/Meta-Llama-3-8Bee", + "name": "Meta-Llama-3-8Bee", + "developer": "BEE-spoke-data", + "scores": { + "IFEval": 0.1951, + "BBH": 0.4626, + "MATH Level 5": 0.0483, + "GPQA": 0.3138, + "MUSR": 0.3654, + "MMLU-PRO": 0.322 + } + }, + { + "model_id": "BEE-spoke-data/smol_llama-101M-GQA", + "name": "smol_llama-101M-GQA", + "developer": "BEE-spoke-data", + "scores": { + "IFEval": 0.1384, + "BBH": 0.3018, + "MATH Level 5": 0.006, + "GPQA": 0.2576, + "MUSR": 0.3713, + "MMLU-PRO": 0.1107 + } + }, + { + "model_id": "BEE-spoke-data/smol_llama-220M-GQA", + "name": "smol_llama-220M-GQA", + "developer": "BEE-spoke-data", + "scores": { + "IFEval": 0.2386, + "BBH": 0.3032, + "MATH Level 5": 0.0106, + "GPQA": 0.2559, + "MUSR": 0.4059, + "MMLU-PRO": 0.1149 + } + }, + { + "model_id": "BEE-spoke-data/smol_llama-220M-GQA-fineweb_edu", + "name": "smol_llama-220M-GQA-fineweb_edu", + "developer": "BEE-spoke-data", + "scores": { + "IFEval": 0.1988, + "BBH": 0.2929, + "MATH Level 5": 0.0068, + "GPQA": 0.2592, + "MUSR": 0.4368, + "MMLU-PRO": 0.1127 + } + }, + { + "model_id": "BEE-spoke-data/smol_llama-220M-openhermes", + "name": "smol_llama-220M-openhermes", + "developer": "BEE-spoke-data", + "scores": { + "IFEval": 0.1555, + "BBH": 0.3028, + "MATH Level 5": 0.0106, + "GPQA": 0.2676, + "MUSR": 0.3847, + "MMLU-PRO": 0.112 + } + }, + { + "model_id": "BEE-spoke-data/tFINE-900m-e16-d32-flan", + "name": "tFINE-900m-e16-d32-flan", + "developer": "BEE-spoke-data", + "scores": { + "IFEval": 0.1506, + "BBH": 0.3028, + "MATH Level 5": 0.0098, + "GPQA": 0.2332, + "MUSR": 0.3724, + "MMLU-PRO": 0.1307 + } + }, + { + "model_id": "BEE-spoke-data/tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024", + "name": "tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024", + "developer": "BEE-spoke-data", + "scores": { + "IFEval": 0.1321, + "BBH": 0.3138, + "MATH Level 5": 0.0106, + "GPQA": 0.2542, + "MUSR": 0.4393, + "MMLU-PRO": 0.1237 + } + }, + { + "model_id": "BEE-spoke-data/tFINE-900m-e16-d32-instruct_2e", + "name": "tFINE-900m-e16-d32-instruct_2e", + "developer": "BEE-spoke-data", + "scores": { + "IFEval": 0.1403, + "BBH": 0.3135, + "MATH Level 5": 0.0136, + "GPQA": 0.2592, + "MUSR": 0.4207, + "MMLU-PRO": 0.1237 + } + }, + { + "model_id": "BEE-spoke-data/tFINE-900m-instruct-orpo", + "name": "tFINE-900m-instruct-orpo", + "developer": "BEE-spoke-data", + "scores": { + "IFEval": 0.133, + "BBH": 0.3022, + "MATH Level 5": 0.0159, + "GPQA": 0.2592, + "MUSR": 0.3409, + "MMLU-PRO": 0.1152 + } + }, + { + "model_id": "BSC-LT/salamandra-7b", + "name": "salamandra-7b", + "developer": "BSC-LT", + "scores": { + "IFEval": 0.1367, + "BBH": 0.3517, + "MATH Level 5": 0.0038, + "GPQA": 0.2701, + "MUSR": 0.3501, + "MMLU-PRO": 0.1493 + } + }, + { + "model_id": "BSC-LT/salamandra-7b-instruct", + "name": "salamandra-7b-instruct", + "developer": "BSC-LT", + "scores": { + "IFEval": 0.2451, + "BBH": 0.3851, + "MATH Level 5": 0.0083, + "GPQA": 0.2643, + "MUSR": 0.4134, + "MMLU-PRO": 0.1805 + } + }, + { + "model_id": "Ba2han/Llama-Phi-3_DoRA", + "name": "Llama-Phi-3_DoRA", + "developer": "Ba2han", + "scores": { + "IFEval": 0.5131, + "BBH": 0.5515, + "MATH Level 5": 0.1216, + "GPQA": 0.3263, + "MUSR": 0.4069, + "MMLU-PRO": 0.3915 + } + }, + { + "model_id": "Baptiste-HUVELLE-10/LeTriomphant2.2_ECE_iLAB", + "name": "LeTriomphant2.2_ECE_iLAB", + "developer": "Baptiste-HUVELLE-10", + "scores": { + "IFEval": 0.5076, + "BBH": 0.7256, + "MATH Level 5": 0.4449, + "GPQA": 0.3993, + "MUSR": 0.4626, + "MMLU-PRO": 0.5851 + } + }, + { + "model_id": "BenevolenceMessiah/Qwen2.5-72B-2x-Instruct-TIES-v1.0", + "name": "Qwen2.5-72B-2x-Instruct-TIES-v1.0", + "developer": "BenevolenceMessiah", + "scores": { + "IFEval": 0.5473, + "BBH": 0.7273, + "MATH Level 5": 0.5785, + "GPQA": 0.3674, + "MUSR": 0.4207, + "MMLU-PRO": 0.5628 + } + }, + { + "model_id": "BenevolenceMessiah/Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0", + "name": "Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0", + "developer": "BenevolenceMessiah", + "scores": { + "IFEval": 0.3012, + "BBH": 0.4909, + "MATH Level 5": 0.0415, + "GPQA": 0.2626, + "MUSR": 0.408, + "MMLU-PRO": 0.268 + } + }, + { + "model_id": "BlackBeenie/Bloslain-8B-v0.2", + "name": "Bloslain-8B-v0.2", + "developer": "BlackBeenie", + "scores": { + "IFEval": 0.5023, + "BBH": 0.5111, + "MATH Level 5": 0.145, + "GPQA": 0.3062, + "MUSR": 0.4076, + "MMLU-PRO": 0.3654 + } + }, + { + "model_id": "BlackBeenie/Llama-3.1-8B-OpenO1-SFT-v0.1", + "name": "Llama-3.1-8B-OpenO1-SFT-v0.1", + "developer": "BlackBeenie", + "scores": { + "IFEval": 0.5124, + "BBH": 0.4787, + "MATH Level 5": 0.1526, + "GPQA": 0.2685, + "MUSR": 0.3618, + "MMLU-PRO": 0.3492 + } + }, + { + "model_id": "BlackBeenie/Llama-3.1-8B-pythonic-passthrough-merge", + "name": "Llama-3.1-8B-pythonic-passthrough-merge", + "developer": "BlackBeenie", + "scores": { + "IFEval": 0.2316, + "BBH": 0.3454, + "MATH Level 5": 0.0113, + "GPQA": 0.2685, + "MUSR": 0.3778, + "MMLU-PRO": 0.1332 + } + }, + { + "model_id": "BlackBeenie/Neos-Gemma-2-9b", + "name": "Neos-Gemma-2-9b", + "developer": "BlackBeenie", + "scores": { + "IFEval": 0.5876, + "BBH": 0.5503, + "MATH Level 5": 0.0982, + "GPQA": 0.323, + "MUSR": 0.3618, + "MMLU-PRO": 0.3981 + } + }, + { + "model_id": "BlackBeenie/Neos-Llama-3.1-8B", + "name": "Neos-Llama-3.1-8B", + "developer": "BlackBeenie", + "scores": { + "IFEval": 0.4944, + "BBH": 0.4425, + "MATH Level 5": 0.1322, + "GPQA": 0.2685, + "MUSR": 0.375, + "MMLU-PRO": 0.3262 + } + }, + { + "model_id": "BlackBeenie/Neos-Llama-3.1-base", + "name": "Neos-Llama-3.1-base", + "developer": "BlackBeenie", + "scores": { + "IFEval": 0.1751, + "BBH": 0.293, + "MATH Level 5": 0.0, + "GPQA": 0.2374, + "MUSR": 0.3499, + "MMLU-PRO": 0.1112 + } + }, + { + "model_id": "BlackBeenie/Neos-Phi-3-14B-v0.1", + "name": "Neos-Phi-3-14B-v0.1", + "developer": "BlackBeenie", + "scores": { + "IFEval": 0.4022, + "BBH": 0.6212, + "MATH Level 5": 0.1782, + "GPQA": 0.3054, + "MUSR": 0.4125, + "MMLU-PRO": 0.4564 + } + }, + { + "model_id": "BlackBeenie/llama-3-luminous-merged", + "name": "llama-3-luminous-merged", + "developer": "BlackBeenie", + "scores": { + "IFEval": 0.4323, + "BBH": 0.5154, + "MATH Level 5": 0.0869, + "GPQA": 0.2928, + "MUSR": 0.4149, + "MMLU-PRO": 0.3773 + } + }, + { + "model_id": "BlackBeenie/llama-3.1-8B-Galore-openassistant-guanaco", + "name": "llama-3.1-8B-Galore-openassistant-guanaco", + "developer": "BlackBeenie", + "scores": { + "IFEval": 0.2635, + "BBH": 0.5213, + "MATH Level 5": 0.0665, + "GPQA": 0.3003, + "MUSR": 0.4406, + "MMLU-PRO": 0.3206 + } + }, + { + "model_id": "Bllossom/llama-3.2-Korean-Bllossom-AICA-5B", + "name": "llama-3.2-Korean-Bllossom-AICA-5B", + "developer": "Bllossom", + "scores": { + "IFEval": 0.5172, + "BBH": 0.4293, + "MATH Level 5": 0.1239, + "GPQA": 0.2987, + "MUSR": 0.3834, + "MMLU-PRO": 0.271 + } + }, + { + "model_id": "BoltMonkey/DreadMix", + "name": "DreadMix", + "developer": "BoltMonkey", + "scores": { + "IFEval": 0.7095, + "BBH": 0.5435, + "MATH Level 5": 0.1556, + "GPQA": 0.2995, + "MUSR": 0.4212, + "MMLU-PRO": 0.379 + } + }, + { + "model_id": "BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated", + "name": "NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated", + "developer": "BoltMonkey", + "scores": { + "IFEval": 0.459, + "BBH": 0.5185, + "MATH Level 5": 0.0937, + "GPQA": 0.2743, + "MUSR": 0.4083, + "MMLU-PRO": 0.3631 + } + }, + { + "model_id": "BoltMonkey/SuperNeuralDreadDevil-8b", + "name": "SuperNeuralDreadDevil-8b", + "developer": "BoltMonkey", + "scores": { + "IFEval": 0.771, + "BBH": 0.5286, + "MATH Level 5": 0.0929, + "GPQA": 0.2919, + "MUSR": 0.3977, + "MMLU-PRO": 0.3679 + } + }, + { + "model_id": "BrainWave-ML/llama3.2-3B-maths-orpo", + "name": "llama3.2-3B-maths-orpo", + "developer": "BrainWave-ML", + "scores": { + "IFEval": 0.2049, + "BBH": 0.2912, + "MATH Level 5": 0.0, + "GPQA": 0.2592, + "MUSR": 0.3575, + "MMLU-PRO": 0.1168 + } + }, + { + "model_id": "BramVanroy/GEITje-7B-ultra", + "name": "GEITje-7B-ultra", + "developer": "BramVanroy", + "scores": { + "IFEval": 0.3723, + "BBH": 0.3776, + "MATH Level 5": 0.0159, + "GPQA": 0.2626, + "MUSR": 0.329, + "MMLU-PRO": 0.2011 + } + }, + { + "model_id": "BramVanroy/fietje-2", + "name": "fietje-2", + "developer": "BramVanroy", + "scores": { + "IFEval": 0.2098, + "BBH": 0.4036, + "MATH Level 5": 0.0159, + "GPQA": 0.2542, + "MUSR": 0.3696, + "MMLU-PRO": 0.1986 + } + }, + { + "model_id": "BramVanroy/fietje-2-chat", + "name": "fietje-2-chat", + "developer": "BramVanroy", + "scores": { + "IFEval": 0.2917, + "BBH": 0.415, + "MATH Level 5": 0.0189, + "GPQA": 0.2399, + "MUSR": 0.3528, + "MMLU-PRO": 0.2055 + } + }, + { + "model_id": "BramVanroy/fietje-2-instruct", + "name": "fietje-2-instruct", + "developer": "BramVanroy", + "scores": { + "IFEval": 0.279, + "BBH": 0.4136, + "MATH Level 5": 0.0227, + "GPQA": 0.2332, + "MUSR": 0.3369, + "MMLU-PRO": 0.2104 + } + }, + { + "model_id": "CYFRAGOVPL/Llama-PLLuM-8B-base", + "name": "Llama-PLLuM-8B-base", + "developer": "CYFRAGOVPL", + "scores": { + "IFEval": 0.2899, + "BBH": 0.432, + "MATH Level 5": 0.0363, + "GPQA": 0.2852, + "MUSR": 0.397, + "MMLU-PRO": 0.2757 + } + }, + { + "model_id": "CYFRAGOVPL/Llama-PLLuM-8B-chat", + "name": "Llama-PLLuM-8B-chat", + "developer": "CYFRAGOVPL", + "scores": { + "IFEval": 0.3515, + "BBH": 0.4077, + "MATH Level 5": 0.034, + "GPQA": 0.2643, + "MUSR": 0.4199, + "MMLU-PRO": 0.2719 + } + }, + { + "model_id": "CYFRAGOVPL/PLLuM-12B-base", + "name": "PLLuM-12B-base", + "developer": "CYFRAGOVPL", + "scores": { + "IFEval": 0.2821, + "BBH": 0.4391, + "MATH Level 5": 0.0287, + "GPQA": 0.2903, + "MUSR": 0.4142, + "MMLU-PRO": 0.274 + } + }, + { + "model_id": "CYFRAGOVPL/PLLuM-12B-chat", + "name": "PLLuM-12B-chat", + "developer": "CYFRAGOVPL", + "scores": { + "IFEval": 0.3214, + "BBH": 0.4446, + "MATH Level 5": 0.0181, + "GPQA": 0.2601, + "MUSR": 0.4115, + "MMLU-PRO": 0.2872 + } + }, + { + "model_id": "CYFRAGOVPL/PLLuM-12B-nc-base", + "name": "PLLuM-12B-nc-base", + "developer": "CYFRAGOVPL", + "scores": { + "IFEval": 0.2405, + "BBH": 0.4277, + "MATH Level 5": 0.0219, + "GPQA": 0.2701, + "MUSR": 0.3645, + "MMLU-PRO": 0.2559 + } + }, + { + "model_id": "CYFRAGOVPL/PLLuM-12B-nc-chat", + "name": "PLLuM-12B-nc-chat", + "developer": "CYFRAGOVPL", + "scores": { + "IFEval": 0.2834, + "BBH": 0.4576, + "MATH Level 5": 0.0121, + "GPQA": 0.2827, + "MUSR": 0.4354, + "MMLU-PRO": 0.2597 + } + }, + { + "model_id": "CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct", + "name": "Llama-3.2-Rabbit-Ko-3B-Instruct", + "developer": "CarrotAI", + "scores": { + "IFEval": 0.7199, + "BBH": 0.4427, + "MATH Level 5": 0.2054, + "GPQA": 0.271, + "MUSR": 0.3649, + "MMLU-PRO": 0.2822 + } + }, + { + "model_id": "CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct-2412", + "name": "Llama-3.2-Rabbit-Ko-3B-Instruct-2412", + "developer": "CarrotAI", + "scores": { + "IFEval": 0.4782, + "BBH": 0.4358, + "MATH Level 5": 0.176, + "GPQA": 0.2928, + "MUSR": 0.3872, + "MMLU-PRO": 0.3134 + } + }, + { + "model_id": "Casual-Autopsy/L3-Umbral-Mind-RP-v2.0-8B", + "name": "L3-Umbral-Mind-RP-v2.0-8B", + "developer": "Casual-Autopsy", + "scores": { + "IFEval": 0.7123, + "BBH": 0.5262, + "MATH Level 5": 0.1095, + "GPQA": 0.2869, + "MUSR": 0.3687, + "MMLU-PRO": 0.3723 + } + }, + { + "model_id": "CausalLM/14B", + "name": "14B", + "developer": "CausalLM", + "scores": { + "IFEval": 0.2788, + "BBH": 0.47, + "MATH Level 5": 0.0755, + "GPQA": 0.3029, + "MUSR": 0.4155, + "MMLU-PRO": 0.3221 + } + }, + { + "model_id": "CausalLM/34b-beta", + "name": "34b-beta", + "developer": "CausalLM", + "scores": { + "IFEval": 0.3043, + "BBH": 0.5591, + "MATH Level 5": 0.0483, + "GPQA": 0.3465, + "MUSR": 0.3749, + "MMLU-PRO": 0.5325 + } + }, + { + "model_id": "CausalLM/preview-1-hf", + "name": "preview-1-hf", + "developer": "CausalLM", + "scores": { + "IFEval": 0.5559, + "BBH": 0.3615, + "MATH Level 5": 0.0302, + "GPQA": 0.2617, + "MUSR": 0.3422, + "MMLU-PRO": 0.3597 + } + }, + { + "model_id": "Changgil/K2S3-14b-v0.2", + "name": "K2S3-14b-v0.2", + "developer": "Changgil", + "scores": { + "IFEval": 0.3243, + "BBH": 0.4613, + "MATH Level 5": 0.0574, + "GPQA": 0.281, + "MUSR": 0.3923, + "MMLU-PRO": 0.2644 + } + }, + { + "model_id": "Changgil/K2S3-v0.1", + "name": "K2S3-v0.1", + "developer": "Changgil", + "scores": { + "IFEval": 0.3277, + "BBH": 0.4655, + "MATH Level 5": 0.0461, + "GPQA": 0.2643, + "MUSR": 0.4014, + "MMLU-PRO": 0.2562 + } + }, + { + "model_id": "ClaudioItaly/Albacus", + "name": "Albacus", + "developer": "ClaudioItaly", + "scores": { + "IFEval": 0.4667, + "BBH": 0.5113, + "MATH Level 5": 0.071, + "GPQA": 0.2718, + "MUSR": 0.4135, + "MMLU-PRO": 0.3165 + } + }, + { + "model_id": "ClaudioItaly/Book-Gut12B", + "name": "Book-Gut12B", + "developer": "ClaudioItaly", + "scores": { + "IFEval": 0.3998, + "BBH": 0.5417, + "MATH Level 5": 0.102, + "GPQA": 0.307, + "MUSR": 0.4635, + "MMLU-PRO": 0.367 + } + }, + { + "model_id": "ClaudioItaly/Evolutionstory-7B-v2.2", + "name": "Evolutionstory-7B-v2.2", + "developer": "ClaudioItaly", + "scores": { + "IFEval": 0.4814, + "BBH": 0.5108, + "MATH Level 5": 0.071, + "GPQA": 0.2752, + "MUSR": 0.4135, + "MMLU-PRO": 0.3159 + } + }, + { + "model_id": "ClaudioItaly/intelligence-cod-rag-7b-v3", + "name": "intelligence-cod-rag-7b-v3", + "developer": "ClaudioItaly", + "scores": { + "IFEval": 0.6898, + "BBH": 0.5366, + "MATH Level 5": 0.3807, + "GPQA": 0.2727, + "MUSR": 0.4153, + "MMLU-PRO": 0.4195 + } + }, + { + "model_id": "CohereForAI/aya-23-35B", + "name": "aya-23-35B", + "developer": "CohereForAI", + "scores": { + "IFEval": 0.6462, + "BBH": 0.54, + "MATH Level 5": 0.0347, + "GPQA": 0.2945, + "MUSR": 0.431, + "MMLU-PRO": 0.3356 + } + }, + { + "model_id": "CohereForAI/aya-23-8B", + "name": "aya-23-8B", + "developer": "CohereForAI", + "scores": { + "IFEval": 0.4699, + "BBH": 0.4296, + "MATH Level 5": 0.0166, + "GPQA": 0.2844, + "MUSR": 0.3941, + "MMLU-PRO": 0.2278 + } + }, + { + "model_id": "CohereForAI/aya-expanse-32b", + "name": "aya-expanse-32b", + "developer": "CohereForAI", + "scores": { + "IFEval": 0.7302, + "BBH": 0.5649, + "MATH Level 5": 0.1533, + "GPQA": 0.3255, + "MUSR": 0.3873, + "MMLU-PRO": 0.413 + } + }, + { + "model_id": "CohereForAI/aya-expanse-8b", + "name": "aya-expanse-8b", + "developer": "CohereForAI", + "scores": { + "IFEval": 0.6359, + "BBH": 0.4977, + "MATH Level 5": 0.0861, + "GPQA": 0.3029, + "MUSR": 0.3729, + "MMLU-PRO": 0.3004 + } + }, + { + "model_id": "CohereForAI/c4ai-command-r-plus", + "name": "c4ai-command-r-plus", + "developer": "CohereForAI", + "scores": { + "IFEval": 0.7664, + "BBH": 0.5815, + "MATH Level 5": 0.0801, + "GPQA": 0.3054, + "MUSR": 0.4807, + "MMLU-PRO": 0.3992 + } + }, + { + "model_id": "CohereForAI/c4ai-command-r-plus-08-2024", + "name": "c4ai-command-r-plus-08-2024", + "developer": "CohereForAI", + "scores": { + "IFEval": 0.754, + "BBH": 0.5996, + "MATH Level 5": 0.1239, + "GPQA": 0.3507, + "MUSR": 0.4829, + "MMLU-PRO": 0.4421 + } + }, + { + "model_id": "CohereForAI/c4ai-command-r-v01", + "name": "c4ai-command-r-v01", + "developer": "CohereForAI", + "scores": { + "IFEval": 0.6748, + "BBH": 0.5406, + "MATH Level 5": 0.0347, + "GPQA": 0.307, + "MUSR": 0.4517, + "MMLU-PRO": 0.3369 + } + }, + { + "model_id": "CohereForAI/c4ai-command-r7b-12-2024", + "name": "c4ai-command-r7b-12-2024", + "developer": "CohereForAI", + "scores": { + "IFEval": 0.7713, + "BBH": 0.5503, + "MATH Level 5": 0.2991, + "GPQA": 0.3087, + "MUSR": 0.4125, + "MMLU-PRO": 0.3572 + } + }, + { + "model_id": "Columbia-NLP/LION-Gemma-2b-dpo-v1.0", + "name": "LION-Gemma-2b-dpo-v1.0", + "developer": "Columbia-NLP", + "scores": { + "IFEval": 0.3278, + "BBH": 0.392, + "MATH Level 5": 0.0431, + "GPQA": 0.2492, + "MUSR": 0.412, + "MMLU-PRO": 0.1666 + } + }, + { + "model_id": "Columbia-NLP/LION-Gemma-2b-odpo-v1.0", + "name": "LION-Gemma-2b-odpo-v1.0", + "developer": "Columbia-NLP", + "scores": { + "IFEval": 0.3066, + "BBH": 0.3896, + "MATH Level 5": 0.0695, + "GPQA": 0.2424, + "MUSR": 0.4279, + "MMLU-PRO": 0.1692 + } + }, + { + "model_id": "Columbia-NLP/LION-Gemma-2b-sft-v1.0", + "name": "LION-Gemma-2b-sft-v1.0", + "developer": "Columbia-NLP", + "scores": { + "IFEval": 0.3692, + "BBH": 0.3879, + "MATH Level 5": 0.068, + "GPQA": 0.2559, + "MUSR": 0.4027, + "MMLU-PRO": 0.1782 + } + }, + { + "model_id": "Columbia-NLP/LION-LLaMA-3-8b-dpo-v1.0", + "name": "LION-LLaMA-3-8b-dpo-v1.0", + "developer": "Columbia-NLP", + "scores": { + "IFEval": 0.4957, + "BBH": 0.5028, + "MATH Level 5": 0.1171, + "GPQA": 0.281, + "MUSR": 0.4097, + "MMLU-PRO": 0.3219 + } + }, + { + "model_id": "Columbia-NLP/LION-LLaMA-3-8b-odpo-v1.0", + "name": "LION-LLaMA-3-8b-odpo-v1.0", + "developer": "Columbia-NLP", + "scores": { + "IFEval": 0.3968, + "BBH": 0.5024, + "MATH Level 5": 0.1065, + "GPQA": 0.2852, + "MUSR": 0.4057, + "MMLU-PRO": 0.3152 + } + }, + { + "model_id": "Columbia-NLP/LION-LLaMA-3-8b-sft-v1.0", + "name": "LION-LLaMA-3-8b-sft-v1.0", + "developer": "Columbia-NLP", + "scores": { + "IFEval": 0.3817, + "BBH": 0.5088, + "MATH Level 5": 0.114, + "GPQA": 0.2777, + "MUSR": 0.4503, + "MMLU-PRO": 0.3237 + } + }, + { + "model_id": "CombinHorizon/Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES", + "name": "Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES", + "developer": "CombinHorizon", + "scores": { + "IFEval": 0.824, + "BBH": 0.637, + "MATH Level 5": 0.5317, + "GPQA": 0.3247, + "MUSR": 0.426, + "MMLU-PRO": 0.4979 + } + }, + { + "model_id": "CombinHorizon/Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES", + "name": "Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES", + "developer": "CombinHorizon", + "scores": { + "IFEval": 0.7564, + "BBH": 0.5402, + "MATH Level 5": 0.4932, + "GPQA": 0.2978, + "MUSR": 0.4033, + "MMLU-PRO": 0.4342 + } + }, + { + "model_id": "CombinHorizon/YiSM-blossom5.1-34B-SLERP", + "name": "YiSM-blossom5.1-34B-SLERP", + "developer": "CombinHorizon", + "scores": { + "IFEval": 0.5033, + "BBH": 0.6208, + "MATH Level 5": 0.2153, + "GPQA": 0.3557, + "MUSR": 0.4413, + "MMLU-PRO": 0.4741 + } + }, + { + "model_id": "CombinHorizon/huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES", + "name": "huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES", + "developer": "CombinHorizon", + "scores": { + "IFEval": 0.8206, + "BBH": 0.6929, + "MATH Level 5": 0.5944, + "GPQA": 0.3389, + "MUSR": 0.4207, + "MMLU-PRO": 0.5721 + } + }, + { + "model_id": "CombinHorizon/huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES", + "name": "huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES", + "developer": "CombinHorizon", + "scores": { + "IFEval": 0.8176, + "BBH": 0.6336, + "MATH Level 5": 0.5476, + "GPQA": 0.3146, + "MUSR": 0.426, + "MMLU-PRO": 0.491 + } + }, + { + "model_id": "CombinHorizon/zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES", + "name": "zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES", + "developer": "CombinHorizon", + "scores": { + "IFEval": 0.8328, + "BBH": 0.6955, + "MATH Level 5": 0.5853, + "GPQA": 0.3674, + "MUSR": 0.4314, + "MMLU-PRO": 0.5685 + } + }, + { + "model_id": "ContactDoctor/Bio-Medical-3B-CoT-012025", + "name": "Bio-Medical-3B-CoT-012025", + "developer": "ContactDoctor", + "scores": { + "IFEval": 0.3604, + "BBH": 0.4383, + "MATH Level 5": 0.2213, + "GPQA": 0.3045, + "MUSR": 0.3368, + "MMLU-PRO": 0.2934 + } + }, + { + "model_id": "ContactDoctor/Bio-Medical-Llama-3-8B", + "name": "Bio-Medical-Llama-3-8B", + "developer": "ContactDoctor", + "scores": { + "IFEval": 0.4422, + "BBH": 0.4863, + "MATH Level 5": 0.0672, + "GPQA": 0.3339, + "MUSR": 0.3514, + "MMLU-PRO": 0.3648 + } + }, + { + "model_id": "CoolSpring/Qwen2-0.5B-Abyme", + "name": "Qwen2-0.5B-Abyme", + "developer": "CoolSpring", + "scores": { + "IFEval": 0.1915, + "BBH": 0.2862, + "MATH Level 5": 0.0295, + "GPQA": 0.2534, + "MUSR": 0.3542, + "MMLU-PRO": 0.1333 + } + }, + { + "model_id": "CoolSpring/Qwen2-0.5B-Abyme-merge2", + "name": "Qwen2-0.5B-Abyme-merge2", + "developer": "CoolSpring", + "scores": { + "IFEval": 0.2022, + "BBH": 0.2994, + "MATH Level 5": 0.0332, + "GPQA": 0.2601, + "MUSR": 0.3687, + "MMLU-PRO": 0.1489 + } + }, + { + "model_id": "CoolSpring/Qwen2-0.5B-Abyme-merge3", + "name": "Qwen2-0.5B-Abyme-merge3", + "developer": "CoolSpring", + "scores": { + "IFEval": 0.2386, + "BBH": 0.3003, + "MATH Level 5": 0.0317, + "GPQA": 0.2643, + "MUSR": 0.3501, + "MMLU-PRO": 0.15 + } + }, + { + "model_id": "Corianas/Neural-Mistral-7B", + "name": "Neural-Mistral-7B", + "developer": "Corianas", + "scores": { + "IFEval": 0.5489, + "BBH": 0.4428, + "MATH Level 5": 0.0189, + "GPQA": 0.2836, + "MUSR": 0.3873, + "MMLU-PRO": 0.2738 + } + }, + { + "model_id": "Corianas/Quokka_2.7b", + "name": "Quokka_2.7b", + "developer": "Corianas", + "scores": { + "IFEval": 0.1749, + "BBH": 0.3055, + "MATH Level 5": 0.0083, + "GPQA": 0.2559, + "MUSR": 0.3908, + "MMLU-PRO": 0.1145 + } + }, + { + "model_id": "Corianas/llama-3-reactor", + "name": "llama-3-reactor", + "developer": "Corianas", + "scores": { + "IFEval": 0.23, + "BBH": 0.4457, + "MATH Level 5": 0.0468, + "GPQA": 0.2978, + "MUSR": 0.3977, + "MMLU-PRO": 0.2801 + } + }, + { + "model_id": "CortexLM/btlm-7b-base-v0.2", + "name": "btlm-7b-base-v0.2", + "developer": "CortexLM", + "scores": { + "IFEval": 0.1483, + "BBH": 0.4006, + "MATH Level 5": 0.0151, + "GPQA": 0.2534, + "MUSR": 0.3846, + "MMLU-PRO": 0.235 + } + }, + { + "model_id": "Cran-May/SCE-2-24B", + "name": "SCE-2-24B", + "developer": "Cran-May", + "scores": { + "IFEval": 0.5866, + "BBH": 0.6265, + "MATH Level 5": 0.1896, + "GPQA": 0.3372, + "MUSR": 0.4528, + "MMLU-PRO": 0.4612 + } + }, + { + "model_id": "Cran-May/SCE-3-24B", + "name": "SCE-3-24B", + "developer": "Cran-May", + "scores": { + "IFEval": 0.5465, + "BBH": 0.5973, + "MATH Level 5": 0.1881, + "GPQA": 0.3465, + "MUSR": 0.4435, + "MMLU-PRO": 0.4647 + } + }, + { + "model_id": "Cran-May/T.E-8.1", + "name": "T.E-8.1", + "developer": "Cran-May", + "scores": { + "IFEval": 0.7077, + "BBH": 0.5582, + "MATH Level 5": 0.4456, + "GPQA": 0.3129, + "MUSR": 0.4505, + "MMLU-PRO": 0.4432 + } + }, + { + "model_id": "Cran-May/merge_model_20250308_2", + "name": "merge_model_20250308_2", + "developer": "Cran-May", + "scores": { + "IFEval": 0.5932, + "BBH": 0.6585, + "MATH Level 5": 0.4381, + "GPQA": 0.3909, + "MUSR": 0.4794, + "MMLU-PRO": 0.542 + } + }, + { + "model_id": "Cran-May/merge_model_20250308_3", + "name": "merge_model_20250308_3", + "developer": "Cran-May", + "scores": { + "IFEval": 0.6018, + "BBH": 0.6271, + "MATH Level 5": 0.2545, + "GPQA": 0.3221, + "MUSR": 0.432, + "MMLU-PRO": 0.4962 + } + }, + { + "model_id": "Cran-May/merge_model_20250308_4", + "name": "merge_model_20250308_4", + "developer": "Cran-May", + "scores": { + "IFEval": 0.454, + "BBH": 0.6664, + "MATH Level 5": 0.4199, + "GPQA": 0.3977, + "MUSR": 0.4688, + "MMLU-PRO": 0.5367 + } + }, + { + "model_id": "Cran-May/tempmotacilla-cinerea-0308", + "name": "tempmotacilla-cinerea-0308", + "developer": "Cran-May", + "scores": { + "IFEval": 0.8085, + "BBH": 0.6551, + "MATH Level 5": 0.5551, + "GPQA": 0.3624, + "MUSR": 0.4208, + "MMLU-PRO": 0.525 + } + }, + { + "model_id": "CreitinGameplays/Llama-3.1-8B-R1-v0.1", + "name": "Llama-3.1-8B-R1-v0.1", + "developer": "CreitinGameplays", + "scores": { + "IFEval": 0.3235, + "BBH": 0.3057, + "MATH Level 5": 0.1813, + "GPQA": 0.2584, + "MUSR": 0.3622, + "MMLU-PRO": 0.1252 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Broca", + "name": "Qwen2.5-14B-Broca", + "developer": "CultriX", + "scores": { + "IFEval": 0.5604, + "BBH": 0.6527, + "MATH Level 5": 0.358, + "GPQA": 0.3867, + "MUSR": 0.4767, + "MMLU-PRO": 0.5364 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-BrocaV9", + "name": "Qwen2.5-14B-BrocaV9", + "developer": "CultriX", + "scores": { + "IFEval": 0.6763, + "BBH": 0.6391, + "MATH Level 5": 0.3814, + "GPQA": 0.3641, + "MUSR": 0.469, + "MMLU-PRO": 0.5331 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Brocav3", + "name": "Qwen2.5-14B-Brocav3", + "developer": "CultriX", + "scores": { + "IFEval": 0.6952, + "BBH": 0.6452, + "MATH Level 5": 0.3875, + "GPQA": 0.3591, + "MUSR": 0.4756, + "MMLU-PRO": 0.5317 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Brocav6", + "name": "Qwen2.5-14B-Brocav6", + "developer": "CultriX", + "scores": { + "IFEval": 0.6995, + "BBH": 0.6389, + "MATH Level 5": 0.3875, + "GPQA": 0.3674, + "MUSR": 0.4742, + "MMLU-PRO": 0.5319 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Brocav7", + "name": "Qwen2.5-14B-Brocav7", + "developer": "CultriX", + "scores": { + "IFEval": 0.6724, + "BBH": 0.6444, + "MATH Level 5": 0.3844, + "GPQA": 0.3674, + "MUSR": 0.4796, + "MMLU-PRO": 0.5258 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Emerged", + "name": "Qwen2.5-14B-Emerged", + "developer": "CultriX", + "scores": { + "IFEval": 0.7, + "BBH": 0.626, + "MATH Level 5": 0.3248, + "GPQA": 0.3574, + "MUSR": 0.4691, + "MMLU-PRO": 0.5186 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Emergedv3", + "name": "Qwen2.5-14B-Emergedv3", + "developer": "CultriX", + "scores": { + "IFEval": 0.6388, + "BBH": 0.6191, + "MATH Level 5": 0.4358, + "GPQA": 0.3607, + "MUSR": 0.4728, + "MMLU-PRO": 0.5174 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-FinalMerge", + "name": "Qwen2.5-14B-FinalMerge", + "developer": "CultriX", + "scores": { + "IFEval": 0.4891, + "BBH": 0.5715, + "MATH Level 5": 0.3814, + "GPQA": 0.3549, + "MUSR": 0.4379, + "MMLU-PRO": 0.4574 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Hyper", + "name": "Qwen2.5-14B-Hyper", + "developer": "CultriX", + "scores": { + "IFEval": 0.5391, + "BBH": 0.6507, + "MATH Level 5": 0.3437, + "GPQA": 0.3918, + "MUSR": 0.4898, + "MMLU-PRO": 0.5374 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-HyperMarck-dl", + "name": "Qwen2.5-14B-HyperMarck-dl", + "developer": "CultriX", + "scores": { + "IFEval": 0.665, + "BBH": 0.6096, + "MATH Level 5": 0.5287, + "GPQA": 0.3674, + "MUSR": 0.4416, + "MMLU-PRO": 0.5091 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Hyperionv3", + "name": "Qwen2.5-14B-Hyperionv3", + "developer": "CultriX", + "scores": { + "IFEval": 0.6836, + "BBH": 0.6522, + "MATH Level 5": 0.3701, + "GPQA": 0.3708, + "MUSR": 0.473, + "MMLU-PRO": 0.534 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Hyperionv4", + "name": "Qwen2.5-14B-Hyperionv4", + "developer": "CultriX", + "scores": { + "IFEval": 0.5416, + "BBH": 0.6472, + "MATH Level 5": 0.3474, + "GPQA": 0.3977, + "MUSR": 0.4832, + "MMLU-PRO": 0.5364 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Hyperionv5", + "name": "Qwen2.5-14B-Hyperionv5", + "developer": "CultriX", + "scores": { + "IFEval": 0.6729, + "BBH": 0.6443, + "MATH Level 5": 0.3822, + "GPQA": 0.3716, + "MUSR": 0.4795, + "MMLU-PRO": 0.5302 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-MegaMerge-pt2", + "name": "Qwen2.5-14B-MegaMerge-pt2", + "developer": "CultriX", + "scores": { + "IFEval": 0.5683, + "BBH": 0.6578, + "MATH Level 5": 0.3995, + "GPQA": 0.3792, + "MUSR": 0.4729, + "MMLU-PRO": 0.5421 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-MergeStock", + "name": "Qwen2.5-14B-MergeStock", + "developer": "CultriX", + "scores": { + "IFEval": 0.5685, + "BBH": 0.6579, + "MATH Level 5": 0.4147, + "GPQA": 0.3733, + "MUSR": 0.4676, + "MMLU-PRO": 0.5396 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-ReasoningMerge", + "name": "Qwen2.5-14B-ReasoningMerge", + "developer": "CultriX", + "scores": { + "IFEval": 0.4605, + "BBH": 0.6578, + "MATH Level 5": 0.5204, + "GPQA": 0.4077, + "MUSR": 0.5166, + "MMLU-PRO": 0.5345 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Ultimav2", + "name": "Qwen2.5-14B-Ultimav2", + "developer": "CultriX", + "scores": { + "IFEval": 0.55, + "BBH": 0.6555, + "MATH Level 5": 0.3844, + "GPQA": 0.3851, + "MUSR": 0.4966, + "MMLU-PRO": 0.5417 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Unity", + "name": "Qwen2.5-14B-Unity", + "developer": "CultriX", + "scores": { + "IFEval": 0.6739, + "BBH": 0.602, + "MATH Level 5": 0.4313, + "GPQA": 0.3473, + "MUSR": 0.4679, + "MMLU-PRO": 0.5076 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Wernicke", + "name": "Qwen2.5-14B-Wernicke", + "developer": "CultriX", + "scores": { + "IFEval": 0.5235, + "BBH": 0.6568, + "MATH Level 5": 0.3814, + "GPQA": 0.3935, + "MUSR": 0.4689, + "MMLU-PRO": 0.5424 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Wernicke-SFT", + "name": "Qwen2.5-14B-Wernicke-SFT", + "developer": "CultriX", + "scores": { + "IFEval": 0.4937, + "BBH": 0.6461, + "MATH Level 5": 0.3595, + "GPQA": 0.354, + "MUSR": 0.39, + "MMLU-PRO": 0.507 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Wernicke-SLERP", + "name": "Qwen2.5-14B-Wernicke-SLERP", + "developer": "CultriX", + "scores": { + "IFEval": 0.5589, + "BBH": 0.6441, + "MATH Level 5": 0.4486, + "GPQA": 0.344, + "MUSR": 0.414, + "MMLU-PRO": 0.5094 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-Wernickev3", + "name": "Qwen2.5-14B-Wernickev3", + "developer": "CultriX", + "scores": { + "IFEval": 0.7048, + "BBH": 0.6184, + "MATH Level 5": 0.3542, + "GPQA": 0.3624, + "MUSR": 0.4717, + "MMLU-PRO": 0.5151 + } + }, + { + "model_id": "CultriX/Qwen2.5-14B-partialmergept1", + "name": "Qwen2.5-14B-partialmergept1", + "developer": "CultriX", + "scores": { + "IFEval": 0.6337, + "BBH": 0.6151, + "MATH Level 5": 0.4539, + "GPQA": 0.3616, + "MUSR": 0.4757, + "MMLU-PRO": 0.5208 + } + }, + { + "model_id": "CultriX/Qwenfinity-2.5-14B", + "name": "Qwenfinity-2.5-14B", + "developer": "CultriX", + "scores": { + "IFEval": 0.4814, + "BBH": 0.5655, + "MATH Level 5": 0.4101, + "GPQA": 0.349, + "MUSR": 0.4506, + "MMLU-PRO": 0.4498 + } + }, + { + "model_id": "CultriX/Qwestion-14B", + "name": "Qwestion-14B", + "developer": "CultriX", + "scores": { + "IFEval": 0.6318, + "BBH": 0.645, + "MATH Level 5": 0.3724, + "GPQA": 0.3683, + "MUSR": 0.4636, + "MMLU-PRO": 0.5422 + } + }, + { + "model_id": "CultriX/SeQwence-14B", + "name": "SeQwence-14B", + "developer": "CultriX", + "scores": { + "IFEval": 0.5352, + "BBH": 0.6506, + "MATH Level 5": 0.3535, + "GPQA": 0.3607, + "MUSR": 0.4666, + "MMLU-PRO": 0.5419 + } + }, + { + "model_id": "CultriX/SeQwence-14B-EvolMerge", + "name": "SeQwence-14B-EvolMerge", + "developer": "CultriX", + "scores": { + "IFEval": 0.5382, + "BBH": 0.6572, + "MATH Level 5": 0.3671, + "GPQA": 0.3809, + "MUSR": 0.4821, + "MMLU-PRO": 0.5419 + } + }, + { + "model_id": "CultriX/SeQwence-14B-EvolMergev1", + "name": "SeQwence-14B-EvolMergev1", + "developer": "CultriX", + "scores": { + "IFEval": 0.5555, + "BBH": 0.6546, + "MATH Level 5": 0.4215, + "GPQA": 0.3767, + "MUSR": 0.4623, + "MMLU-PRO": 0.5393 + } + }, + { + "model_id": "CultriX/SeQwence-14B-v5", + "name": "SeQwence-14B-v5", + "developer": "CultriX", + "scores": { + "IFEval": 0.592, + "BBH": 0.6517, + "MATH Level 5": 0.3308, + "GPQA": 0.37, + "MUSR": 0.4714, + "MMLU-PRO": 0.5415 + } + }, + { + "model_id": "CultriX/SeQwence-14Bv1", + "name": "SeQwence-14Bv1", + "developer": "CultriX", + "scores": { + "IFEval": 0.6678, + "BBH": 0.6345, + "MATH Level 5": 0.361, + "GPQA": 0.3616, + "MUSR": 0.4704, + "MMLU-PRO": 0.532 + } + }, + { + "model_id": "CultriX/SeQwence-14Bv2", + "name": "SeQwence-14Bv2", + "developer": "CultriX", + "scores": { + "IFEval": 0.5786, + "BBH": 0.6305, + "MATH Level 5": 0.4758, + "GPQA": 0.3607, + "MUSR": 0.4601, + "MMLU-PRO": 0.5334 + } + }, + { + "model_id": "CultriX/SeQwence-14Bv3", + "name": "SeQwence-14Bv3", + "developer": "CultriX", + "scores": { + "IFEval": 0.5719, + "BBH": 0.6302, + "MATH Level 5": 0.4766, + "GPQA": 0.3649, + "MUSR": 0.4624, + "MMLU-PRO": 0.5335 + } + }, + { + "model_id": "DRXD1000/Atlas-7B", + "name": "Atlas-7B", + "developer": "DRXD1000", + "scores": { + "IFEval": 0.3704, + "BBH": 0.3302, + "MATH Level 5": 0.0189, + "GPQA": 0.2576, + "MUSR": 0.3342, + "MMLU-PRO": 0.1401 + } + }, + { + "model_id": "DRXD1000/Phoenix-7B", + "name": "Phoenix-7B", + "developer": "DRXD1000", + "scores": { + "IFEval": 0.321, + "BBH": 0.3932, + "MATH Level 5": 0.0166, + "GPQA": 0.2785, + "MUSR": 0.3849, + "MMLU-PRO": 0.2343 + } + }, + { + "model_id": "DUAL-GPO/zephyr-7b-ipo-0k-15k-i1", + "name": "zephyr-7b-ipo-0k-15k-i1", + "developer": "DUAL-GPO", + "scores": { + "IFEval": 0.2756, + "BBH": 0.4473, + "MATH Level 5": 0.0302, + "GPQA": 0.2911, + "MUSR": 0.4173, + "MMLU-PRO": 0.313 + } + }, + { + "model_id": "DZgas/GIGABATEMAN-7B", + "name": "GIGABATEMAN-7B", + "developer": "DZgas", + "scores": { + "IFEval": 0.4607, + "BBH": 0.5032, + "MATH Level 5": 0.0551, + "GPQA": 0.2894, + "MUSR": 0.4328, + "MMLU-PRO": 0.3177 + } + }, + { + "model_id": "Daemontatox/AetherDrake-SFT", + "name": "AetherDrake-SFT", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.4813, + "BBH": 0.4872, + "MATH Level 5": 0.1511, + "GPQA": 0.3205, + "MUSR": 0.4088, + "MMLU-PRO": 0.3499 + } + }, + { + "model_id": "Daemontatox/AetherSett", + "name": "AetherSett", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.537, + "BBH": 0.5452, + "MATH Level 5": 0.3973, + "GPQA": 0.3079, + "MUSR": 0.4603, + "MMLU-PRO": 0.4279 + } + }, + { + "model_id": "Daemontatox/AetherTOT", + "name": "AetherTOT", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.4398, + "BBH": 0.5066, + "MATH Level 5": 0.1488, + "GPQA": 0.3238, + "MUSR": 0.4079, + "MMLU-PRO": 0.3804 + } + }, + { + "model_id": "Daemontatox/AetherUncensored", + "name": "AetherUncensored", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.4042, + "BBH": 0.4463, + "MATH Level 5": 0.145, + "GPQA": 0.2886, + "MUSR": 0.3747, + "MMLU-PRO": 0.271 + } + }, + { + "model_id": "Daemontatox/Cogito-MIS", + "name": "Cogito-MIS", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.1815, + "BBH": 0.506, + "MATH Level 5": 0.0861, + "GPQA": 0.2567, + "MUSR": 0.3768, + "MMLU-PRO": 0.1435 + } + }, + { + "model_id": "Daemontatox/CogitoDistil", + "name": "CogitoDistil", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.2776, + "BBH": 0.3677, + "MATH Level 5": 0.3927, + "GPQA": 0.2592, + "MUSR": 0.3755, + "MMLU-PRO": 0.2625 + } + }, + { + "model_id": "Daemontatox/CogitoZ", + "name": "CogitoZ", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.3967, + "BBH": 0.6734, + "MATH Level 5": 0.5242, + "GPQA": 0.3951, + "MUSR": 0.4793, + "MMLU-PRO": 0.5593 + } + }, + { + "model_id": "Daemontatox/CogitoZ14", + "name": "CogitoZ14", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.6637, + "BBH": 0.6298, + "MATH Level 5": 0.4222, + "GPQA": 0.3163, + "MUSR": 0.4059, + "MMLU-PRO": 0.3999 + } + }, + { + "model_id": "Daemontatox/DocumentCogito", + "name": "DocumentCogito", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.5064, + "BBH": 0.5112, + "MATH Level 5": 0.1631, + "GPQA": 0.3163, + "MUSR": 0.3973, + "MMLU-PRO": 0.3802 + } + }, + { + "model_id": "Daemontatox/Llama3.3-70B-CogniLink", + "name": "Llama3.3-70B-CogniLink", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.6931, + "BBH": 0.6668, + "MATH Level 5": 0.4139, + "GPQA": 0.4455, + "MUSR": 0.4877, + "MMLU-PRO": 0.5173 + } + }, + { + "model_id": "Daemontatox/Llama_cot", + "name": "Llama_cot", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.7549, + "BBH": 0.4838, + "MATH Level 5": 0.2024, + "GPQA": 0.2911, + "MUSR": 0.3872, + "MMLU-PRO": 0.3518 + } + }, + { + "model_id": "Daemontatox/MawaredT1", + "name": "MawaredT1", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.4199, + "BBH": 0.5215, + "MATH Level 5": 0.3021, + "GPQA": 0.3347, + "MUSR": 0.4702, + "MMLU-PRO": 0.4718 + } + }, + { + "model_id": "Daemontatox/Mini_QwQ", + "name": "Mini_QwQ", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.4497, + "BBH": 0.5549, + "MATH Level 5": 0.4192, + "GPQA": 0.3037, + "MUSR": 0.4682, + "MMLU-PRO": 0.4373 + } + }, + { + "model_id": "Daemontatox/NemoR", + "name": "NemoR", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.2287, + "BBH": 0.5194, + "MATH Level 5": 0.0831, + "GPQA": 0.3272, + "MUSR": 0.3908, + "MMLU-PRO": 0.329 + } + }, + { + "model_id": "Daemontatox/PathFinderAI2.0", + "name": "PathFinderAI2.0", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.4541, + "BBH": 0.6658, + "MATH Level 5": 0.5076, + "GPQA": 0.302, + "MUSR": 0.4216, + "MMLU-PRO": 0.5547 + } + }, + { + "model_id": "Daemontatox/PathFinderAi3.0", + "name": "PathFinderAi3.0", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.4271, + "BBH": 0.6884, + "MATH Level 5": 0.5045, + "GPQA": 0.4086, + "MUSR": 0.4807, + "MMLU-PRO": 0.5757 + } + }, + { + "model_id": "Daemontatox/PathfinderAI", + "name": "PathfinderAI", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.4855, + "BBH": 0.6627, + "MATH Level 5": 0.4841, + "GPQA": 0.3096, + "MUSR": 0.4256, + "MMLU-PRO": 0.5542 + } + }, + { + "model_id": "Daemontatox/Phi-4-COT", + "name": "Phi-4-COT", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.1793, + "BBH": 0.6173, + "MATH Level 5": 0.2243, + "GPQA": 0.3356, + "MUSR": 0.453, + "MMLU-PRO": 0.5005 + } + }, + { + "model_id": "Daemontatox/PixelParse_AI", + "name": "PixelParse_AI", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.4383, + "BBH": 0.5034, + "MATH Level 5": 0.1473, + "GPQA": 0.3238, + "MUSR": 0.4052, + "MMLU-PRO": 0.3778 + } + }, + { + "model_id": "Daemontatox/RA2.0", + "name": "RA2.0", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.3784, + "BBH": 0.4889, + "MATH Level 5": 0.3837, + "GPQA": 0.3054, + "MUSR": 0.4091, + "MMLU-PRO": 0.2616 + } + }, + { + "model_id": "Daemontatox/RA_Reasoner", + "name": "RA_Reasoner", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.5592, + "BBH": 0.6054, + "MATH Level 5": 0.2122, + "GPQA": 0.3314, + "MUSR": 0.3964, + "MMLU-PRO": 0.43 + } + }, + { + "model_id": "Daemontatox/RA_Reasoner2.0", + "name": "RA_Reasoner2.0", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.5366, + "BBH": 0.6062, + "MATH Level 5": 0.2311, + "GPQA": 0.3247, + "MUSR": 0.3884, + "MMLU-PRO": 0.4353 + } + }, + { + "model_id": "Daemontatox/ReasonTest", + "name": "ReasonTest", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.408, + "BBH": 0.5435, + "MATH Level 5": 0.2137, + "GPQA": 0.3188, + "MUSR": 0.4315, + "MMLU-PRO": 0.4272 + } + }, + { + "model_id": "Daemontatox/Research_PathfinderAI", + "name": "Research_PathfinderAI", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.3457, + "BBH": 0.2872, + "MATH Level 5": 0.1699, + "GPQA": 0.2408, + "MUSR": 0.3394, + "MMLU-PRO": 0.113 + } + }, + { + "model_id": "Daemontatox/SphinX", + "name": "SphinX", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.5725, + "BBH": 0.5441, + "MATH Level 5": 0.3082, + "GPQA": 0.2978, + "MUSR": 0.4405, + "MMLU-PRO": 0.4366 + } + }, + { + "model_id": "Daemontatox/Sphinx2.0", + "name": "Sphinx2.0", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.7123, + "BBH": 0.6473, + "MATH Level 5": 0.4018, + "GPQA": 0.2936, + "MUSR": 0.426, + "MMLU-PRO": 0.5184 + } + }, + { + "model_id": "Daemontatox/TinySphinx", + "name": "TinySphinx", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.2567, + "BBH": 0.331, + "MATH Level 5": 0.0431, + "GPQA": 0.2735, + "MUSR": 0.3328, + "MMLU-PRO": 0.1698 + } + }, + { + "model_id": "Daemontatox/TinySphinx2.0", + "name": "TinySphinx2.0", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.2535, + "BBH": 0.3168, + "MATH Level 5": 0.0325, + "GPQA": 0.2685, + "MUSR": 0.3382, + "MMLU-PRO": 0.1731 + } + }, + { + "model_id": "Daemontatox/Zirel-7B-Math", + "name": "Zirel-7B-Math", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.6639, + "BBH": 0.5448, + "MATH Level 5": 0.1979, + "GPQA": 0.3263, + "MUSR": 0.4789, + "MMLU-PRO": 0.4237 + } + }, + { + "model_id": "Daemontatox/Zirel_1.5", + "name": "Zirel_1.5", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.4168, + "BBH": 0.3985, + "MATH Level 5": 0.1133, + "GPQA": 0.2601, + "MUSR": 0.3658, + "MMLU-PRO": 0.2143 + } + }, + { + "model_id": "Daemontatox/mini-Cogito-R1", + "name": "mini-Cogito-R1", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.2298, + "BBH": 0.328, + "MATH Level 5": 0.2749, + "GPQA": 0.2869, + "MUSR": 0.3447, + "MMLU-PRO": 0.1482 + } + }, + { + "model_id": "Daemontatox/mini_Pathfinder", + "name": "mini_Pathfinder", + "developer": "Daemontatox", + "scores": { + "IFEval": 0.2962, + "BBH": 0.3956, + "MATH Level 5": 0.4751, + "GPQA": 0.2584, + "MUSR": 0.3781, + "MMLU-PRO": 0.2809 + } + }, + { + "model_id": "Dampfinchen/Llama-3.1-8B-Ultra-Instruct", + "name": "Llama-3.1-8B-Ultra-Instruct", + "developer": "Dampfinchen", + "scores": { + "IFEval": 0.8081, + "BBH": 0.5258, + "MATH Level 5": 0.2205, + "GPQA": 0.2919, + "MUSR": 0.4003, + "MMLU-PRO": 0.3826 + } + }, + { + "model_id": "Danielbrdz/Barcenas-10b", + "name": "Barcenas-10b", + "developer": "Danielbrdz", + "scores": { + "IFEval": 0.6608, + "BBH": 0.6121, + "MATH Level 5": 0.2153, + "GPQA": 0.3414, + "MUSR": 0.4135, + "MMLU-PRO": 0.4361 + } + }, + { + "model_id": "Danielbrdz/Barcenas-14b-Phi-3-medium-ORPO", + "name": "Barcenas-14b-Phi-3-medium-ORPO", + "developer": "Danielbrdz", + "scores": { + "IFEval": 0.4799, + "BBH": 0.6536, + "MATH Level 5": 0.2024, + "GPQA": 0.3263, + "MUSR": 0.4808, + "MMLU-PRO": 0.4723 + } + }, + { + "model_id": "Danielbrdz/Barcenas-14b-phi-4", + "name": "Barcenas-14b-phi-4", + "developer": "Danielbrdz", + "scores": { + "IFEval": 0.0498, + "BBH": 0.6769, + "MATH Level 5": 0.2583, + "GPQA": 0.3834, + "MUSR": 0.5097, + "MMLU-PRO": 0.5175 + } + }, + { + "model_id": "Danielbrdz/Barcenas-14b-phi-4-v2", + "name": "Barcenas-14b-phi-4-v2", + "developer": "Danielbrdz", + "scores": { + "IFEval": 0.2775, + "BBH": 0.6573, + "MATH Level 5": 0.3218, + "GPQA": 0.3784, + "MUSR": 0.4399, + "MMLU-PRO": 0.5244 + } + }, + { + "model_id": "Danielbrdz/Barcenas-3b-GRPO", + "name": "Barcenas-3b-GRPO", + "developer": "Danielbrdz", + "scores": { + "IFEval": 0.5444, + "BBH": 0.4414, + "MATH Level 5": 0.1375, + "GPQA": 0.2903, + "MUSR": 0.3576, + "MMLU-PRO": 0.3037 + } + }, + { + "model_id": "Danielbrdz/Barcenas-Llama3-8b-ORPO", + "name": "Barcenas-Llama3-8b-ORPO", + "developer": "Danielbrdz", + "scores": { + "IFEval": 0.7372, + "BBH": 0.4987, + "MATH Level 5": 0.0657, + "GPQA": 0.307, + "MUSR": 0.419, + "MMLU-PRO": 0.383 + } + }, + { + "model_id": "Danielbrdz/Barcenas-R1-Qwen-1.5b", + "name": "Barcenas-R1-Qwen-1.5b", + "developer": "Danielbrdz", + "scores": { + "IFEval": 0.2428, + "BBH": 0.3587, + "MATH Level 5": 0.3497, + "GPQA": 0.3037, + "MUSR": 0.3541, + "MMLU-PRO": 0.1909 + } + }, + { + "model_id": "Dans-DiscountModels/12b-mn-dans-reasoning-test-2", + "name": "12b-mn-dans-reasoning-test-2", + "developer": "Dans-DiscountModels", + "scores": { + "IFEval": 0.3711, + "BBH": 0.4807, + "MATH Level 5": 0.0634, + "GPQA": 0.2735, + "MUSR": 0.3702, + "MMLU-PRO": 0.2507 + } + }, + { + "model_id": "Dans-DiscountModels/12b-mn-dans-reasoning-test-3", + "name": "12b-mn-dans-reasoning-test-3", + "developer": "Dans-DiscountModels", + "scores": { + "IFEval": 0.5053, + "BBH": 0.4839, + "MATH Level 5": 0.0778, + "GPQA": 0.271, + "MUSR": 0.4168, + "MMLU-PRO": 0.2516 + } + }, + { + "model_id": "Dans-DiscountModels/Dans-Instruct-CoreCurriculum-12b-ChatML", + "name": "Dans-Instruct-CoreCurriculum-12b-ChatML", + "developer": "Dans-DiscountModels", + "scores": { + "IFEval": 0.2111, + "BBH": 0.4792, + "MATH Level 5": 0.0431, + "GPQA": 0.2802, + "MUSR": 0.3606, + "MMLU-PRO": 0.2805 + } + }, + { + "model_id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML", + "name": "Dans-Instruct-Mix-8b-ChatML", + "developer": "Dans-DiscountModels", + "scores": { + "IFEval": 0.0825, + "BBH": 0.4738, + "MATH Level 5": 0.0551, + "GPQA": 0.2945, + "MUSR": 0.3918, + "MMLU-PRO": 0.3288 + } + }, + { + "model_id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.0", + "name": "Dans-Instruct-Mix-8b-ChatML-V0.1.0", + "developer": "Dans-DiscountModels", + "scores": { + "IFEval": 0.0668, + "BBH": 0.4775, + "MATH Level 5": 0.0672, + "GPQA": 0.2861, + "MUSR": 0.3786, + "MMLU-PRO": 0.3284 + } + }, + { + "model_id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.1", + "name": "Dans-Instruct-Mix-8b-ChatML-V0.1.1", + "developer": "Dans-DiscountModels", + "scores": { + "IFEval": 0.0911, + "BBH": 0.4749, + "MATH Level 5": 0.0597, + "GPQA": 0.2911, + "MUSR": 0.3825, + "MMLU-PRO": 0.3279 + } + }, + { + "model_id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.2.0", + "name": "Dans-Instruct-Mix-8b-ChatML-V0.2.0", + "developer": "Dans-DiscountModels", + "scores": { + "IFEval": 0.5064, + "BBH": 0.4624, + "MATH Level 5": 0.0733, + "GPQA": 0.2936, + "MUSR": 0.3644, + "MMLU-PRO": 0.3 + } + }, + { + "model_id": "Dans-DiscountModels/Mistral-7b-v0.3-Test-E0.7", + "name": "Mistral-7b-v0.3-Test-E0.7", + "developer": "Dans-DiscountModels", + "scores": { + "IFEval": 0.5124, + "BBH": 0.475, + "MATH Level 5": 0.034, + "GPQA": 0.2961, + "MUSR": 0.4005, + "MMLU-PRO": 0.2744 + } + }, + { + "model_id": "Dans-DiscountModels/mistral-7b-test-merged", + "name": "mistral-7b-test-merged", + "developer": "Dans-DiscountModels", + "scores": { + "IFEval": 0.6678, + "BBH": 0.4898, + "MATH Level 5": 0.0446, + "GPQA": 0.2945, + "MUSR": 0.3754, + "MMLU-PRO": 0.2978 + } + }, + { + "model_id": "Darkknight535/OpenCrystal-12B-L3", + "name": "OpenCrystal-12B-L3", + "developer": "Darkknight535", + "scores": { + "IFEval": 0.4071, + "BBH": 0.5223, + "MATH Level 5": 0.0899, + "GPQA": 0.3062, + "MUSR": 0.3657, + "MMLU-PRO": 0.364 + } + }, + { + "model_id": "DavidAU/DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm", + "name": "DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm", + "developer": "DavidAU", + "scores": { + "IFEval": 0.3136, + "BBH": 0.4762, + "MATH Level 5": 0.1057, + "GPQA": 0.3138, + "MUSR": 0.3928, + "MMLU-PRO": 0.3209 + } + }, + { + "model_id": "DavidAU/DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B", + "name": "DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.3685, + "BBH": 0.4887, + "MATH Level 5": 0.0657, + "GPQA": 0.318, + "MUSR": 0.432, + "MMLU-PRO": 0.2976 + } + }, + { + "model_id": "DavidAU/DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B", + "name": "DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.2507, + "BBH": 0.4488, + "MATH Level 5": 0.0295, + "GPQA": 0.3138, + "MUSR": 0.4164, + "MMLU-PRO": 0.2709 + } + }, + { + "model_id": "DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B", + "name": "DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.3883, + "BBH": 0.4886, + "MATH Level 5": 0.0816, + "GPQA": 0.323, + "MUSR": 0.4375, + "MMLU-PRO": 0.3024 + } + }, + { + "model_id": "DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B", + "name": "DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.3436, + "BBH": 0.4769, + "MATH Level 5": 0.0755, + "GPQA": 0.3372, + "MUSR": 0.4231, + "MMLU-PRO": 0.297 + } + }, + { + "model_id": "DavidAU/DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm", + "name": "DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm", + "developer": "DavidAU", + "scores": { + "IFEval": 0.3416, + "BBH": 0.5807, + "MATH Level 5": 0.5536, + "GPQA": 0.3859, + "MUSR": 0.5155, + "MMLU-PRO": 0.4624 + } + }, + { + "model_id": "DavidAU/DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B", + "name": "DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.2853, + "BBH": 0.4462, + "MATH Level 5": 0.0174, + "GPQA": 0.3054, + "MUSR": 0.4179, + "MMLU-PRO": 0.2778 + } + }, + { + "model_id": "DavidAU/DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B", + "name": "DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.3793, + "BBH": 0.4232, + "MATH Level 5": 0.108, + "GPQA": 0.2794, + "MUSR": 0.356, + "MMLU-PRO": 0.272 + } + }, + { + "model_id": "DavidAU/Gemma-The-Writer-9B", + "name": "Gemma-The-Writer-9B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.174, + "BBH": 0.5905, + "MATH Level 5": 0.0876, + "GPQA": 0.3456, + "MUSR": 0.4099, + "MMLU-PRO": 0.3979 + } + }, + { + "model_id": "DavidAU/Gemma-The-Writer-DEADLINE-10B", + "name": "Gemma-The-Writer-DEADLINE-10B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.2332, + "BBH": 0.5896, + "MATH Level 5": 0.0989, + "GPQA": 0.3423, + "MUSR": 0.4189, + "MMLU-PRO": 0.3946 + } + }, + { + "model_id": "DavidAU/Gemma-The-Writer-J.GutenBerg-10B", + "name": "Gemma-The-Writer-J.GutenBerg-10B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.2858, + "BBH": 0.5909, + "MATH Level 5": 0.0921, + "GPQA": 0.3381, + "MUSR": 0.4176, + "MMLU-PRO": 0.3947 + } + }, + { + "model_id": "DavidAU/Gemma-The-Writer-Mighty-Sword-9B", + "name": "Gemma-The-Writer-Mighty-Sword-9B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.7528, + "BBH": 0.5912, + "MATH Level 5": 0.1911, + "GPQA": 0.3482, + "MUSR": 0.4112, + "MMLU-PRO": 0.3968 + } + }, + { + "model_id": "DavidAU/Gemma-The-Writer-N-Restless-Quill-10B-Uncensored", + "name": "Gemma-The-Writer-N-Restless-Quill-10B-Uncensored", + "developer": "DavidAU", + "scores": { + "IFEval": 0.7071, + "BBH": 0.5922, + "MATH Level 5": 0.2296, + "GPQA": 0.3414, + "MUSR": 0.4163, + "MMLU-PRO": 0.3966 + } + }, + { + "model_id": "DavidAU/L3-DARKEST-PLANET-16.5B", + "name": "L3-DARKEST-PLANET-16.5B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.6231, + "BBH": 0.523, + "MATH Level 5": 0.0899, + "GPQA": 0.2953, + "MUSR": 0.3754, + "MMLU-PRO": 0.363 + } + }, + { + "model_id": "DavidAU/L3-Dark-Planet-8B", + "name": "L3-Dark-Planet-8B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.4134, + "BBH": 0.5084, + "MATH Level 5": 0.0823, + "GPQA": 0.3003, + "MUSR": 0.3616, + "MMLU-PRO": 0.3737 + } + }, + { + "model_id": "DavidAU/L3-Jamet-12.2B-MK.V-Blackroot-Instruct", + "name": "L3-Jamet-12.2B-MK.V-Blackroot-Instruct", + "developer": "DavidAU", + "scores": { + "IFEval": 0.3962, + "BBH": 0.4766, + "MATH Level 5": 0.0408, + "GPQA": 0.2785, + "MUSR": 0.402, + "MMLU-PRO": 0.3291 + } + }, + { + "model_id": "DavidAU/L3-Lumimaid-12.2B-v0.1-OAS-Instruct", + "name": "L3-Lumimaid-12.2B-v0.1-OAS-Instruct", + "developer": "DavidAU", + "scores": { + "IFEval": 0.3924, + "BBH": 0.4693, + "MATH Level 5": 0.0461, + "GPQA": 0.2768, + "MUSR": 0.4194, + "MMLU-PRO": 0.3142 + } + }, + { + "model_id": "DavidAU/L3-SMB-Instruct-12.2B-F32", + "name": "L3-SMB-Instruct-12.2B-F32", + "developer": "DavidAU", + "scores": { + "IFEval": 0.4303, + "BBH": 0.4786, + "MATH Level 5": 0.0468, + "GPQA": 0.2819, + "MUSR": 0.4087, + "MMLU-PRO": 0.3312 + } + }, + { + "model_id": "DavidAU/L3-Stheno-Maid-Blackroot-Grand-HORROR-16B", + "name": "L3-Stheno-Maid-Blackroot-Grand-HORROR-16B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.3439, + "BBH": 0.4736, + "MATH Level 5": 0.0219, + "GPQA": 0.271, + "MUSR": 0.4031, + "MMLU-PRO": 0.357 + } + }, + { + "model_id": "DavidAU/L3-Stheno-v3.2-12.2B-Instruct", + "name": "L3-Stheno-v3.2-12.2B-Instruct", + "developer": "DavidAU", + "scores": { + "IFEval": 0.4028, + "BBH": 0.4846, + "MATH Level 5": 0.0506, + "GPQA": 0.2752, + "MUSR": 0.4103, + "MMLU-PRO": 0.3345 + } + }, + { + "model_id": "DavidAU/L3.1-Dark-Planet-SpinFire-Uncensored-8B", + "name": "L3.1-Dark-Planet-SpinFire-Uncensored-8B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.7043, + "BBH": 0.5261, + "MATH Level 5": 0.0929, + "GPQA": 0.2794, + "MUSR": 0.3541, + "MMLU-PRO": 0.367 + } + }, + { + "model_id": "DavidAU/L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B", + "name": "L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.3345, + "BBH": 0.4421, + "MATH Level 5": 0.2606, + "GPQA": 0.3138, + "MUSR": 0.3749, + "MMLU-PRO": 0.2892 + } + }, + { + "model_id": "DavidAU/Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B", + "name": "Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.1783, + "BBH": 0.3033, + "MATH Level 5": 0.0249, + "GPQA": 0.2592, + "MUSR": 0.3715, + "MMLU-PRO": 0.1142 + } + }, + { + "model_id": "DavidAU/Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B", + "name": "Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B", + "developer": "DavidAU", + "scores": { + "IFEval": 0.2835, + "BBH": 0.3592, + "MATH Level 5": 0.2417, + "GPQA": 0.2651, + "MUSR": 0.3847, + "MMLU-PRO": 0.1636 + } + }, + { + "model_id": "DavidAU/Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32", + "name": "Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32", + "developer": "DavidAU", + "scores": { + "IFEval": 0.2107, + "BBH": 0.3286, + "MATH Level 5": 0.0665, + "GPQA": 0.2475, + "MUSR": 0.3404, + "MMLU-PRO": 0.1122 + } + }, + { + "model_id": "Davidsv/SUONG-1", + "name": "SUONG-1", + "developer": "Davidsv", + "scores": { + "IFEval": 0.2497, + "BBH": 0.2817, + "MATH Level 5": 0.0, + "GPQA": 0.2441, + "MUSR": 0.3578, + "MMLU-PRO": 0.1085 + } + }, + { + "model_id": "DavieLion/Llama-3.2-1B-SPIN-iter0", + "name": "Llama-3.2-1B-SPIN-iter0", + "developer": "DavieLion", + "scores": { + "IFEval": 0.1549, + "BBH": 0.2937, + "MATH Level 5": 0.006, + "GPQA": 0.2576, + "MUSR": 0.3565, + "MMLU-PRO": 0.1128 + } + }, + { + "model_id": "DavieLion/Llama-3.2-1B-SPIN-iter1", + "name": "Llama-3.2-1B-SPIN-iter1", + "developer": "DavieLion", + "scores": { + "IFEval": 0.1575, + "BBH": 0.294, + "MATH Level 5": 0.0023, + "GPQA": 0.2508, + "MUSR": 0.3646, + "MMLU-PRO": 0.1118 + } + }, + { + "model_id": "DavieLion/Llama-3.2-1B-SPIN-iter2", + "name": "Llama-3.2-1B-SPIN-iter2", + "developer": "DavieLion", + "scores": { + "IFEval": 0.1376, + "BBH": 0.298, + "MATH Level 5": 0.0053, + "GPQA": 0.2542, + "MUSR": 0.3553, + "MMLU-PRO": 0.1129 + } + }, + { + "model_id": "DavieLion/Llama-3.2-1B-SPIN-iter3", + "name": "Llama-3.2-1B-SPIN-iter3", + "developer": "DavieLion", + "scores": { + "IFEval": 0.1324, + "BBH": 0.2972, + "MATH Level 5": 0.0, + "GPQA": 0.2643, + "MUSR": 0.3527, + "MMLU-PRO": 0.1129 + } + }, + { + "model_id": "DavieLion/Lllma-3.2-1B", + "name": "Lllma-3.2-1B", + "developer": "DavieLion", + "scores": { + "IFEval": 0.1601, + "BBH": 0.2965, + "MATH Level 5": 0.0068, + "GPQA": 0.2441, + "MUSR": 0.3578, + "MMLU-PRO": 0.1126 + } + }, + { + "model_id": "DebateLabKIT/Llama-3.1-Argunaut-1-8B-SFT", + "name": "Llama-3.1-Argunaut-1-8B-SFT", + "developer": "DebateLabKIT", + "scores": { + "IFEval": 0.5519, + "BBH": 0.4824, + "MATH Level 5": 0.145, + "GPQA": 0.2836, + "MUSR": 0.4503, + "MMLU-PRO": 0.3472 + } + }, + { + "model_id": "Deci/DeciLM-7B", + "name": "DeciLM-7B", + "developer": "Deci", + "scores": { + "IFEval": 0.2813, + "BBH": 0.4423, + "MATH Level 5": 0.0287, + "GPQA": 0.2953, + "MUSR": 0.4359, + "MMLU-PRO": 0.2692 + } + }, + { + "model_id": "Deci/DeciLM-7B-instruct", + "name": "DeciLM-7B-instruct", + "developer": "Deci", + "scores": { + "IFEval": 0.488, + "BBH": 0.459, + "MATH Level 5": 0.0302, + "GPQA": 0.2894, + "MUSR": 0.3884, + "MMLU-PRO": 0.2608 + } + }, + { + "model_id": "DeepAutoAI/Explore_Llama-3.1-8B-Inst", + "name": "Explore_Llama-3.1-8B-Inst", + "developer": "DeepAutoAI", + "scores": { + "IFEval": 0.7795, + "BBH": 0.5117, + "MATH Level 5": 0.2009, + "GPQA": 0.2836, + "MUSR": 0.391, + "MMLU-PRO": 0.3792 + } + }, + { + "model_id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst", + "name": "Explore_Llama-3.2-1B-Inst", + "developer": "DeepAutoAI", + "scores": { + "IFEval": 0.5649, + "BBH": 0.3505, + "MATH Level 5": 0.0748, + "GPQA": 0.2559, + "MUSR": 0.3183, + "MMLU-PRO": 0.1809 + } + }, + { + "model_id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst_v0", + "name": "Explore_Llama-3.2-1B-Inst_v0", + "developer": "DeepAutoAI", + "scores": { + "IFEval": 0.5597, + "BBH": 0.3365, + "MATH Level 5": 0.0597, + "GPQA": 0.2634, + "MUSR": 0.3103, + "MMLU-PRO": 0.1804 + } + }, + { + "model_id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1", + "name": "Explore_Llama-3.2-1B-Inst_v1", + "developer": "DeepAutoAI", + "scores": { + "IFEval": 0.4999, + "BBH": 0.3141, + "MATH Level 5": 0.031, + "GPQA": 0.245, + "MUSR": 0.3781, + "MMLU-PRO": 0.1269 + } + }, + { + "model_id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1.1", + "name": "Explore_Llama-3.2-1B-Inst_v1.1", + "developer": "DeepAutoAI", + "scores": { + "IFEval": 0.5844, + "BBH": 0.3513, + "MATH Level 5": 0.0718, + "GPQA": 0.2626, + "MUSR": 0.3117, + "MMLU-PRO": 0.1818 + } + }, + { + "model_id": "DeepAutoAI/causal_gpt2", + "name": "causal_gpt2", + "developer": "DeepAutoAI", + "scores": { + "IFEval": 0.1813, + "BBH": 0.3026, + "MATH Level 5": 0.0053, + "GPQA": 0.2601, + "MUSR": 0.427, + "MMLU-PRO": 0.1131 + } + }, + { + "model_id": "DeepAutoAI/d2nwg_Llama-3.1-8B-Instruct-v0.0", + "name": "d2nwg_Llama-3.1-8B-Instruct-v0.0", + "developer": "DeepAutoAI", + "scores": { + "IFEval": 0.7893, + "BBH": 0.508, + "MATH Level 5": 0.1805, + "GPQA": 0.2919, + "MUSR": 0.4135, + "MMLU-PRO": 0.3877 + } + }, + { + "model_id": "DeepAutoAI/d2nwg_causal_gpt2", + "name": "d2nwg_causal_gpt2", + "developer": "DeepAutoAI", + "scores": { + "IFEval": 0.1916, + "BBH": 0.3027, + "MATH Level 5": 0.0045, + "GPQA": 0.2576, + "MUSR": 0.4297, + "MMLU-PRO": 0.1151 + } + }, + { + "model_id": "DeepAutoAI/d2nwg_causal_gpt2_v1", + "name": "d2nwg_causal_gpt2_v1", + "developer": "DeepAutoAI", + "scores": { + "IFEval": 0.1989, + "BBH": 0.2992, + "MATH Level 5": 0.0038, + "GPQA": 0.2584, + "MUSR": 0.4337, + "MMLU-PRO": 0.1135 + } + }, + { + "model_id": "DeepAutoAI/ldm_soup_Llama-3.1-8B-Inst", + "name": "ldm_soup_Llama-3.1-8B-Inst", + "developer": "DeepAutoAI", + "scores": { + "IFEval": 0.8033, + "BBH": 0.5121, + "MATH Level 5": 0.1888, + "GPQA": 0.2894, + "MUSR": 0.4161, + "MMLU-PRO": 0.3886 + } + }, + { + "model_id": "DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.0", + "name": "ldm_soup_Llama-3.1-8B-Instruct-v0.0", + "developer": "DeepAutoAI", + "scores": { + "IFEval": 0.7889, + "BBH": 0.5125, + "MATH Level 5": 0.1918, + "GPQA": 0.2911, + "MUSR": 0.4121, + "MMLU-PRO": 0.3895 + } + }, + { + "model_id": "DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.1", + "name": "ldm_soup_Llama-3.1-8B-Instruct-v0.1", + "developer": "DeepAutoAI", + "scores": { + "IFEval": 0.7889, + "BBH": 0.5125, + "MATH Level 5": 0.1918, + "GPQA": 0.2911, + "MUSR": 0.4121, + "MMLU-PRO": 0.3895 + } + }, + { + "model_id": "DeepMount00/Lexora-Lite-3B", + "name": "Lexora-Lite-3B", + "developer": "DeepMount00", + "scores": { + "IFEval": 0.5776, + "BBH": 0.4873, + "MATH Level 5": 0.2304, + "GPQA": 0.2743, + "MUSR": 0.3966, + "MMLU-PRO": 0.3602 + } + }, + { + "model_id": "DeepMount00/Lexora-Lite-3B_v2", + "name": "Lexora-Lite-3B_v2", + "developer": "DeepMount00", + "scores": { + "IFEval": 0.4943, + "BBH": 0.4812, + "MATH Level 5": 0.2281, + "GPQA": 0.271, + "MUSR": 0.3822, + "MMLU-PRO": 0.3544 + } + }, + { + "model_id": "DeepMount00/Lexora-Medium-7B", + "name": "Lexora-Medium-7B", + "developer": "DeepMount00", + "scores": { + "IFEval": 0.4103, + "BBH": 0.5145, + "MATH Level 5": 0.2221, + "GPQA": 0.3054, + "MUSR": 0.4439, + "MMLU-PRO": 0.4325 + } + }, + { + "model_id": "DeepMount00/Llama-3-8b-Ita", + "name": "Llama-3-8b-Ita", + "developer": "DeepMount00", + "scores": { + "IFEval": 0.753, + "BBH": 0.4936, + "MATH Level 5": 0.0665, + "GPQA": 0.3054, + "MUSR": 0.4268, + "MMLU-PRO": 0.3852 + } + }, + { + "model_id": "DeepMount00/Llama-3.1-8b-ITA", + "name": "Llama-3.1-8b-ITA", + "developer": "DeepMount00", + "scores": { + "IFEval": 0.7917, + "BBH": 0.5109, + "MATH Level 5": 0.1088, + "GPQA": 0.2878, + "MUSR": 0.4136, + "MMLU-PRO": 0.3876 + } + }, + { + "model_id": "DeepMount00/Llama-3.1-8b-Ita", + "name": "Llama-3.1-8b-Ita", + "developer": "DeepMount00", + "scores": { + "IFEval": 0.5365, + "BBH": 0.517, + "MATH Level 5": 0.1707, + "GPQA": 0.3062, + "MUSR": 0.4487, + "MMLU-PRO": 0.396 + } + }, + { + "model_id": "DeepMount00/Llama-3.1-Distilled", + "name": "Llama-3.1-Distilled", + "developer": "DeepMount00", + "scores": { + "IFEval": 0.7844, + "BBH": 0.5101, + "MATH Level 5": 0.2032, + "GPQA": 0.3037, + "MUSR": 0.4058, + "MMLU-PRO": 0.3782 + } + }, + { + "model_id": "DeepMount00/Qwen2-1.5B-Ita", + "name": "Qwen2-1.5B-Ita", + "developer": "DeepMount00", + "scores": { + "IFEval": 0.5173, + "BBH": 0.3981, + "MATH Level 5": 0.114, + "GPQA": 0.2626, + "MUSR": 0.3504, + "MMLU-PRO": 0.2772 + } + }, + { + "model_id": "DeepMount00/Qwen2-1.5B-Ita_v2", + "name": "Qwen2-1.5B-Ita_v2", + "developer": "DeepMount00", + "scores": { + "IFEval": 0.5, + "BBH": 0.3954, + "MATH Level 5": 0.0967, + "GPQA": 0.2592, + "MUSR": 0.3702, + "MMLU-PRO": 0.3032 + } + }, + { + "model_id": "DeepMount00/Qwen2-1.5B-Ita_v3", + "name": "Qwen2-1.5B-Ita_v3", + "developer": "DeepMount00", + "scores": { + "IFEval": 0.489, + "BBH": 0.3948, + "MATH Level 5": 0.1042, + "GPQA": 0.2534, + "MUSR": 0.3742, + "MMLU-PRO": 0.3018 + } + }, + { + "model_id": "DeepMount00/Qwen2-1.5B-Ita_v5", + "name": "Qwen2-1.5B-Ita_v5", + "developer": "DeepMount00", + "scores": { + "IFEval": 0.4987, + "BBH": 0.4032, + "MATH Level 5": 0.1178, + "GPQA": 0.2542, + "MUSR": 0.3422, + "MMLU-PRO": 0.2943 + } + }, + { + "model_id": "DeepMount00/Qwen2-1.5B-Ita_v6", + "name": "Qwen2-1.5B-Ita_v6", + "developer": "DeepMount00", + "scores": { + "IFEval": 0.2999, + "BBH": 0.4249, + "MATH Level 5": 0.0846, + "GPQA": 0.2827, + "MUSR": 0.3755, + "MMLU-PRO": 0.2872 + } + }, + { + "model_id": "DeepMount00/Qwen2.5-7B-Instruct-MathCoder", + "name": "Qwen2.5-7B-Instruct-MathCoder", + "developer": "DeepMount00", + "scores": { + "IFEval": 0.153, + "BBH": 0.2998, + "MATH Level 5": 0.0008, + "GPQA": 0.2626, + "MUSR": 0.3806, + "MMLU-PRO": 0.1118 + } + }, + { + "model_id": "DeepMount00/mergekit-ties-okvgjfz", + "name": "mergekit-ties-okvgjfz", + "developer": "DeepMount00", + "scores": { + "IFEval": 0.153, + "BBH": 0.2998, + "MATH Level 5": 0.0008, + "GPQA": 0.2626, + "MUSR": 0.3806, + "MMLU-PRO": 0.1118 + } + }, + { + "model_id": "Delta-Vector/Baldur-8B", + "name": "Baldur-8B", + "developer": "Delta-Vector", + "scores": { + "IFEval": 0.4782, + "BBH": 0.5306, + "MATH Level 5": 0.1435, + "GPQA": 0.302, + "MUSR": 0.4372, + "MMLU-PRO": 0.3654 + } + }, + { + "model_id": "Delta-Vector/Control-8B", + "name": "Control-8B", + "developer": "Delta-Vector", + "scores": { + "IFEval": 0.549, + "BBH": 0.5041, + "MATH Level 5": 0.139, + "GPQA": 0.3163, + "MUSR": 0.4355, + "MMLU-PRO": 0.3732 + } + }, + { + "model_id": "Delta-Vector/Control-8B-V1.1", + "name": "Control-8B-V1.1", + "developer": "Delta-Vector", + "scores": { + "IFEval": 0.5697, + "BBH": 0.4993, + "MATH Level 5": 0.1276, + "GPQA": 0.307, + "MUSR": 0.4237, + "MMLU-PRO": 0.3745 + } + }, + { + "model_id": "Delta-Vector/Darkens-8B", + "name": "Darkens-8B", + "developer": "Delta-Vector", + "scores": { + "IFEval": 0.2548, + "BBH": 0.5251, + "MATH Level 5": 0.0589, + "GPQA": 0.3247, + "MUSR": 0.4106, + "MMLU-PRO": 0.3736 + } + }, + { + "model_id": "Delta-Vector/Henbane-7b-attempt2", + "name": "Henbane-7b-attempt2", + "developer": "Delta-Vector", + "scores": { + "IFEval": 0.4157, + "BBH": 0.5061, + "MATH Level 5": 0.2273, + "GPQA": 0.2903, + "MUSR": 0.3973, + "MMLU-PRO": 0.4028 + } + }, + { + "model_id": "Delta-Vector/Odin-9B", + "name": "Odin-9B", + "developer": "Delta-Vector", + "scores": { + "IFEval": 0.3692, + "BBH": 0.544, + "MATH Level 5": 0.145, + "GPQA": 0.3414, + "MUSR": 0.4648, + "MMLU-PRO": 0.4047 + } + }, + { + "model_id": "Delta-Vector/Tor-8B", + "name": "Tor-8B", + "developer": "Delta-Vector", + "scores": { + "IFEval": 0.2382, + "BBH": 0.5209, + "MATH Level 5": 0.0589, + "GPQA": 0.3238, + "MUSR": 0.4092, + "MMLU-PRO": 0.373 + } + }, + { + "model_id": "DevQuasar/DevQuasar-R1-Uncensored-Llama-8B", + "name": "DevQuasar-R1-Uncensored-Llama-8B", + "developer": "DevQuasar", + "scores": { + "IFEval": 0.3849, + "BBH": 0.5118, + "MATH Level 5": 0.3308, + "GPQA": 0.3473, + "MUSR": 0.4436, + "MMLU-PRO": 0.3615 + } + }, + { + "model_id": "Dongwei/DeepSeek-R1-Distill-Qwen-7B-GRPO", + "name": "DeepSeek-R1-Distill-Qwen-7B-GRPO", + "developer": "Dongwei", + "scores": { + "IFEval": 0.4038, + "BBH": 0.3443, + "MATH Level 5": 0.1956, + "GPQA": 0.2794, + "MUSR": 0.3663, + "MMLU-PRO": 0.2322 + } + }, + { + "model_id": "DoppelReflEx/L3-8B-R1-WolfCore", + "name": "L3-8B-R1-WolfCore", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.3775, + "BBH": 0.5318, + "MATH Level 5": 0.1631, + "GPQA": 0.3289, + "MUSR": 0.4277, + "MMLU-PRO": 0.3717 + } + }, + { + "model_id": "DoppelReflEx/L3-8B-R1-WolfCore-V1.5-test", + "name": "L3-8B-R1-WolfCore-V1.5-test", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.3955, + "BBH": 0.5315, + "MATH Level 5": 0.1231, + "GPQA": 0.3263, + "MUSR": 0.3841, + "MMLU-PRO": 0.3728 + } + }, + { + "model_id": "DoppelReflEx/L3-8B-WolfCore", + "name": "L3-8B-WolfCore", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.4022, + "BBH": 0.5182, + "MATH Level 5": 0.0982, + "GPQA": 0.3096, + "MUSR": 0.3973, + "MMLU-PRO": 0.3705 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-FoxFrame-test", + "name": "MN-12B-FoxFrame-test", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.4222, + "BBH": 0.5456, + "MATH Level 5": 0.1397, + "GPQA": 0.3079, + "MUSR": 0.4254, + "MMLU-PRO": 0.3503 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-FoxFrame2-test", + "name": "MN-12B-FoxFrame2-test", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.4319, + "BBH": 0.5485, + "MATH Level 5": 0.1405, + "GPQA": 0.3146, + "MUSR": 0.4252, + "MMLU-PRO": 0.3569 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-FoxFrame3-test", + "name": "MN-12B-FoxFrame3-test", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.4323, + "BBH": 0.5395, + "MATH Level 5": 0.1322, + "GPQA": 0.3012, + "MUSR": 0.4598, + "MMLU-PRO": 0.3529 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-Kakigori", + "name": "MN-12B-Kakigori", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.3593, + "BBH": 0.5416, + "MATH Level 5": 0.1193, + "GPQA": 0.3247, + "MUSR": 0.4052, + "MMLU-PRO": 0.3581 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-LilithFrame", + "name": "MN-12B-LilithFrame", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.436, + "BBH": 0.4956, + "MATH Level 5": 0.0589, + "GPQA": 0.3205, + "MUSR": 0.3843, + "MMLU-PRO": 0.3237 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-LilithFrame-Experiment-2", + "name": "MN-12B-LilithFrame-Experiment-2", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.4299, + "BBH": 0.4983, + "MATH Level 5": 0.1073, + "GPQA": 0.3255, + "MUSR": 0.3804, + "MMLU-PRO": 0.3276 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-LilithFrame-Experiment-3", + "name": "MN-12B-LilithFrame-Experiment-3", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.4128, + "BBH": 0.5468, + "MATH Level 5": 0.1344, + "GPQA": 0.328, + "MUSR": 0.4039, + "MMLU-PRO": 0.3604 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-LilithFrame-Experiment-4", + "name": "MN-12B-LilithFrame-Experiment-4", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.3981, + "BBH": 0.5534, + "MATH Level 5": 0.1224, + "GPQA": 0.3171, + "MUSR": 0.4371, + "MMLU-PRO": 0.3649 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-Mimicore-GreenSnake", + "name": "MN-12B-Mimicore-GreenSnake", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.478, + "BBH": 0.5481, + "MATH Level 5": 0.139, + "GPQA": 0.3247, + "MUSR": 0.4306, + "MMLU-PRO": 0.3651 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-Mimicore-Nocturne", + "name": "MN-12B-Mimicore-Nocturne", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.3957, + "BBH": 0.5703, + "MATH Level 5": 0.1057, + "GPQA": 0.3196, + "MUSR": 0.4569, + "MMLU-PRO": 0.3634 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-Mimicore-Orochi", + "name": "MN-12B-Mimicore-Orochi", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.462, + "BBH": 0.5498, + "MATH Level 5": 0.136, + "GPQA": 0.3129, + "MUSR": 0.4546, + "MMLU-PRO": 0.3447 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-Mimicore-Orochi-v2-Experiment", + "name": "MN-12B-Mimicore-Orochi-v2-Experiment", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.2842, + "BBH": 0.5323, + "MATH Level 5": 0.0612, + "GPQA": 0.2978, + "MUSR": 0.4574, + "MMLU-PRO": 0.3423 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-Mimicore-Orochi-v3-Experiment", + "name": "MN-12B-Mimicore-Orochi-v3-Experiment", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.4102, + "BBH": 0.5438, + "MATH Level 5": 0.1216, + "GPQA": 0.2928, + "MUSR": 0.4438, + "MMLU-PRO": 0.3396 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-Mimicore-Orochi-v4-Experiment", + "name": "MN-12B-Mimicore-Orochi-v4-Experiment", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.4321, + "BBH": 0.5463, + "MATH Level 5": 0.1208, + "GPQA": 0.3054, + "MUSR": 0.4449, + "MMLU-PRO": 0.352 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake", + "name": "MN-12B-Mimicore-WhiteSnake", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.4438, + "BBH": 0.5605, + "MATH Level 5": 0.1314, + "GPQA": 0.318, + "MUSR": 0.4569, + "MMLU-PRO": 0.3658 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-1", + "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-1", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.3909, + "BBH": 0.4866, + "MATH Level 5": 0.0785, + "GPQA": 0.3054, + "MUSR": 0.379, + "MMLU-PRO": 0.3114 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-2", + "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-2", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.3124, + "BBH": 0.5126, + "MATH Level 5": 0.1125, + "GPQA": 0.2961, + "MUSR": 0.3975, + "MMLU-PRO": 0.3314 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-3", + "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-3", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.4302, + "BBH": 0.4812, + "MATH Level 5": 0.0899, + "GPQA": 0.302, + "MUSR": 0.3684, + "MMLU-PRO": 0.3198 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-4", + "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-4", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.4241, + "BBH": 0.5185, + "MATH Level 5": 0.114, + "GPQA": 0.3104, + "MUSR": 0.4002, + "MMLU-PRO": 0.3342 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-Unleashed-Twilight", + "name": "MN-12B-Unleashed-Twilight", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.3505, + "BBH": 0.5521, + "MATH Level 5": 0.0959, + "GPQA": 0.3289, + "MUSR": 0.4384, + "MMLU-PRO": 0.3678 + } + }, + { + "model_id": "DoppelReflEx/MN-12B-WolFrame", + "name": "MN-12B-WolFrame", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.4397, + "BBH": 0.5117, + "MATH Level 5": 0.1314, + "GPQA": 0.3104, + "MUSR": 0.4015, + "MMLU-PRO": 0.3393 + } + }, + { + "model_id": "DoppelReflEx/MiniusLight-24B", + "name": "MiniusLight-24B", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.2577, + "BBH": 0.6256, + "MATH Level 5": 0.1261, + "GPQA": 0.3582, + "MUSR": 0.4319, + "MMLU-PRO": 0.5091 + } + }, + { + "model_id": "DoppelReflEx/MiniusLight-24B-test", + "name": "MiniusLight-24B-test", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.0394, + "BBH": 0.6334, + "MATH Level 5": 0.0257, + "GPQA": 0.3683, + "MUSR": 0.4093, + "MMLU-PRO": 0.5182 + } + }, + { + "model_id": "DoppelReflEx/MiniusLight-24B-v1b-test", + "name": "MiniusLight-24B-v1b-test", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.3791, + "BBH": 0.6617, + "MATH Level 5": 0.2394, + "GPQA": 0.3792, + "MUSR": 0.4557, + "MMLU-PRO": 0.5365 + } + }, + { + "model_id": "DoppelReflEx/MiniusLight-24B-v1c-test", + "name": "MiniusLight-24B-v1c-test", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.3786, + "BBH": 0.6753, + "MATH Level 5": 0.2968, + "GPQA": 0.3951, + "MUSR": 0.4634, + "MMLU-PRO": 0.5487 + } + }, + { + "model_id": "DoppelReflEx/MiniusLight-24B-v1d-test", + "name": "MiniusLight-24B-v1d-test", + "developer": "DoppelReflEx", + "scores": { + "IFEval": 0.4032, + "BBH": 0.6712, + "MATH Level 5": 0.2946, + "GPQA": 0.3951, + "MUSR": 0.4621, + "MMLU-PRO": 0.5489 + } + }, + { + "model_id": "DreadPoor/Again-8B-Model_Stock", + "name": "Again-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.6724, + "BBH": 0.531, + "MATH Level 5": 0.1201, + "GPQA": 0.3012, + "MUSR": 0.3987, + "MMLU-PRO": 0.3518 + } + }, + { + "model_id": "DreadPoor/Alita99-8B-LINEAR", + "name": "Alita99-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.719, + "BBH": 0.5442, + "MATH Level 5": 0.1647, + "GPQA": 0.3163, + "MUSR": 0.4266, + "MMLU-PRO": 0.3809 + } + }, + { + "model_id": "DreadPoor/AnotherTest", + "name": "AnotherTest", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.4701, + "BBH": 0.4683, + "MATH Level 5": 0.0619, + "GPQA": 0.2978, + "MUSR": 0.4213, + "MMLU-PRO": 0.2875 + } + }, + { + "model_id": "DreadPoor/Aspire-8B-model_stock", + "name": "Aspire-8B-model_stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7141, + "BBH": 0.5278, + "MATH Level 5": 0.1495, + "GPQA": 0.3146, + "MUSR": 0.4212, + "MMLU-PRO": 0.3763 + } + }, + { + "model_id": "DreadPoor/Aspire_1.3-8B_model-stock", + "name": "Aspire_1.3-8B_model-stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7062, + "BBH": 0.5302, + "MATH Level 5": 0.1692, + "GPQA": 0.3079, + "MUSR": 0.4105, + "MMLU-PRO": 0.3716 + } + }, + { + "model_id": "DreadPoor/Aspire_V2-8B-Model_Stock", + "name": "Aspire_V2-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7371, + "BBH": 0.533, + "MATH Level 5": 0.176, + "GPQA": 0.3205, + "MUSR": 0.3894, + "MMLU-PRO": 0.3697 + } + }, + { + "model_id": "DreadPoor/Aspire_V2.1-8B-Model_Stock", + "name": "Aspire_V2.1-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7238, + "BBH": 0.5236, + "MATH Level 5": 0.1767, + "GPQA": 0.3096, + "MUSR": 0.4136, + "MMLU-PRO": 0.3801 + } + }, + { + "model_id": "DreadPoor/Aspire_V2_ALT-8B-Model_Stock", + "name": "Aspire_V2_ALT-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7381, + "BBH": 0.5266, + "MATH Level 5": 0.173, + "GPQA": 0.3247, + "MUSR": 0.3975, + "MMLU-PRO": 0.3727 + } + }, + { + "model_id": "DreadPoor/Aspire_V2_ALT_ROW-8B-Model_Stock", + "name": "Aspire_V2_ALT_ROW-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7381, + "BBH": 0.5266, + "MATH Level 5": 0.173, + "GPQA": 0.3247, + "MUSR": 0.3975, + "MMLU-PRO": 0.3727 + } + }, + { + "model_id": "DreadPoor/Aspire_V3-8B-Model_Stock", + "name": "Aspire_V3-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.5119, + "BBH": 0.5268, + "MATH Level 5": 0.1858, + "GPQA": 0.3054, + "MUSR": 0.4015, + "MMLU-PRO": 0.3642 + } + }, + { + "model_id": "DreadPoor/Aspire_V4-8B-Model_Stock", + "name": "Aspire_V4-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7694, + "BBH": 0.5314, + "MATH Level 5": 0.1926, + "GPQA": 0.3045, + "MUSR": 0.3867, + "MMLU-PRO": 0.3708 + } + }, + { + "model_id": "DreadPoor/Aspire_V4_ALT-8B-Model_Stock", + "name": "Aspire_V4_ALT-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7366, + "BBH": 0.5268, + "MATH Level 5": 0.1813, + "GPQA": 0.3205, + "MUSR": 0.392, + "MMLU-PRO": 0.3682 + } + }, + { + "model_id": "DreadPoor/Asymmetric_Linearity-8B-Model_Stock", + "name": "Asymmetric_Linearity-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7174, + "BBH": 0.5465, + "MATH Level 5": 0.1647, + "GPQA": 0.3146, + "MUSR": 0.4199, + "MMLU-PRO": 0.3844 + } + }, + { + "model_id": "DreadPoor/Aurora_faustus-8B-LINEAR", + "name": "Aurora_faustus-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7281, + "BBH": 0.5516, + "MATH Level 5": 0.1707, + "GPQA": 0.307, + "MUSR": 0.4146, + "MMLU-PRO": 0.3842 + } + }, + { + "model_id": "DreadPoor/Aurora_faustus-8B-LORABLATED", + "name": "Aurora_faustus-8B-LORABLATED", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7527, + "BBH": 0.5392, + "MATH Level 5": 0.1488, + "GPQA": 0.302, + "MUSR": 0.4239, + "MMLU-PRO": 0.3673 + } + }, + { + "model_id": "DreadPoor/Aurora_faustus-8B-LORABLATED_ALT", + "name": "Aurora_faustus-8B-LORABLATED_ALT", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7378, + "BBH": 0.5388, + "MATH Level 5": 0.1586, + "GPQA": 0.2987, + "MUSR": 0.4225, + "MMLU-PRO": 0.3694 + } + }, + { + "model_id": "DreadPoor/Autumn_Dawn-8B-LINEAR", + "name": "Autumn_Dawn-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7293, + "BBH": 0.5459, + "MATH Level 5": 0.1858, + "GPQA": 0.2936, + "MUSR": 0.4186, + "MMLU-PRO": 0.3968 + } + }, + { + "model_id": "DreadPoor/BaeZel-8B-LINEAR", + "name": "BaeZel-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7378, + "BBH": 0.5464, + "MATH Level 5": 0.1813, + "GPQA": 0.3213, + "MUSR": 0.4227, + "MMLU-PRO": 0.3861 + } + }, + { + "model_id": "DreadPoor/BaeZel-8B-Model_Stock", + "name": "BaeZel-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7713, + "BBH": 0.5408, + "MATH Level 5": 0.1639, + "GPQA": 0.3138, + "MUSR": 0.4199, + "MMLU-PRO": 0.388 + } + }, + { + "model_id": "DreadPoor/BaeZel_V2-8B-Model_Stock", + "name": "BaeZel_V2-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7677, + "BBH": 0.5374, + "MATH Level 5": 0.1798, + "GPQA": 0.2995, + "MUSR": 0.4186, + "MMLU-PRO": 0.3947 + } + }, + { + "model_id": "DreadPoor/BaeZel_V2_ALT-8B-Model_Stock", + "name": "BaeZel_V2_ALT-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7677, + "BBH": 0.5374, + "MATH Level 5": 0.1798, + "GPQA": 0.2995, + "MUSR": 0.4186, + "MMLU-PRO": 0.3947 + } + }, + { + "model_id": "DreadPoor/BaeZel_V3-8B-Model_Stock", + "name": "BaeZel_V3-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7832, + "BBH": 0.5392, + "MATH Level 5": 0.1896, + "GPQA": 0.3205, + "MUSR": 0.4174, + "MMLU-PRO": 0.3888 + } + }, + { + "model_id": "DreadPoor/Blunt_Edge-8B-SLERP", + "name": "Blunt_Edge-8B-SLERP", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7497, + "BBH": 0.5389, + "MATH Level 5": 0.1858, + "GPQA": 0.3112, + "MUSR": 0.4174, + "MMLU-PRO": 0.3767 + } + }, + { + "model_id": "DreadPoor/BulkUp", + "name": "BulkUp", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.1778, + "BBH": 0.287, + "MATH Level 5": 0.0, + "GPQA": 0.2475, + "MUSR": 0.3447, + "MMLU-PRO": 0.111 + } + }, + { + "model_id": "DreadPoor/Cadence-8B-LINEAR", + "name": "Cadence-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7682, + "BBH": 0.5433, + "MATH Level 5": 0.1677, + "GPQA": 0.3029, + "MUSR": 0.4173, + "MMLU-PRO": 0.3803 + } + }, + { + "model_id": "DreadPoor/Caelid-8B-Model_Stock", + "name": "Caelid-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7247, + "BBH": 0.546, + "MATH Level 5": 0.1511, + "GPQA": 0.3104, + "MUSR": 0.4001, + "MMLU-PRO": 0.3816 + } + }, + { + "model_id": "DreadPoor/Casuar-9B-Model_Stock", + "name": "Casuar-9B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7765, + "BBH": 0.6107, + "MATH Level 5": 0.213, + "GPQA": 0.3448, + "MUSR": 0.4165, + "MMLU-PRO": 0.4156 + } + }, + { + "model_id": "DreadPoor/Condensed_Milk-8B-Model_Stock", + "name": "Condensed_Milk-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7536, + "BBH": 0.5435, + "MATH Level 5": 0.1745, + "GPQA": 0.3213, + "MUSR": 0.416, + "MMLU-PRO": 0.3876 + } + }, + { + "model_id": "DreadPoor/CoolerCoder-8B-LINEAR", + "name": "CoolerCoder-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.4519, + "BBH": 0.4762, + "MATH Level 5": 0.0793, + "GPQA": 0.2903, + "MUSR": 0.3964, + "MMLU-PRO": 0.3159 + } + }, + { + "model_id": "DreadPoor/Damasteel-8B-LINEAR", + "name": "Damasteel-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7384, + "BBH": 0.5388, + "MATH Level 5": 0.1669, + "GPQA": 0.2987, + "MUSR": 0.4212, + "MMLU-PRO": 0.3779 + } + }, + { + "model_id": "DreadPoor/Dearly_Beloved-8B-TIES", + "name": "Dearly_Beloved-8B-TIES", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.8267, + "BBH": 0.405, + "MATH Level 5": 0.2115, + "GPQA": 0.2987, + "MUSR": 0.4175, + "MMLU-PRO": 0.2827 + } + }, + { + "model_id": "DreadPoor/Decayed-8B-LINEAR", + "name": "Decayed-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7676, + "BBH": 0.5417, + "MATH Level 5": 0.1715, + "GPQA": 0.3096, + "MUSR": 0.4186, + "MMLU-PRO": 0.3763 + } + }, + { + "model_id": "DreadPoor/Derivative-8B-Model_Stock", + "name": "Derivative-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7667, + "BBH": 0.5395, + "MATH Level 5": 0.179, + "GPQA": 0.3171, + "MUSR": 0.42, + "MMLU-PRO": 0.3811 + } + }, + { + "model_id": "DreadPoor/Derivative_V2-8B-Model_Stock", + "name": "Derivative_V2-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7537, + "BBH": 0.5393, + "MATH Level 5": 0.1798, + "GPQA": 0.307, + "MUSR": 0.4123, + "MMLU-PRO": 0.3856 + } + }, + { + "model_id": "DreadPoor/Derivative_V2_ALT-8B-Model_Stock", + "name": "Derivative_V2_ALT-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.772, + "BBH": 0.5365, + "MATH Level 5": 0.1881, + "GPQA": 0.3112, + "MUSR": 0.4135, + "MMLU-PRO": 0.3882 + } + }, + { + "model_id": "DreadPoor/Derivative_V3-8B-Model_Stock", + "name": "Derivative_V3-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.6964, + "BBH": 0.5243, + "MATH Level 5": 0.1465, + "GPQA": 0.2945, + "MUSR": 0.415, + "MMLU-PRO": 0.3502 + } + }, + { + "model_id": "DreadPoor/Elusive_Dragon_Heart-8B-LINEAR", + "name": "Elusive_Dragon_Heart-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7131, + "BBH": 0.5456, + "MATH Level 5": 0.148, + "GPQA": 0.3062, + "MUSR": 0.4146, + "MMLU-PRO": 0.3814 + } + }, + { + "model_id": "DreadPoor/Emu_Eggs-9B-Model_Stock", + "name": "Emu_Eggs-9B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7607, + "BBH": 0.6052, + "MATH Level 5": 0.21, + "GPQA": 0.3331, + "MUSR": 0.4071, + "MMLU-PRO": 0.4227 + } + }, + { + "model_id": "DreadPoor/Eunoia_Vespera-8B-LINEAR", + "name": "Eunoia_Vespera-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7235, + "BBH": 0.5399, + "MATH Level 5": 0.1541, + "GPQA": 0.307, + "MUSR": 0.4185, + "MMLU-PRO": 0.3839 + } + }, + { + "model_id": "DreadPoor/Fu_sion_HA-8B-SLERP", + "name": "Fu_sion_HA-8B-SLERP", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7609, + "BBH": 0.5373, + "MATH Level 5": 0.1752, + "GPQA": 0.323, + "MUSR": 0.416, + "MMLU-PRO": 0.3825 + } + }, + { + "model_id": "DreadPoor/HOT_STINKING_GARBAGE", + "name": "HOT_STINKING_GARBAGE", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.5754, + "BBH": 0.4884, + "MATH Level 5": 0.0672, + "GPQA": 0.2752, + "MUSR": 0.425, + "MMLU-PRO": 0.3017 + } + }, + { + "model_id": "DreadPoor/H_the_eighth-8B-LINEAR", + "name": "H_the_eighth-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7469, + "BBH": 0.5384, + "MATH Level 5": 0.1775, + "GPQA": 0.328, + "MUSR": 0.4173, + "MMLU-PRO": 0.3824 + } + }, + { + "model_id": "DreadPoor/Happy_New_Year-8B-Model_Stock", + "name": "Happy_New_Year-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7616, + "BBH": 0.5368, + "MATH Level 5": 0.1594, + "GPQA": 0.3138, + "MUSR": 0.4186, + "MMLU-PRO": 0.3879 + } + }, + { + "model_id": "DreadPoor/Heart_Stolen-8B-Model_Stock", + "name": "Heart_Stolen-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7245, + "BBH": 0.5395, + "MATH Level 5": 0.1722, + "GPQA": 0.3171, + "MUSR": 0.4162, + "MMLU-PRO": 0.3794 + } + }, + { + "model_id": "DreadPoor/Heart_Stolen-ALT-8B-Model_Stock", + "name": "Heart_Stolen-ALT-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7184, + "BBH": 0.5263, + "MATH Level 5": 0.1563, + "GPQA": 0.3012, + "MUSR": 0.4055, + "MMLU-PRO": 0.3772 + } + }, + { + "model_id": "DreadPoor/Here_We_Go_Again-8B-SLERP", + "name": "Here_We_Go_Again-8B-SLERP", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7442, + "BBH": 0.546, + "MATH Level 5": 0.173, + "GPQA": 0.3188, + "MUSR": 0.4187, + "MMLU-PRO": 0.3873 + } + }, + { + "model_id": "DreadPoor/Howdy-8B-LINEAR", + "name": "Howdy-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7378, + "BBH": 0.5384, + "MATH Level 5": 0.1775, + "GPQA": 0.3146, + "MUSR": 0.4121, + "MMLU-PRO": 0.3807 + } + }, + { + "model_id": "DreadPoor/Incidental-8B-Model_Stock", + "name": "Incidental-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7482, + "BBH": 0.5452, + "MATH Level 5": 0.1616, + "GPQA": 0.3029, + "MUSR": 0.424, + "MMLU-PRO": 0.3873 + } + }, + { + "model_id": "DreadPoor/Irina-8B-model_stock", + "name": "Irina-8B-model_stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.6799, + "BBH": 0.5237, + "MATH Level 5": 0.102, + "GPQA": 0.2844, + "MUSR": 0.4003, + "MMLU-PRO": 0.3574 + } + }, + { + "model_id": "DreadPoor/Kindling-8B-Model_Stock", + "name": "Kindling-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7308, + "BBH": 0.5492, + "MATH Level 5": 0.1752, + "GPQA": 0.318, + "MUSR": 0.4068, + "MMLU-PRO": 0.383 + } + }, + { + "model_id": "DreadPoor/L3.1-BaeZel-8B-Della", + "name": "L3.1-BaeZel-8B-Della", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.518, + "BBH": 0.5448, + "MATH Level 5": 0.1745, + "GPQA": 0.3196, + "MUSR": 0.42, + "MMLU-PRO": 0.3902 + } + }, + { + "model_id": "DreadPoor/Laughing_Stock-8B-Model_Stock", + "name": "Laughing_Stock-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.719, + "BBH": 0.5449, + "MATH Level 5": 0.1579, + "GPQA": 0.2894, + "MUSR": 0.4146, + "MMLU-PRO": 0.3764 + } + }, + { + "model_id": "DreadPoor/Lava_Lamp-8B-SLERP", + "name": "Lava_Lamp-8B-SLERP", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7381, + "BBH": 0.5368, + "MATH Level 5": 0.1737, + "GPQA": 0.3054, + "MUSR": 0.4187, + "MMLU-PRO": 0.375 + } + }, + { + "model_id": "DreadPoor/LemonP-8B-Model_Stock", + "name": "LemonP-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7676, + "BBH": 0.5439, + "MATH Level 5": 0.1767, + "GPQA": 0.3029, + "MUSR": 0.4081, + "MMLU-PRO": 0.4004 + } + }, + { + "model_id": "DreadPoor/Lydia_of_Whiterun-8B-LINEAR", + "name": "Lydia_of_Whiterun-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7603, + "BBH": 0.538, + "MATH Level 5": 0.1767, + "GPQA": 0.3163, + "MUSR": 0.4251, + "MMLU-PRO": 0.3801 + } + }, + { + "model_id": "DreadPoor/Matryoshka-8B-LINEAR", + "name": "Matryoshka-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7263, + "BBH": 0.5444, + "MATH Level 5": 0.1752, + "GPQA": 0.3205, + "MUSR": 0.4252, + "MMLU-PRO": 0.3866 + } + }, + { + "model_id": "DreadPoor/Mercury_In_Retrograde-8b-Model-Stock", + "name": "Mercury_In_Retrograde-8b-Model-Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7296, + "BBH": 0.5391, + "MATH Level 5": 0.1647, + "GPQA": 0.3163, + "MUSR": 0.4199, + "MMLU-PRO": 0.3829 + } + }, + { + "model_id": "DreadPoor/Minthy-8B-Model_Stock", + "name": "Minthy-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7658, + "BBH": 0.5353, + "MATH Level 5": 0.1918, + "GPQA": 0.3037, + "MUSR": 0.4094, + "MMLU-PRO": 0.3993 + } + }, + { + "model_id": "DreadPoor/Minthy_ALT-8B-Model_Stock", + "name": "Minthy_ALT-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.6992, + "BBH": 0.5375, + "MATH Level 5": 0.176, + "GPQA": 0.3062, + "MUSR": 0.4225, + "MMLU-PRO": 0.3674 + } + }, + { + "model_id": "DreadPoor/Minthy_V2-8B-Model_Stock", + "name": "Minthy_V2-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7126, + "BBH": 0.5491, + "MATH Level 5": 0.1594, + "GPQA": 0.2945, + "MUSR": 0.4199, + "MMLU-PRO": 0.3737 + } + }, + { + "model_id": "DreadPoor/Minus_Penus-8B-Model_Stock", + "name": "Minus_Penus-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7311, + "BBH": 0.5344, + "MATH Level 5": 0.2002, + "GPQA": 0.3096, + "MUSR": 0.4019, + "MMLU-PRO": 0.3752 + } + }, + { + "model_id": "DreadPoor/Morphing-8B-Model_Stock", + "name": "Morphing-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7445, + "BBH": 0.5397, + "MATH Level 5": 0.1888, + "GPQA": 0.2936, + "MUSR": 0.4069, + "MMLU-PRO": 0.3852 + } + }, + { + "model_id": "DreadPoor/Not_Even_My_Final_Form-8B-Model_Stock", + "name": "Not_Even_My_Final_Form-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7722, + "BBH": 0.5351, + "MATH Level 5": 0.176, + "GPQA": 0.2953, + "MUSR": 0.4147, + "MMLU-PRO": 0.384 + } + }, + { + "model_id": "DreadPoor/Nother_One-8B-Model_Stock", + "name": "Nother_One-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.6863, + "BBH": 0.5205, + "MATH Level 5": 0.1518, + "GPQA": 0.2894, + "MUSR": 0.387, + "MMLU-PRO": 0.3595 + } + }, + { + "model_id": "DreadPoor/Noxis-8B-LINEAR", + "name": "Noxis-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.6913, + "BBH": 0.5421, + "MATH Level 5": 0.1979, + "GPQA": 0.3188, + "MUSR": 0.4231, + "MMLU-PRO": 0.366 + } + }, + { + "model_id": "DreadPoor/Nullsworn-12B-LINEAR", + "name": "Nullsworn-12B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.4436, + "BBH": 0.5483, + "MATH Level 5": 0.1125, + "GPQA": 0.3079, + "MUSR": 0.435, + "MMLU-PRO": 0.3645 + } + }, + { + "model_id": "DreadPoor/Nwah-8B-Model_Stock", + "name": "Nwah-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7716, + "BBH": 0.5384, + "MATH Level 5": 0.1798, + "GPQA": 0.3104, + "MUSR": 0.4039, + "MMLU-PRO": 0.3807 + } + }, + { + "model_id": "DreadPoor/ONeil-model_stock-8B", + "name": "ONeil-model_stock-8B", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.6786, + "BBH": 0.5548, + "MATH Level 5": 0.1012, + "GPQA": 0.3054, + "MUSR": 0.4173, + "MMLU-PRO": 0.3599 + } + }, + { + "model_id": "DreadPoor/Oh_Boy-8B-LINEAR", + "name": "Oh_Boy-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7503, + "BBH": 0.5375, + "MATH Level 5": 0.1782, + "GPQA": 0.3079, + "MUSR": 0.4108, + "MMLU-PRO": 0.3849 + } + }, + { + "model_id": "DreadPoor/OrangeJ-8B-Model_Stock", + "name": "OrangeJ-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7841, + "BBH": 0.5413, + "MATH Level 5": 0.176, + "GPQA": 0.3012, + "MUSR": 0.4028, + "MMLU-PRO": 0.3969 + } + }, + { + "model_id": "DreadPoor/Promissum_Mane-8B-LINEAR", + "name": "Promissum_Mane-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.715, + "BBH": 0.5458, + "MATH Level 5": 0.1556, + "GPQA": 0.3045, + "MUSR": 0.42, + "MMLU-PRO": 0.3851 + } + }, + { + "model_id": "DreadPoor/Promissum_Mane-8B-LINEAR-lorablated", + "name": "Promissum_Mane-8B-LINEAR-lorablated", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7156, + "BBH": 0.5435, + "MATH Level 5": 0.1533, + "GPQA": 0.3037, + "MUSR": 0.4198, + "MMLU-PRO": 0.3739 + } + }, + { + "model_id": "DreadPoor/RPMash-8B-Model_Stock", + "name": "RPMash-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.4564, + "BBH": 0.5169, + "MATH Level 5": 0.108, + "GPQA": 0.2869, + "MUSR": 0.4054, + "MMLU-PRO": 0.3604 + } + }, + { + "model_id": "DreadPoor/RPMash_V3-8B-Model_Stock", + "name": "RPMash_V3-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7049, + "BBH": 0.5217, + "MATH Level 5": 0.1042, + "GPQA": 0.3003, + "MUSR": 0.3778, + "MMLU-PRO": 0.3614 + } + }, + { + "model_id": "DreadPoor/Rusted_Gold-8B-LINEAR", + "name": "Rusted_Gold-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7296, + "BBH": 0.5387, + "MATH Level 5": 0.1934, + "GPQA": 0.2987, + "MUSR": 0.4178, + "MMLU-PRO": 0.378 + } + }, + { + "model_id": "DreadPoor/Rusted_Platinum-8B-LINEAR", + "name": "Rusted_Platinum-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.718, + "BBH": 0.5428, + "MATH Level 5": 0.1722, + "GPQA": 0.2802, + "MUSR": 0.3967, + "MMLU-PRO": 0.373 + } + }, + { + "model_id": "DreadPoor/Rusted_Platinum-8B-Model_Stock", + "name": "Rusted_Platinum-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.4408, + "BBH": 0.5243, + "MATH Level 5": 0.102, + "GPQA": 0.2685, + "MUSR": 0.3741, + "MMLU-PRO": 0.3546 + } + }, + { + "model_id": "DreadPoor/Sellen-8B-model_stock", + "name": "Sellen-8B-model_stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7113, + "BBH": 0.5232, + "MATH Level 5": 0.1337, + "GPQA": 0.2743, + "MUSR": 0.396, + "MMLU-PRO": 0.357 + } + }, + { + "model_id": "DreadPoor/Something-8B-Model_Stock", + "name": "Something-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.5043, + "BBH": 0.5395, + "MATH Level 5": 0.1798, + "GPQA": 0.3171, + "MUSR": 0.4187, + "MMLU-PRO": 0.3885 + } + }, + { + "model_id": "DreadPoor/Spring_Dusk-8B-SCE", + "name": "Spring_Dusk-8B-SCE", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.6515, + "BBH": 0.5635, + "MATH Level 5": 0.0763, + "GPQA": 0.2878, + "MUSR": 0.46, + "MMLU-PRO": 0.3436 + } + }, + { + "model_id": "DreadPoor/Summer_Dawn-8B-SCE", + "name": "Summer_Dawn-8B-SCE", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.6642, + "BBH": 0.5391, + "MATH Level 5": 0.1722, + "GPQA": 0.3003, + "MUSR": 0.412, + "MMLU-PRO": 0.3753 + } + }, + { + "model_id": "DreadPoor/Summer_Dusk-8B-TIES", + "name": "Summer_Dusk-8B-TIES", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.4922, + "BBH": 0.536, + "MATH Level 5": 0.1805, + "GPQA": 0.307, + "MUSR": 0.4267, + "MMLU-PRO": 0.3856 + } + }, + { + "model_id": "DreadPoor/Summer_Rain-8B-SCE", + "name": "Summer_Rain-8B-SCE", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.5459, + "BBH": 0.5846, + "MATH Level 5": 0.0702, + "GPQA": 0.3037, + "MUSR": 0.4477, + "MMLU-PRO": 0.3551 + } + }, + { + "model_id": "DreadPoor/Summer_Rain-8B-TIES", + "name": "Summer_Rain-8B-TIES", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.5444, + "BBH": 0.5846, + "MATH Level 5": 0.0702, + "GPQA": 0.3037, + "MUSR": 0.4477, + "MMLU-PRO": 0.3551 + } + }, + { + "model_id": "DreadPoor/Sun-8B-Model_Stock", + "name": "Sun-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7758, + "BBH": 0.5264, + "MATH Level 5": 0.21, + "GPQA": 0.2995, + "MUSR": 0.4098, + "MMLU-PRO": 0.3835 + } + }, + { + "model_id": "DreadPoor/Sweetened_Condensed_Milk-8B-Model_Stock", + "name": "Sweetened_Condensed_Milk-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7417, + "BBH": 0.5406, + "MATH Level 5": 0.1873, + "GPQA": 0.3029, + "MUSR": 0.4107, + "MMLU-PRO": 0.3848 + } + }, + { + "model_id": "DreadPoor/TEST02-Ignore", + "name": "TEST02-Ignore", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.6119, + "BBH": 0.5602, + "MATH Level 5": 0.0869, + "GPQA": 0.2844, + "MUSR": 0.4199, + "MMLU-PRO": 0.3468 + } + }, + { + "model_id": "DreadPoor/TEST03-ignore", + "name": "TEST03-ignore", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.6967, + "BBH": 0.5383, + "MATH Level 5": 0.1654, + "GPQA": 0.3087, + "MUSR": 0.4186, + "MMLU-PRO": 0.3789 + } + }, + { + "model_id": "DreadPoor/TEST06-ignore", + "name": "TEST06-ignore", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7323, + "BBH": 0.5509, + "MATH Level 5": 0.1178, + "GPQA": 0.2869, + "MUSR": 0.4225, + "MMLU-PRO": 0.3615 + } + }, + { + "model_id": "DreadPoor/TEST07-ignore", + "name": "TEST07-ignore", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.74, + "BBH": 0.5561, + "MATH Level 5": 0.1662, + "GPQA": 0.3087, + "MUSR": 0.4094, + "MMLU-PRO": 0.388 + } + }, + { + "model_id": "DreadPoor/TEST08-ignore", + "name": "TEST08-ignore", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7467, + "BBH": 0.5454, + "MATH Level 5": 0.182, + "GPQA": 0.3129, + "MUSR": 0.4081, + "MMLU-PRO": 0.3853 + } + }, + { + "model_id": "DreadPoor/Trinas_Nectar-8B-model_stock", + "name": "Trinas_Nectar-8B-model_stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7259, + "BBH": 0.5256, + "MATH Level 5": 0.1526, + "GPQA": 0.2861, + "MUSR": 0.4068, + "MMLU-PRO": 0.3618 + } + }, + { + "model_id": "DreadPoor/UNTESTED-VENN_1.2-8B-Model_Stock", + "name": "UNTESTED-VENN_1.2-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.4718, + "BBH": 0.5475, + "MATH Level 5": 0.1541, + "GPQA": 0.3154, + "MUSR": 0.4449, + "MMLU-PRO": 0.3787 + } + }, + { + "model_id": "DreadPoor/VENN_1.2-8B-Model_Stock", + "name": "VENN_1.2-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7226, + "BBH": 0.5459, + "MATH Level 5": 0.1707, + "GPQA": 0.297, + "MUSR": 0.42, + "MMLU-PRO": 0.3721 + } + }, + { + "model_id": "DreadPoor/WIP-Acacia-8B-Model_Stock", + "name": "WIP-Acacia-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.6246, + "BBH": 0.5195, + "MATH Level 5": 0.1669, + "GPQA": 0.3062, + "MUSR": 0.4226, + "MMLU-PRO": 0.3737 + } + }, + { + "model_id": "DreadPoor/WIP_Damascus-8B-TIES", + "name": "WIP_Damascus-8B-TIES", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.4776, + "BBH": 0.5411, + "MATH Level 5": 0.1654, + "GPQA": 0.307, + "MUSR": 0.4119, + "MMLU-PRO": 0.3761 + } + }, + { + "model_id": "DreadPoor/Wannabe-8B-Model_Stock", + "name": "Wannabe-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7205, + "BBH": 0.539, + "MATH Level 5": 0.1775, + "GPQA": 0.3012, + "MUSR": 0.4135, + "MMLU-PRO": 0.3831 + } + }, + { + "model_id": "DreadPoor/What_A_Thrill-8B-Model_Stock", + "name": "What_A_Thrill-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7064, + "BBH": 0.5311, + "MATH Level 5": 0.182, + "GPQA": 0.297, + "MUSR": 0.408, + "MMLU-PRO": 0.3615 + } + }, + { + "model_id": "DreadPoor/Winter-8B-SCE", + "name": "Winter-8B-SCE", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7536, + "BBH": 0.5262, + "MATH Level 5": 0.1918, + "GPQA": 0.2995, + "MUSR": 0.4071, + "MMLU-PRO": 0.3839 + } + }, + { + "model_id": "DreadPoor/Winter_Dawn-8B-TIES", + "name": "Winter_Dawn-8B-TIES", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.5496, + "BBH": 0.5309, + "MATH Level 5": 0.1858, + "GPQA": 0.3096, + "MUSR": 0.4279, + "MMLU-PRO": 0.391 + } + }, + { + "model_id": "DreadPoor/Winter_Dusk-8B-TIES", + "name": "Winter_Dusk-8B-TIES", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7153, + "BBH": 0.4952, + "MATH Level 5": 0.0718, + "GPQA": 0.2995, + "MUSR": 0.3688, + "MMLU-PRO": 0.3478 + } + }, + { + "model_id": "DreadPoor/Winter_Night-8B-Model_Stock", + "name": "Winter_Night-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.704, + "BBH": 0.5185, + "MATH Level 5": 0.1458, + "GPQA": 0.3062, + "MUSR": 0.3914, + "MMLU-PRO": 0.3666 + } + }, + { + "model_id": "DreadPoor/Yafune-8B-Model_Stock", + "name": "Yafune-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7533, + "BBH": 0.5467, + "MATH Level 5": 0.1662, + "GPQA": 0.3272, + "MUSR": 0.4173, + "MMLU-PRO": 0.3851 + } + }, + { + "model_id": "DreadPoor/Yearn_V3-8B-Model_Stock", + "name": "Yearn_V3-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.729, + "BBH": 0.5322, + "MATH Level 5": 0.1896, + "GPQA": 0.3054, + "MUSR": 0.3909, + "MMLU-PRO": 0.3802 + } + }, + { + "model_id": "DreadPoor/ZEUS-8B-V17-Abliterated_ALT", + "name": "ZEUS-8B-V17-Abliterated_ALT", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.5511, + "BBH": 0.5231, + "MATH Level 5": 0.1903, + "GPQA": 0.3079, + "MUSR": 0.4149, + "MMLU-PRO": 0.389 + } + }, + { + "model_id": "DreadPoor/Zelus-8B-Model_Stock", + "name": "Zelus-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7788, + "BBH": 0.5307, + "MATH Level 5": 0.1647, + "GPQA": 0.3062, + "MUSR": 0.4214, + "MMLU-PRO": 0.3841 + } + }, + { + "model_id": "DreadPoor/Zelus_V2-8B-Model_Stock", + "name": "Zelus_V2-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7898, + "BBH": 0.5345, + "MATH Level 5": 0.2054, + "GPQA": 0.3096, + "MUSR": 0.3961, + "MMLU-PRO": 0.3833 + } + }, + { + "model_id": "DreadPoor/felix_dies-mistral-7B-model_stock", + "name": "felix_dies-mistral-7B-model_stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.3008, + "BBH": 0.4901, + "MATH Level 5": 0.0536, + "GPQA": 0.2919, + "MUSR": 0.4518, + "MMLU-PRO": 0.3109 + } + }, + { + "model_id": "DreadPoor/hakuchido-8B-MODEL_STOCK", + "name": "hakuchido-8B-MODEL_STOCK", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7375, + "BBH": 0.5398, + "MATH Level 5": 0.1949, + "GPQA": 0.2953, + "MUSR": 0.4175, + "MMLU-PRO": 0.3782 + } + }, + { + "model_id": "DreadPoor/ichor-8B-Model_Stock", + "name": "ichor-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.5386, + "BBH": 0.5084, + "MATH Level 5": 0.1088, + "GPQA": 0.3238, + "MUSR": 0.4212, + "MMLU-PRO": 0.3151 + } + }, + { + "model_id": "DreadPoor/ichor_1.1-8B-Model_Stock", + "name": "ichor_1.1-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.8096, + "BBH": 0.5281, + "MATH Level 5": 0.1775, + "GPQA": 0.3062, + "MUSR": 0.4068, + "MMLU-PRO": 0.3856 + } + }, + { + "model_id": "DreadPoor/inexpertus-8B-Model_Stock", + "name": "inexpertus-8B-Model_Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7795, + "BBH": 0.528, + "MATH Level 5": 0.1707, + "GPQA": 0.3096, + "MUSR": 0.4118, + "MMLU-PRO": 0.3791 + } + }, + { + "model_id": "DreadPoor/inexpertus_1.1-8B-LINEAR", + "name": "inexpertus_1.1-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7527, + "BBH": 0.5525, + "MATH Level 5": 0.173, + "GPQA": 0.2978, + "MUSR": 0.4173, + "MMLU-PRO": 0.3827 + } + }, + { + "model_id": "DreadPoor/inexpertus_1.2-8B-LINEAR", + "name": "inexpertus_1.2-8B-LINEAR", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7348, + "BBH": 0.5523, + "MATH Level 5": 0.1586, + "GPQA": 0.2953, + "MUSR": 0.4133, + "MMLU-PRO": 0.3788 + } + }, + { + "model_id": "DreadPoor/mergekit-nuslerp-nqzkedi", + "name": "mergekit-nuslerp-nqzkedi", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7765, + "BBH": 0.5362, + "MATH Level 5": 0.1881, + "GPQA": 0.3012, + "MUSR": 0.4225, + "MMLU-PRO": 0.3919 + } + }, + { + "model_id": "DreadPoor/remember_to_breathe-8b-Model-Stock", + "name": "remember_to_breathe-8b-Model-Stock", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7104, + "BBH": 0.5412, + "MATH Level 5": 0.1488, + "GPQA": 0.3012, + "MUSR": 0.4145, + "MMLU-PRO": 0.3761 + } + }, + { + "model_id": "DreadPoor/test", + "name": "test", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.4937, + "BBH": 0.5372, + "MATH Level 5": 0.1934, + "GPQA": 0.271, + "MUSR": 0.4351, + "MMLU-PRO": 0.3647 + } + }, + { + "model_id": "DreadPoor/test_ALT", + "name": "test_ALT", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.4997, + "BBH": 0.537, + "MATH Level 5": 0.1707, + "GPQA": 0.2693, + "MUSR": 0.4363, + "MMLU-PRO": 0.3492 + } + }, + { + "model_id": "DreadPoor/tests_pending-do_not_use_yet", + "name": "tests_pending-do_not_use_yet", + "developer": "DreadPoor", + "scores": { + "IFEval": 0.7691, + "BBH": 0.5408, + "MATH Level 5": 0.1979, + "GPQA": 0.297, + "MUSR": 0.4005, + "MMLU-PRO": 0.3827 + } + }, + { + "model_id": "ECE-ILAB-PRYMMAL/ILAB-Merging-3B-V2", + "name": "ILAB-Merging-3B-V2", + "developer": "ECE-ILAB-PRYMMAL", + "scores": { + "IFEval": 0.4029, + "BBH": 0.5402, + "MATH Level 5": 0.1518, + "GPQA": 0.3054, + "MUSR": 0.4332, + "MMLU-PRO": 0.3861 + } + }, + { + "model_id": "EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2", + "name": "EVA-Qwen2.5-14B-v0.2", + "developer": "EVA-UNIT-01", + "scores": { + "IFEval": 0.4038, + "BBH": 0.609, + "MATH Level 5": 0.3406, + "GPQA": 0.3943, + "MUSR": 0.4794, + "MMLU-PRO": 0.5135 + } + }, + { + "model_id": "EVA-UNIT-01/EVA-Qwen2.5-72B-v0.2", + "name": "EVA-Qwen2.5-72B-v0.2", + "developer": "EVA-UNIT-01", + "scores": { + "IFEval": 0.6879, + "BBH": 0.7088, + "MATH Level 5": 0.4313, + "GPQA": 0.4086, + "MUSR": 0.472, + "MMLU-PRO": 0.5813 + } + }, + { + "model_id": "Edgerunners/meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16", + "name": "meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16", + "developer": "Edgerunners", + "scores": { + "IFEval": 0.7147, + "BBH": 0.498, + "MATH Level 5": 0.0906, + "GPQA": 0.2601, + "MUSR": 0.3342, + "MMLU-PRO": 0.3636 + } + }, + { + "model_id": "EleutherAI/gpt-j-6b", + "name": "gpt-j-6b", + "developer": "EleutherAI", + "scores": { + "IFEval": 0.2522, + "BBH": 0.3191, + "MATH Level 5": 0.0136, + "GPQA": 0.2458, + "MUSR": 0.3658, + "MMLU-PRO": 0.1241 + } + }, + { + "model_id": "EleutherAI/gpt-neo-1.3B", + "name": "gpt-neo-1.3B", + "developer": "EleutherAI", + "scores": { + "IFEval": 0.2079, + "BBH": 0.3039, + "MATH Level 5": 0.0106, + "GPQA": 0.2559, + "MUSR": 0.3817, + "MMLU-PRO": 0.1164 + } + }, + { + "model_id": "EleutherAI/gpt-neo-125m", + "name": "gpt-neo-125m", + "developer": "EleutherAI", + "scores": { + "IFEval": 0.1905, + "BBH": 0.3115, + "MATH Level 5": 0.006, + "GPQA": 0.2534, + "MUSR": 0.3593, + "MMLU-PRO": 0.1026 + } + }, + { + "model_id": "EleutherAI/gpt-neo-2.7B", + "name": "gpt-neo-2.7B", + "developer": "EleutherAI", + "scores": { + "IFEval": 0.259, + "BBH": 0.314, + "MATH Level 5": 0.0106, + "GPQA": 0.2659, + "MUSR": 0.3554, + "MMLU-PRO": 0.1163 + } + }, + { + "model_id": "EleutherAI/gpt-neox-20b", + "name": "gpt-neox-20b", + "developer": "EleutherAI", + "scores": { + "IFEval": 0.2587, + "BBH": 0.3165, + "MATH Level 5": 0.0136, + "GPQA": 0.2433, + "MUSR": 0.3647, + "MMLU-PRO": 0.1155 + } + }, + { + "model_id": "EleutherAI/pythia-1.4b", + "name": "pythia-1.4b", + "developer": "EleutherAI", + "scores": { + "IFEval": 0.2371, + "BBH": 0.315, + "MATH Level 5": 0.0151, + "GPQA": 0.2617, + "MUSR": 0.3538, + "MMLU-PRO": 0.1123 + } + }, + { + "model_id": "EleutherAI/pythia-12b", + "name": "pythia-12b", + "developer": "EleutherAI", + "scores": { + "IFEval": 0.2471, + "BBH": 0.318, + "MATH Level 5": 0.0166, + "GPQA": 0.2466, + "MUSR": 0.3647, + "MMLU-PRO": 0.1109 + } + }, + { + "model_id": "EleutherAI/pythia-160m", + "name": "pythia-160m", + "developer": "EleutherAI", + "scores": { + "IFEval": 0.1816, + "BBH": 0.297, + "MATH Level 5": 0.0091, + "GPQA": 0.2584, + "MUSR": 0.4179, + "MMLU-PRO": 0.112 + } + }, + { + "model_id": "EleutherAI/pythia-1b", + "name": "pythia-1b", + "developer": "EleutherAI", + "scores": { + "IFEval": 0.2208, + "BBH": 0.3004, + "MATH Level 5": 0.0091, + "GPQA": 0.2567, + "MUSR": 0.3552, + "MMLU-PRO": 0.1136 + } + }, + { + "model_id": "EleutherAI/pythia-2.8b", + "name": "pythia-2.8b", + "developer": "EleutherAI", + "scores": { + "IFEval": 0.2173, + "BBH": 0.3224, + "MATH Level 5": 0.0136, + "GPQA": 0.25, + "MUSR": 0.3486, + "MMLU-PRO": 0.1137 + } + }, + { + "model_id": "EleutherAI/pythia-410m", + "name": "pythia-410m", + "developer": "EleutherAI", + "scores": { + "IFEval": 0.2195, + "BBH": 0.3028, + "MATH Level 5": 0.0098, + "GPQA": 0.2592, + "MUSR": 0.3578, + "MMLU-PRO": 0.1128 + } + }, + { + "model_id": "EleutherAI/pythia-6.9b", + "name": "pythia-6.9b", + "developer": "EleutherAI", + "scores": { + "IFEval": 0.2281, + "BBH": 0.3232, + "MATH Level 5": 0.0144, + "GPQA": 0.2517, + "MUSR": 0.3591, + "MMLU-PRO": 0.1147 + } + }, + { + "model_id": "Enno-Ai/EnnoAi-Pro-French-Llama-3-8B-v0.4", + "name": "EnnoAi-Pro-French-Llama-3-8B-v0.4", + "developer": "Enno-Ai", + "scores": { + "IFEval": 0.4189, + "BBH": 0.4075, + "MATH Level 5": 0.0363, + "GPQA": 0.271, + "MUSR": 0.417, + "MMLU-PRO": 0.2635 + } + }, + { + "model_id": "Enno-Ai/EnnoAi-Pro-Llama-3-8B", + "name": "EnnoAi-Pro-Llama-3-8B", + "developer": "Enno-Ai", + "scores": { + "IFEval": 0.3195, + "BBH": 0.4152, + "MATH Level 5": 0.0219, + "GPQA": 0.2617, + "MUSR": 0.4071, + "MMLU-PRO": 0.2151 + } + }, + { + "model_id": "Enno-Ai/EnnoAi-Pro-Llama-3-8B-v0.3", + "name": "EnnoAi-Pro-Llama-3-8B-v0.3", + "developer": "Enno-Ai", + "scores": { + "IFEval": 0.5083, + "BBH": 0.4101, + "MATH Level 5": 0.0483, + "GPQA": 0.2651, + "MUSR": 0.4236, + "MMLU-PRO": 0.299 + } + }, + { + "model_id": "Enno-Ai/EnnoAi-Pro-Llama-3.1-8B-v0.9", + "name": "EnnoAi-Pro-Llama-3.1-8B-v0.9", + "developer": "Enno-Ai", + "scores": { + "IFEval": 0.4689, + "BBH": 0.416, + "MATH Level 5": 0.0378, + "GPQA": 0.2659, + "MUSR": 0.3832, + "MMLU-PRO": 0.2596 + } + }, + { + "model_id": "EnnoAi/EnnoAi-7B-French-Instruct-202502", + "name": "EnnoAi-7B-French-Instruct-202502", + "developer": "EnnoAi", + "scores": { + "IFEval": 0.5564, + "BBH": 0.5575, + "MATH Level 5": 0.3724, + "GPQA": 0.2953, + "MUSR": 0.46, + "MMLU-PRO": 0.4013 + } + }, + { + "model_id": "EnnoAi/EnnoAi-Pro-Llama-3.1-8B-v1.0", + "name": "EnnoAi-Pro-Llama-3.1-8B-v1.0", + "developer": "EnnoAi", + "scores": { + "IFEval": 0.4704, + "BBH": 0.416, + "MATH Level 5": 0.0378, + "GPQA": 0.2659, + "MUSR": 0.3832, + "MMLU-PRO": 0.2596 + } + }, + { + "model_id": "Epiculous/Azure_Dusk-v0.2", + "name": "Azure_Dusk-v0.2", + "developer": "Epiculous", + "scores": { + "IFEval": 0.3467, + "BBH": 0.412, + "MATH Level 5": 0.0295, + "GPQA": 0.2609, + "MUSR": 0.3835, + "MMLU-PRO": 0.3034 + } + }, + { + "model_id": "Epiculous/Crimson_Dawn-v0.2", + "name": "Crimson_Dawn-v0.2", + "developer": "Epiculous", + "scores": { + "IFEval": 0.3103, + "BBH": 0.4482, + "MATH Level 5": 0.0431, + "GPQA": 0.276, + "MUSR": 0.4152, + "MMLU-PRO": 0.2721 + } + }, + { + "model_id": "Epiculous/NovaSpark", + "name": "NovaSpark", + "developer": "Epiculous", + "scores": { + "IFEval": 0.6408, + "BBH": 0.5064, + "MATH Level 5": 0.1518, + "GPQA": 0.2978, + "MUSR": 0.3882, + "MMLU-PRO": 0.3649 + } + }, + { + "model_id": "Epiculous/Violet_Twilight-v0.2", + "name": "Violet_Twilight-v0.2", + "developer": "Epiculous", + "scores": { + "IFEval": 0.4532, + "BBH": 0.4615, + "MATH Level 5": 0.0287, + "GPQA": 0.2659, + "MUSR": 0.4299, + "MMLU-PRO": 0.3111 + } + }, + { + "model_id": "EpistemeAI/Alpaca-Llama3.1-8B", + "name": "Alpaca-Llama3.1-8B", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.1599, + "BBH": 0.4755, + "MATH Level 5": 0.0506, + "GPQA": 0.2903, + "MUSR": 0.3403, + "MMLU-PRO": 0.3246 + } + }, + { + "model_id": "EpistemeAI/Athena-gemma-2-2b-it", + "name": "Athena-gemma-2-2b-it", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.3134, + "BBH": 0.4264, + "MATH Level 5": 0.0491, + "GPQA": 0.2685, + "MUSR": 0.4351, + "MMLU-PRO": 0.2422 + } + }, + { + "model_id": "EpistemeAI/Athena-gemma-2-2b-it-Philos", + "name": "Athena-gemma-2-2b-it-Philos", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.4621, + "BBH": 0.3795, + "MATH Level 5": 0.037, + "GPQA": 0.281, + "MUSR": 0.4314, + "MMLU-PRO": 0.2248 + } + }, + { + "model_id": "EpistemeAI/Athene-codegemma-2-7b-it-alpaca-v1.3", + "name": "Athene-codegemma-2-7b-it-alpaca-v1.3", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.403, + "BBH": 0.4332, + "MATH Level 5": 0.0619, + "GPQA": 0.2802, + "MUSR": 0.4503, + "MMLU-PRO": 0.2587 + } + }, + { + "model_id": "EpistemeAI/DeepPhi-3.5-mini-instruct", + "name": "DeepPhi-3.5-mini-instruct", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.1326, + "BBH": 0.2882, + "MATH Level 5": 0.0068, + "GPQA": 0.2332, + "MUSR": 0.3656, + "MMLU-PRO": 0.1103 + } + }, + { + "model_id": "EpistemeAI/DeepThinkers-Phi4", + "name": "DeepThinkers-Phi4", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.694, + "BBH": 0.679, + "MATH Level 5": 0.4585, + "GPQA": 0.3406, + "MUSR": 0.3981, + "MMLU-PRO": 0.5258 + } + }, + { + "model_id": "EpistemeAI/FineLlama3.1-8B-Instruct", + "name": "FineLlama3.1-8B-Instruct", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.08, + "BBH": 0.4557, + "MATH Level 5": 0.0347, + "GPQA": 0.2802, + "MUSR": 0.3482, + "MMLU-PRO": 0.3113 + } + }, + { + "model_id": "EpistemeAI/Fireball-12B", + "name": "Fireball-12B", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.1834, + "BBH": 0.5111, + "MATH Level 5": 0.0408, + "GPQA": 0.2617, + "MUSR": 0.4236, + "MMLU-PRO": 0.3344 + } + }, + { + "model_id": "EpistemeAI/Fireball-12B-v1.13a-philosophers", + "name": "Fireball-12B-v1.13a-philosophers", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.0876, + "BBH": 0.5103, + "MATH Level 5": 0.0461, + "GPQA": 0.3012, + "MUSR": 0.4081, + "MMLU-PRO": 0.3367 + } + }, + { + "model_id": "EpistemeAI/Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200", + "name": "Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.4577, + "BBH": 0.4838, + "MATH Level 5": 0.1231, + "GPQA": 0.3003, + "MUSR": 0.3945, + "MMLU-PRO": 0.3583 + } + }, + { + "model_id": "EpistemeAI/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta", + "name": "Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.7274, + "BBH": 0.4865, + "MATH Level 5": 0.1526, + "GPQA": 0.2802, + "MUSR": 0.3619, + "MMLU-PRO": 0.3543 + } + }, + { + "model_id": "EpistemeAI/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2", + "name": "Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.4673, + "BBH": 0.4932, + "MATH Level 5": 0.1239, + "GPQA": 0.2861, + "MUSR": 0.4624, + "MMLU-PRO": 0.3352 + } + }, + { + "model_id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.4432, + "BBH": 0.4824, + "MATH Level 5": 0.1329, + "GPQA": 0.3121, + "MUSR": 0.4066, + "MMLU-PRO": 0.3516 + } + }, + { + "model_id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.4457, + "BBH": 0.4897, + "MATH Level 5": 0.1208, + "GPQA": 0.2945, + "MUSR": 0.3762, + "MMLU-PRO": 0.3543 + } + }, + { + "model_id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.5975, + "BBH": 0.4904, + "MATH Level 5": 0.1337, + "GPQA": 0.302, + "MUSR": 0.401, + "MMLU-PRO": 0.3423 + } + }, + { + "model_id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.6691, + "BBH": 0.4668, + "MATH Level 5": 0.1337, + "GPQA": 0.2727, + "MUSR": 0.3418, + "MMLU-PRO": 0.3389 + } + }, + { + "model_id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.7207, + "BBH": 0.461, + "MATH Level 5": 0.1314, + "GPQA": 0.2701, + "MUSR": 0.3432, + "MMLU-PRO": 0.3354 + } + }, + { + "model_id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.4578, + "BBH": 0.4761, + "MATH Level 5": 0.1382, + "GPQA": 0.2936, + "MUSR": 0.3881, + "MMLU-PRO": 0.3471 + } + }, + { + "model_id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.7205, + "BBH": 0.4818, + "MATH Level 5": 0.1435, + "GPQA": 0.2483, + "MUSR": 0.33, + "MMLU-PRO": 0.3548 + } + }, + { + "model_id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Math", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Math", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.4623, + "BBH": 0.4983, + "MATH Level 5": 0.108, + "GPQA": 0.2911, + "MUSR": 0.3641, + "MMLU-PRO": 0.3331 + } + }, + { + "model_id": "EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO", + "name": "Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.4611, + "BBH": 0.4801, + "MATH Level 5": 0.1254, + "GPQA": 0.3003, + "MUSR": 0.3998, + "MMLU-PRO": 0.3521 + } + }, + { + "model_id": "EpistemeAI/Fireball-Mistral-Nemo-Base-2407-v1-DPO2", + "name": "Fireball-Mistral-Nemo-Base-2407-v1-DPO2", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.1861, + "BBH": 0.4968, + "MATH Level 5": 0.0363, + "GPQA": 0.2919, + "MUSR": 0.404, + "MMLU-PRO": 0.3353 + } + }, + { + "model_id": "EpistemeAI/Fireball-R1-Llama-3.1-8B", + "name": "Fireball-R1-Llama-3.1-8B", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.4427, + "BBH": 0.3643, + "MATH Level 5": 0.3112, + "GPQA": 0.2483, + "MUSR": 0.3288, + "MMLU-PRO": 0.1115 + } + }, + { + "model_id": "EpistemeAI/Fireball-R1-Llama-3.1-8B-Medical-COT", + "name": "Fireball-R1-Llama-3.1-8B-Medical-COT", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.3216, + "BBH": 0.3716, + "MATH Level 5": 0.327, + "GPQA": 0.2743, + "MUSR": 0.3114, + "MMLU-PRO": 0.1402 + } + }, + { + "model_id": "EpistemeAI/Fireball-R1.1-Llama-3.1-8B", + "name": "Fireball-R1.1-Llama-3.1-8B", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.3676, + "BBH": 0.3326, + "MATH Level 5": 0.1382, + "GPQA": 0.2517, + "MUSR": 0.3419, + "MMLU-PRO": 0.1115 + } + }, + { + "model_id": "EpistemeAI/Llama-3.2-3B-Agent007-Coder", + "name": "Llama-3.2-3B-Agent007-Coder", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.54, + "BBH": 0.4304, + "MATH Level 5": 0.111, + "GPQA": 0.2576, + "MUSR": 0.3668, + "MMLU-PRO": 0.2852 + } + }, + { + "model_id": "EpistemeAI/Mistral-Nemo-Instruct-12B-Philosophy-Math", + "name": "Mistral-Nemo-Instruct-12B-Philosophy-Math", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.0695, + "BBH": 0.5365, + "MATH Level 5": 0.0959, + "GPQA": 0.3314, + "MUSR": 0.4292, + "MMLU-PRO": 0.3296 + } + }, + { + "model_id": "EpistemeAI/OpenReasoner-Llama-3.2-3B-rs1.0", + "name": "OpenReasoner-Llama-3.2-3B-rs1.0", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.7274, + "BBH": 0.4519, + "MATH Level 5": 0.1344, + "GPQA": 0.2718, + "MUSR": 0.3461, + "MMLU-PRO": 0.3134 + } + }, + { + "model_id": "EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy", + "name": "Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.7101, + "BBH": 0.4628, + "MATH Level 5": 0.1397, + "GPQA": 0.2768, + "MUSR": 0.3195, + "MMLU-PRO": 0.3311 + } + }, + { + "model_id": "EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic", + "name": "Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.7122, + "BBH": 0.4566, + "MATH Level 5": 0.1246, + "GPQA": 0.2844, + "MUSR": 0.3235, + "MMLU-PRO": 0.335 + } + }, + { + "model_id": "EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent", + "name": "Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.6915, + "BBH": 0.4525, + "MATH Level 5": 0.1292, + "GPQA": 0.2668, + "MUSR": 0.3578, + "MMLU-PRO": 0.329 + } + }, + { + "model_id": "EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT", + "name": "Reasoning-Llama-3.1-CoT-RE1-NMT", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.4829, + "BBH": 0.4736, + "MATH Level 5": 0.1299, + "GPQA": 0.2609, + "MUSR": 0.3182, + "MMLU-PRO": 0.3343 + } + }, + { + "model_id": "EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO", + "name": "Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.4553, + "BBH": 0.4804, + "MATH Level 5": 0.1292, + "GPQA": 0.307, + "MUSR": 0.3931, + "MMLU-PRO": 0.3598 + } + }, + { + "model_id": "EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.2", + "name": "Reasoning-Llama-3.2-1B-Instruct-v1.2", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.4087, + "BBH": 0.3324, + "MATH Level 5": 0.0506, + "GPQA": 0.2609, + "MUSR": 0.3222, + "MMLU-PRO": 0.1179 + } + }, + { + "model_id": "EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.3", + "name": "Reasoning-Llama-3.2-1B-Instruct-v1.3", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.3273, + "BBH": 0.3263, + "MATH Level 5": 0.0506, + "GPQA": 0.2584, + "MUSR": 0.326, + "MMLU-PRO": 0.1173 + } + }, + { + "model_id": "EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1", + "name": "Reasoning-Llama-3.2-3B-Math-Instruct-RE1", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.512, + "BBH": 0.4381, + "MATH Level 5": 0.108, + "GPQA": 0.2643, + "MUSR": 0.3435, + "MMLU-PRO": 0.2789 + } + }, + { + "model_id": "EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO", + "name": "Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.729, + "BBH": 0.4518, + "MATH Level 5": 0.1533, + "GPQA": 0.2735, + "MUSR": 0.3487, + "MMLU-PRO": 0.31 + } + }, + { + "model_id": "EpistemeAI/ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math", + "name": "ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.5903, + "BBH": 0.4364, + "MATH Level 5": 0.148, + "GPQA": 0.2601, + "MUSR": 0.3314, + "MMLU-PRO": 0.2823 + } + }, + { + "model_id": "EpistemeAI/ReasoningCore-3B-0", + "name": "ReasoningCore-3B-0", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.7341, + "BBH": 0.4446, + "MATH Level 5": 0.1586, + "GPQA": 0.2727, + "MUSR": 0.3554, + "MMLU-PRO": 0.3172 + } + }, + { + "model_id": "EpistemeAI/ReasoningCore-3B-Instruct-r01-Reflect", + "name": "ReasoningCore-3B-Instruct-r01-Reflect", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.7335, + "BBH": 0.445, + "MATH Level 5": 0.1541, + "GPQA": 0.2735, + "MUSR": 0.3527, + "MMLU-PRO": 0.3144 + } + }, + { + "model_id": "EpistemeAI/ReasoningCore-3B-R01", + "name": "ReasoningCore-3B-R01", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.2976, + "BBH": 0.4373, + "MATH Level 5": 0.1299, + "GPQA": 0.2609, + "MUSR": 0.3195, + "MMLU-PRO": 0.2591 + } + }, + { + "model_id": "EpistemeAI/ReasoningCore-3B-RE1-V2", + "name": "ReasoningCore-3B-RE1-V2", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.7393, + "BBH": 0.4462, + "MATH Level 5": 0.1563, + "GPQA": 0.2735, + "MUSR": 0.3541, + "MMLU-PRO": 0.3181 + } + }, + { + "model_id": "EpistemeAI/ReasoningCore-3B-RE1-V2A", + "name": "ReasoningCore-3B-RE1-V2A", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.5733, + "BBH": 0.419, + "MATH Level 5": 0.0929, + "GPQA": 0.2777, + "MUSR": 0.3352, + "MMLU-PRO": 0.2736 + } + }, + { + "model_id": "EpistemeAI/ReasoningCore-3B-RE1-V2B", + "name": "ReasoningCore-3B-RE1-V2B", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.5051, + "BBH": 0.4168, + "MATH Level 5": 0.1073, + "GPQA": 0.2617, + "MUSR": 0.3448, + "MMLU-PRO": 0.2673 + } + }, + { + "model_id": "EpistemeAI/ReasoningCore-3B-RE1-V2C", + "name": "ReasoningCore-3B-RE1-V2C", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.5057, + "BBH": 0.4177, + "MATH Level 5": 0.0974, + "GPQA": 0.2609, + "MUSR": 0.3422, + "MMLU-PRO": 0.2691 + } + }, + { + "model_id": "EpistemeAI/ReasoningCore-3B-T1-V1", + "name": "ReasoningCore-3B-T1-V1", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.7208, + "BBH": 0.4517, + "MATH Level 5": 0.1458, + "GPQA": 0.276, + "MUSR": 0.354, + "MMLU-PRO": 0.312 + } + }, + { + "model_id": "EpistemeAI/ReasoningCore-3B-T1_1", + "name": "ReasoningCore-3B-T1_1", + "developer": "EpistemeAI", + "scores": { + "IFEval": 0.7275, + "BBH": 0.4524, + "MATH Level 5": 0.1541, + "GPQA": 0.276, + "MUSR": 0.3554, + "MMLU-PRO": 0.3117 + } + }, + { + "model_id": "EpistemeAI2/Athene-codegemma-2-7b-it-alpaca-v1.2", + "name": "Athene-codegemma-2-7b-it-alpaca-v1.2", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.4351, + "BBH": 0.4175, + "MATH Level 5": 0.0423, + "GPQA": 0.271, + "MUSR": 0.417, + "MMLU-PRO": 0.2297 + } + }, + { + "model_id": "EpistemeAI2/Fireball-12B-v1.2", + "name": "Fireball-12B-v1.2", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.1355, + "BBH": 0.5019, + "MATH Level 5": 0.0415, + "GPQA": 0.2987, + "MUSR": 0.4173, + "MMLU-PRO": 0.3337 + } + }, + { + "model_id": "EpistemeAI2/Fireball-Alpaca-Llama3.1-8B-Philos", + "name": "Fireball-Alpaca-Llama3.1-8B-Philos", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.4986, + "BBH": 0.4978, + "MATH Level 5": 0.1186, + "GPQA": 0.2928, + "MUSR": 0.4277, + "MMLU-PRO": 0.3406 + } + }, + { + "model_id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.01-8B-Philos", + "name": "Fireball-Alpaca-Llama3.1.01-8B-Philos", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.4212, + "BBH": 0.4956, + "MATH Level 5": 0.136, + "GPQA": 0.2886, + "MUSR": 0.4371, + "MMLU-PRO": 0.3383 + } + }, + { + "model_id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.03-8B-Philos", + "name": "Fireball-Alpaca-Llama3.1.03-8B-Philos", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.3881, + "BBH": 0.4951, + "MATH Level 5": 0.1284, + "GPQA": 0.2785, + "MUSR": 0.428, + "MMLU-PRO": 0.3355 + } + }, + { + "model_id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.04-8B-Philos", + "name": "Fireball-Alpaca-Llama3.1.04-8B-Philos", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.4084, + "BBH": 0.493, + "MATH Level 5": 0.1201, + "GPQA": 0.2903, + "MUSR": 0.4372, + "MMLU-PRO": 0.3403 + } + }, + { + "model_id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo", + "name": "Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.4866, + "BBH": 0.4881, + "MATH Level 5": 0.1307, + "GPQA": 0.2978, + "MUSR": 0.3932, + "MMLU-PRO": 0.3615 + } + }, + { + "model_id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math", + "name": "Fireball-Alpaca-Llama3.1.07-8B-Philos-Math", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.5079, + "BBH": 0.4847, + "MATH Level 5": 0.1201, + "GPQA": 0.2961, + "MUSR": 0.4063, + "MMLU-PRO": 0.3531 + } + }, + { + "model_id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection", + "name": "Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.3952, + "BBH": 0.4955, + "MATH Level 5": 0.1246, + "GPQA": 0.2995, + "MUSR": 0.4048, + "MMLU-PRO": 0.3593 + } + }, + { + "model_id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1", + "name": "Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.5316, + "BBH": 0.4828, + "MATH Level 5": 0.1239, + "GPQA": 0.297, + "MUSR": 0.4103, + "MMLU-PRO": 0.3523 + } + }, + { + "model_id": "EpistemeAI2/Fireball-Llama-3.1-8B-Philos-Reflection", + "name": "Fireball-Llama-3.1-8B-Philos-Reflection", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.3596, + "BBH": 0.4898, + "MATH Level 5": 0.1284, + "GPQA": 0.3079, + "MUSR": 0.3957, + "MMLU-PRO": 0.3551 + } + }, + { + "model_id": "EpistemeAI2/Fireball-MathMistral-Nemo-Base-2407-v2dpo", + "name": "Fireball-MathMistral-Nemo-Base-2407-v2dpo", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.3097, + "BBH": 0.4328, + "MATH Level 5": 0.037, + "GPQA": 0.2634, + "MUSR": 0.403, + "MMLU-PRO": 0.1148 + } + }, + { + "model_id": "EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.5515, + "BBH": 0.4808, + "MATH Level 5": 0.1352, + "GPQA": 0.3045, + "MUSR": 0.3693, + "MMLU-PRO": 0.342 + } + }, + { + "model_id": "EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.4633, + "BBH": 0.4791, + "MATH Level 5": 0.1171, + "GPQA": 0.3121, + "MUSR": 0.3774, + "MMLU-PRO": 0.3565 + } + }, + { + "model_id": "EpistemeAI2/Fireball-Phi-3-medium-4k-inst-Philos", + "name": "Fireball-Phi-3-medium-4k-inst-Philos", + "developer": "EpistemeAI2", + "scores": { + "IFEval": 0.5313, + "BBH": 0.6178, + "MATH Level 5": 0.1707, + "GPQA": 0.3322, + "MUSR": 0.4139, + "MMLU-PRO": 0.4599 + } + }, + { + "model_id": "Eric111/CatunaMayo", + "name": "CatunaMayo", + "developer": "Eric111", + "scores": { + "IFEval": 0.4074, + "BBH": 0.5244, + "MATH Level 5": 0.0846, + "GPQA": 0.2919, + "MUSR": 0.454, + "MMLU-PRO": 0.3178 + } + }, + { + "model_id": "Eric111/CatunaMayo-DPO", + "name": "CatunaMayo-DPO", + "developer": "Eric111", + "scores": { + "IFEval": 0.4215, + "BBH": 0.5224, + "MATH Level 5": 0.0816, + "GPQA": 0.2919, + "MUSR": 0.445, + "MMLU-PRO": 0.317 + } + }, + { + "model_id": "Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties", + "name": "Chocolatine-3B-Instruct-DPO-Revised-Ties", + "developer": "Etherll", + "scores": { + "IFEval": 0.3725, + "BBH": 0.5411, + "MATH Level 5": 0.1631, + "GPQA": 0.3238, + "MUSR": 0.4649, + "MMLU-PRO": 0.3978 + } + }, + { + "model_id": "Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties-v2", + "name": "Chocolatine-3B-Instruct-DPO-Revised-Ties-v2", + "developer": "Etherll", + "scores": { + "IFEval": 0.374, + "BBH": 0.5411, + "MATH Level 5": 0.1631, + "GPQA": 0.3238, + "MUSR": 0.4649, + "MMLU-PRO": 0.3978 + } + }, + { + "model_id": "Etherll/Herplete-LLM-Llama-3.1-8b", + "name": "Herplete-LLM-Llama-3.1-8b", + "developer": "Etherll", + "scores": { + "IFEval": 0.6106, + "BBH": 0.5347, + "MATH Level 5": 0.1548, + "GPQA": 0.3146, + "MUSR": 0.3991, + "MMLU-PRO": 0.3752 + } + }, + { + "model_id": "Etherll/Herplete-LLM-Llama-3.1-8b-Ties", + "name": "Herplete-LLM-Llama-3.1-8b-Ties", + "developer": "Etherll", + "scores": { + "IFEval": 0.6164, + "BBH": 0.5338, + "MATH Level 5": 0.1601, + "GPQA": 0.3171, + "MUSR": 0.4017, + "MMLU-PRO": 0.3752 + } + }, + { + "model_id": "Etherll/Qwen2.5-7B-della-test", + "name": "Qwen2.5-7B-della-test", + "developer": "Etherll", + "scores": { + "IFEval": 0.7625, + "BBH": 0.5447, + "MATH Level 5": 0.4894, + "GPQA": 0.3087, + "MUSR": 0.4047, + "MMLU-PRO": 0.4361 + } + }, + { + "model_id": "Etherll/Qwen2.5-Coder-7B-Instruct-Ties", + "name": "Qwen2.5-Coder-7B-Instruct-Ties", + "developer": "Etherll", + "scores": { + "IFEval": 0.5005, + "BBH": 0.4895, + "MATH Level 5": 0.2915, + "GPQA": 0.3297, + "MUSR": 0.4373, + "MMLU-PRO": 0.3503 + } + }, + { + "model_id": "Etherll/Replete-LLM-V3-Llama-3.1-8b", + "name": "Replete-LLM-V3-Llama-3.1-8b", + "developer": "Etherll", + "scores": { + "IFEval": 0.5263, + "BBH": 0.4543, + "MATH Level 5": 0.2273, + "GPQA": 0.2685, + "MUSR": 0.3516, + "MMLU-PRO": 0.347 + } + }, + { + "model_id": "Etherll/SuperHermes", + "name": "SuperHermes", + "developer": "Etherll", + "scores": { + "IFEval": 0.5459, + "BBH": 0.529, + "MATH Level 5": 0.1654, + "GPQA": 0.3238, + "MUSR": 0.44, + "MMLU-PRO": 0.3949 + } + }, + { + "model_id": "Eurdem/Defne-llama3.1-8B", + "name": "Defne-llama3.1-8B", + "developer": "Eurdem", + "scores": { + "IFEval": 0.5036, + "BBH": 0.5321, + "MATH Level 5": 0.1601, + "GPQA": 0.2961, + "MUSR": 0.4331, + "MMLU-PRO": 0.3866 + } + }, + { + "model_id": "FINGU-AI/Chocolatine-Fusion-14B", + "name": "Chocolatine-Fusion-14B", + "developer": "FINGU-AI", + "scores": { + "IFEval": 0.6949, + "BBH": 0.6413, + "MATH Level 5": 0.3852, + "GPQA": 0.3716, + "MUSR": 0.494, + "MMLU-PRO": 0.5262 + } + }, + { + "model_id": "FINGU-AI/L3-8B", + "name": "L3-8B", + "developer": "FINGU-AI", + "scores": { + "IFEval": 0.7517, + "BBH": 0.4986, + "MATH Level 5": 0.2545, + "GPQA": 0.2953, + "MUSR": 0.3828, + "MMLU-PRO": 0.3639 + } + }, + { + "model_id": "FINGU-AI/Phi-4-RRStock", + "name": "Phi-4-RRStock", + "developer": "FINGU-AI", + "scores": { + "IFEval": 0.2855, + "BBH": 0.6443, + "MATH Level 5": 0.0582, + "GPQA": 0.38, + "MUSR": 0.4479, + "MMLU-PRO": 0.4883 + } + }, + { + "model_id": "FINGU-AI/Q-Small-3B", + "name": "Q-Small-3B", + "developer": "FINGU-AI", + "scores": { + "IFEval": 0.4145, + "BBH": 0.4319, + "MATH Level 5": 0.0831, + "GPQA": 0.2668, + "MUSR": 0.4005, + "MMLU-PRO": 0.279 + } + }, + { + "model_id": "FINGU-AI/QwQ-Buddy-32B-Alpha", + "name": "QwQ-Buddy-32B-Alpha", + "developer": "FINGU-AI", + "scores": { + "IFEval": 0.3446, + "BBH": 0.6424, + "MATH Level 5": 0.3852, + "GPQA": 0.3792, + "MUSR": 0.506, + "MMLU-PRO": 0.5294 + } + }, + { + "model_id": "FINGU-AI/RomboUltima-32B", + "name": "RomboUltima-32B", + "developer": "FINGU-AI", + "scores": { + "IFEval": 0.6672, + "BBH": 0.6938, + "MATH Level 5": 0.5385, + "GPQA": 0.3716, + "MUSR": 0.4836, + "MMLU-PRO": 0.5789 + } + }, + { + "model_id": "FINGU-AI/Ultimos-32B", + "name": "Ultimos-32B", + "developer": "FINGU-AI", + "scores": { + "IFEval": 0.1592, + "BBH": 0.2906, + "MATH Level 5": 0.0, + "GPQA": 0.2492, + "MUSR": 0.3286, + "MMLU-PRO": 0.1111 + } + }, + { + "model_id": "FallenMerick/Chewy-Lemon-Cookie-11B", + "name": "Chewy-Lemon-Cookie-11B", + "developer": "FallenMerick", + "scores": { + "IFEval": 0.4875, + "BBH": 0.5251, + "MATH Level 5": 0.0544, + "GPQA": 0.2794, + "MUSR": 0.4546, + "MMLU-PRO": 0.3267 + } + }, + { + "model_id": "Felladrin/Llama-160M-Chat-v1", + "name": "Llama-160M-Chat-v1", + "developer": "Felladrin", + "scores": { + "IFEval": 0.1575, + "BBH": 0.3036, + "MATH Level 5": 0.006, + "GPQA": 0.2576, + "MUSR": 0.3661, + "MMLU-PRO": 0.1136 + } + }, + { + "model_id": "Felladrin/Minueza-32M-UltraChat", + "name": "Minueza-32M-UltraChat", + "developer": "Felladrin", + "scores": { + "IFEval": 0.1376, + "BBH": 0.2941, + "MATH Level 5": 0.0045, + "GPQA": 0.2559, + "MUSR": 0.3742, + "MMLU-PRO": 0.1133 + } + }, + { + "model_id": "FlofloB/100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "name": "100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "developer": "FlofloB", + "scores": { + "IFEval": 0.3083, + "BBH": 0.3323, + "MATH Level 5": 0.0408, + "GPQA": 0.2693, + "MUSR": 0.3302, + "MMLU-PRO": 0.1498 + } + }, + { + "model_id": "FlofloB/10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", + "name": "10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", + "developer": "FlofloB", + "scores": { + "IFEval": 0.5097, + "BBH": 0.5215, + "MATH Level 5": 0.0974, + "GPQA": 0.2995, + "MUSR": 0.431, + "MMLU-PRO": 0.3769 + } + }, + { + "model_id": "FlofloB/10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "name": "10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "developer": "FlofloB", + "scores": { + "IFEval": 0.2815, + "BBH": 0.3306, + "MATH Level 5": 0.031, + "GPQA": 0.2794, + "MUSR": 0.3302, + "MMLU-PRO": 0.1541 + } + }, + { + "model_id": "FlofloB/40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "name": "40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "developer": "FlofloB", + "scores": { + "IFEval": 0.3016, + "BBH": 0.3325, + "MATH Level 5": 0.0332, + "GPQA": 0.2676, + "MUSR": 0.3408, + "MMLU-PRO": 0.1485 + } + }, + { + "model_id": "FlofloB/83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "name": "83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "developer": "FlofloB", + "scores": { + "IFEval": 0.2869, + "BBH": 0.3347, + "MATH Level 5": 0.0302, + "GPQA": 0.2735, + "MUSR": 0.3289, + "MMLU-PRO": 0.1555 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_1000k_fineweb", + "name": "smollm2-135M_pretrained_1000k_fineweb", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1485, + "BBH": 0.2918, + "MATH Level 5": 0.0091, + "GPQA": 0.2626, + "MUSR": 0.3581, + "MMLU-PRO": 0.1164 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed", + "name": "smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1554, + "BBH": 0.3066, + "MATH Level 5": 0.006, + "GPQA": 0.2508, + "MUSR": 0.358, + "MMLU-PRO": 0.1143 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_selected", + "name": "smollm2-135M_pretrained_1000k_fineweb_uncovai_selected", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1468, + "BBH": 0.2932, + "MATH Level 5": 0.0068, + "GPQA": 0.2659, + "MUSR": 0.4048, + "MMLU-PRO": 0.1157 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_1200k_fineweb", + "name": "smollm2-135M_pretrained_1200k_fineweb", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1581, + "BBH": 0.2941, + "MATH Level 5": 0.0068, + "GPQA": 0.2643, + "MUSR": 0.3714, + "MMLU-PRO": 0.1076 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed", + "name": "smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1578, + "BBH": 0.295, + "MATH Level 5": 0.0008, + "GPQA": 0.2651, + "MUSR": 0.37, + "MMLU-PRO": 0.1139 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_selected", + "name": "smollm2-135M_pretrained_1200k_fineweb_uncovai_selected", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1585, + "BBH": 0.296, + "MATH Level 5": 0.0076, + "GPQA": 0.2634, + "MUSR": 0.3567, + "MMLU-PRO": 0.1164 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_1400k_fineweb", + "name": "smollm2-135M_pretrained_1400k_fineweb", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1764, + "BBH": 0.2922, + "MATH Level 5": 0.0113, + "GPQA": 0.2659, + "MUSR": 0.3873, + "MMLU-PRO": 0.108 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed", + "name": "smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1707, + "BBH": 0.2992, + "MATH Level 5": 0.0106, + "GPQA": 0.2609, + "MUSR": 0.3939, + "MMLU-PRO": 0.1105 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_selected", + "name": "smollm2-135M_pretrained_1400k_fineweb_uncovai_selected", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1538, + "BBH": 0.2917, + "MATH Level 5": 0.0106, + "GPQA": 0.2685, + "MUSR": 0.3741, + "MMLU-PRO": 0.1137 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed", + "name": "smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1475, + "BBH": 0.3029, + "MATH Level 5": 0.0038, + "GPQA": 0.2584, + "MUSR": 0.3578, + "MMLU-PRO": 0.112 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_selected", + "name": "smollm2-135M_pretrained_200k_fineweb_uncovai_selected", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1345, + "BBH": 0.2927, + "MATH Level 5": 0.0076, + "GPQA": 0.2508, + "MUSR": 0.366, + "MMLU-PRO": 0.1131 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_400k_fineweb", + "name": "smollm2-135M_pretrained_400k_fineweb", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1511, + "BBH": 0.2972, + "MATH Level 5": 0.0121, + "GPQA": 0.2525, + "MUSR": 0.3794, + "MMLU-PRO": 0.1163 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed", + "name": "smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1556, + "BBH": 0.3049, + "MATH Level 5": 0.0091, + "GPQA": 0.255, + "MUSR": 0.386, + "MMLU-PRO": 0.1138 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_selected", + "name": "smollm2-135M_pretrained_400k_fineweb_uncovai_selected", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1584, + "BBH": 0.2925, + "MATH Level 5": 0.0068, + "GPQA": 0.2542, + "MUSR": 0.382, + "MMLU-PRO": 0.1158 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_600k_fineweb", + "name": "smollm2-135M_pretrained_600k_fineweb", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1639, + "BBH": 0.3014, + "MATH Level 5": 0.006, + "GPQA": 0.2659, + "MUSR": 0.3809, + "MMLU-PRO": 0.1126 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed", + "name": "smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1641, + "BBH": 0.3, + "MATH Level 5": 0.0091, + "GPQA": 0.2626, + "MUSR": 0.3793, + "MMLU-PRO": 0.1147 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_selected", + "name": "smollm2-135M_pretrained_600k_fineweb_uncovai_selected", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1606, + "BBH": 0.2983, + "MATH Level 5": 0.0076, + "GPQA": 0.2609, + "MUSR": 0.3846, + "MMLU-PRO": 0.1162 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_800k_fineweb", + "name": "smollm2-135M_pretrained_800k_fineweb", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1641, + "BBH": 0.2959, + "MATH Level 5": 0.0083, + "GPQA": 0.2492, + "MUSR": 0.3701, + "MMLU-PRO": 0.1152 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed", + "name": "smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1623, + "BBH": 0.3038, + "MATH Level 5": 0.0068, + "GPQA": 0.2525, + "MUSR": 0.3993, + "MMLU-PRO": 0.1138 + } + }, + { + "model_id": "FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_selected", + "name": "smollm2-135M_pretrained_800k_fineweb_uncovai_selected", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1474, + "BBH": 0.2943, + "MATH Level 5": 0.0045, + "GPQA": 0.2617, + "MUSR": 0.3766, + "MMLU-PRO": 0.113 + } + }, + { + "model_id": "FlofloB/smollm2_pretrained_200k_fineweb", + "name": "smollm2_pretrained_200k_fineweb", + "developer": "FlofloB", + "scores": { + "IFEval": 0.1527, + "BBH": 0.2995, + "MATH Level 5": 0.0038, + "GPQA": 0.2475, + "MUSR": 0.3699, + "MMLU-PRO": 0.1159 + } + }, + { + "model_id": "FlofloB/test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", + "name": "test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", + "developer": "FlofloB", + "scores": { + "IFEval": 0.5215, + "BBH": 0.5241, + "MATH Level 5": 0.1103, + "GPQA": 0.3112, + "MUSR": 0.4244, + "MMLU-PRO": 0.3721 + } + }, + { + "model_id": "FuJhen/ft-openhermes-25-mistral-7b-irca-dpo-pairs", + "name": "ft-openhermes-25-mistral-7b-irca-dpo-pairs", + "developer": "FuJhen", + "scores": { + "IFEval": 0.542, + "BBH": 0.4773, + "MATH Level 5": 0.0483, + "GPQA": 0.2785, + "MUSR": 0.4174, + "MMLU-PRO": 0.2956 + } + }, + { + "model_id": "FuJhen/mistral-instruct-7B-DPO", + "name": "mistral-instruct-7B-DPO", + "developer": "FuJhen", + "scores": { + "IFEval": 0.4968, + "BBH": 0.4624, + "MATH Level 5": 0.0385, + "GPQA": 0.2777, + "MUSR": 0.4016, + "MMLU-PRO": 0.3034 + } + }, + { + "model_id": "FuJhen/mistral_7b_v0.1_structedData_e2e", + "name": "mistral_7b_v0.1_structedData_e2e", + "developer": "FuJhen", + "scores": { + "IFEval": 0.1727, + "BBH": 0.4114, + "MATH Level 5": 0.0045, + "GPQA": 0.2794, + "MUSR": 0.3723, + "MMLU-PRO": 0.2811 + } + }, + { + "model_id": "FuJhen/mistral_7b_v0.1_structedData_viggo", + "name": "mistral_7b_v0.1_structedData_viggo", + "developer": "FuJhen", + "scores": { + "IFEval": 0.1783, + "BBH": 0.4524, + "MATH Level 5": 0.0287, + "GPQA": 0.2836, + "MUSR": 0.3738, + "MMLU-PRO": 0.2942 + } + }, + { + "model_id": "FuseAI/FuseChat-7B-v2.0", + "name": "FuseChat-7B-v2.0", + "developer": "FuseAI", + "scores": { + "IFEval": 0.3423, + "BBH": 0.4954, + "MATH Level 5": 0.0612, + "GPQA": 0.302, + "MUSR": 0.4797, + "MMLU-PRO": 0.3162 + } + }, + { + "model_id": "FuseAI/FuseChat-Llama-3.1-8B-Instruct", + "name": "FuseChat-Llama-3.1-8B-Instruct", + "developer": "FuseAI", + "scores": { + "IFEval": 0.7205, + "BBH": 0.512, + "MATH Level 5": 0.2477, + "GPQA": 0.3054, + "MUSR": 0.382, + "MMLU-PRO": 0.3733 + } + }, + { + "model_id": "FuseAI/FuseChat-Llama-3.2-3B-Instruct", + "name": "FuseChat-Llama-3.2-3B-Instruct", + "developer": "FuseAI", + "scores": { + "IFEval": 0.6849, + "BBH": 0.4658, + "MATH Level 5": 0.2424, + "GPQA": 0.2961, + "MUSR": 0.3914, + "MMLU-PRO": 0.3132 + } + }, + { + "model_id": "FuseAI/FuseChat-Qwen-2.5-7B-Instruct", + "name": "FuseChat-Qwen-2.5-7B-Instruct", + "developer": "FuseAI", + "scores": { + "IFEval": 0.5906, + "BBH": 0.5526, + "MATH Level 5": 0.4562, + "GPQA": 0.2961, + "MUSR": 0.3874, + "MMLU-PRO": 0.4118 + } + }, + { + "model_id": "GalrionSoftworks/MN-LooseCannon-12B-v1", + "name": "MN-LooseCannon-12B-v1", + "developer": "GalrionSoftworks", + "scores": { + "IFEval": 0.5418, + "BBH": 0.5128, + "MATH Level 5": 0.0853, + "GPQA": 0.2852, + "MUSR": 0.4138, + "MMLU-PRO": 0.3196 + } + }, + { + "model_id": "GalrionSoftworks/MagnusIntellectus-12B-v1", + "name": "MagnusIntellectus-12B-v1", + "developer": "GalrionSoftworks", + "scores": { + "IFEval": 0.4421, + "BBH": 0.5323, + "MATH Level 5": 0.065, + "GPQA": 0.2844, + "MUSR": 0.4428, + "MMLU-PRO": 0.3421 + } + }, + { + "model_id": "GenVRadmin/AryaBhatta-GemmaOrca-2-Merged", + "name": "AryaBhatta-GemmaOrca-2-Merged", + "developer": "GenVRadmin", + "scores": { + "IFEval": 0.3064, + "BBH": 0.3887, + "MATH Level 5": 0.0498, + "GPQA": 0.2685, + "MUSR": 0.455, + "MMLU-PRO": 0.2384 + } + }, + { + "model_id": "GenVRadmin/AryaBhatta-GemmaOrca-Merged", + "name": "AryaBhatta-GemmaOrca-Merged", + "developer": "GenVRadmin", + "scores": { + "IFEval": 0.3064, + "BBH": 0.4131, + "MATH Level 5": 0.0514, + "GPQA": 0.2559, + "MUSR": 0.3524, + "MMLU-PRO": 0.2228 + } + }, + { + "model_id": "GenVRadmin/AryaBhatta-GemmaUltra-Merged", + "name": "AryaBhatta-GemmaUltra-Merged", + "developer": "GenVRadmin", + "scores": { + "IFEval": 0.3021, + "BBH": 0.4141, + "MATH Level 5": 0.0536, + "GPQA": 0.2534, + "MUSR": 0.4279, + "MMLU-PRO": 0.2266 + } + }, + { + "model_id": "GenVRadmin/llama38bGenZ_Vikas-Merged", + "name": "llama38bGenZ_Vikas-Merged", + "developer": "GenVRadmin", + "scores": { + "IFEval": 0.3, + "BBH": 0.4536, + "MATH Level 5": 0.0574, + "GPQA": 0.2953, + "MUSR": 0.4402, + "MMLU-PRO": 0.2622 + } + }, + { + "model_id": "GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct", + "name": "gemma2-9b-cpt-sahabatai-v1-instruct", + "developer": "GoToCompany", + "scores": { + "IFEval": 0.6551, + "BBH": 0.5955, + "MATH Level 5": 0.2054, + "GPQA": 0.3347, + "MUSR": 0.4779, + "MMLU-PRO": 0.4264 + } + }, + { + "model_id": "GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct", + "name": "llama3-8b-cpt-sahabatai-v1-instruct", + "developer": "GoToCompany", + "scores": { + "IFEval": 0.5238, + "BBH": 0.4951, + "MATH Level 5": 0.1276, + "GPQA": 0.2668, + "MUSR": 0.4488, + "MMLU-PRO": 0.3453 + } + }, + { + "model_id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1", + "name": "Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1", + "developer": "Goekdeniz-Guelmez", + "scores": { + "IFEval": 0.3417, + "BBH": 0.3292, + "MATH Level 5": 0.0023, + "GPQA": 0.2576, + "MUSR": 0.3249, + "MMLU-PRO": 0.1638 + } + }, + { + "model_id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1", + "name": "Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1", + "developer": "Goekdeniz-Guelmez", + "scores": { + "IFEval": 0.4769, + "BBH": 0.4186, + "MATH Level 5": 0.2085, + "GPQA": 0.2433, + "MUSR": 0.3675, + "MMLU-PRO": 0.2783 + } + }, + { + "model_id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2", + "name": "Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2", + "developer": "Goekdeniz-Guelmez", + "scores": { + "IFEval": 0.4216, + "BBH": 0.4042, + "MATH Level 5": 0.1269, + "GPQA": 0.2399, + "MUSR": 0.3769, + "MMLU-PRO": 0.2562 + } + }, + { + "model_id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3", + "name": "Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3", + "developer": "Goekdeniz-Guelmez", + "scores": { + "IFEval": 0.4253, + "BBH": 0.4053, + "MATH Level 5": 0.1307, + "GPQA": 0.2433, + "MUSR": 0.3702, + "MMLU-PRO": 0.2556 + } + }, + { + "model_id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-14B-Instruct-abliterated-v4", + "name": "Josiefied-Qwen2.5-14B-Instruct-abliterated-v4", + "developer": "Goekdeniz-Guelmez", + "scores": { + "IFEval": 0.8292, + "BBH": 0.6356, + "MATH Level 5": 0.5423, + "GPQA": 0.3423, + "MUSR": 0.4287, + "MMLU-PRO": 0.5018 + } + }, + { + "model_id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", + "name": "Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", + "developer": "Goekdeniz-Guelmez", + "scores": { + "IFEval": 0.7814, + "BBH": 0.531, + "MATH Level 5": 0.4532, + "GPQA": 0.2987, + "MUSR": 0.4354, + "MMLU-PRO": 0.412 + } + }, + { + "model_id": "Goekdeniz-Guelmez/j.o.s.i.e.v4o-1.5b-dpo-stage1-v1", + "name": "j.o.s.i.e.v4o-1.5b-dpo-stage1-v1", + "developer": "Goekdeniz-Guelmez", + "scores": { + "IFEval": 0.4188, + "BBH": 0.4124, + "MATH Level 5": 0.1201, + "GPQA": 0.2508, + "MUSR": 0.3529, + "MMLU-PRO": 0.2555 + } + }, + { + "model_id": "Goekdeniz-Guelmez/josie-3b-v6.0", + "name": "josie-3b-v6.0", + "developer": "Goekdeniz-Guelmez", + "scores": { + "IFEval": 0.601, + "BBH": 0.4496, + "MATH Level 5": 0.2938, + "GPQA": 0.2903, + "MUSR": 0.3861, + "MMLU-PRO": 0.322 + } + }, + { + "model_id": "Goekdeniz-Guelmez/josie-7b-v6.0", + "name": "josie-7b-v6.0", + "developer": "Goekdeniz-Guelmez", + "scores": { + "IFEval": 0.7412, + "BBH": 0.5105, + "MATH Level 5": 0.4358, + "GPQA": 0.2827, + "MUSR": 0.4154, + "MMLU-PRO": 0.3807 + } + }, + { + "model_id": "Goekdeniz-Guelmez/josie-7b-v6.0-step2000", + "name": "josie-7b-v6.0-step2000", + "developer": "Goekdeniz-Guelmez", + "scores": { + "IFEval": 0.7628, + "BBH": 0.5098, + "MATH Level 5": 0.0, + "GPQA": 0.2802, + "MUSR": 0.4579, + "MMLU-PRO": 0.4033 + } + }, + { + "model_id": "GreenNode/GreenNode-small-9B-it", + "name": "GreenNode-small-9B-it", + "developer": "GreenNode", + "scores": { + "IFEval": 0.7436, + "BBH": 0.5994, + "MATH Level 5": 0.1745, + "GPQA": 0.3196, + "MUSR": 0.4204, + "MMLU-PRO": 0.3927 + } + }, + { + "model_id": "GritLM/GritLM-7B-KTO", + "name": "GritLM-7B-KTO", + "developer": "GritLM", + "scores": { + "IFEval": 0.531, + "BBH": 0.4853, + "MATH Level 5": 0.0272, + "GPQA": 0.2978, + "MUSR": 0.371, + "MMLU-PRO": 0.268 + } + }, + { + "model_id": "GritLM/GritLM-8x7B-KTO", + "name": "GritLM-8x7B-KTO", + "developer": "GritLM", + "scores": { + "IFEval": 0.5714, + "BBH": 0.582, + "MATH Level 5": 0.1224, + "GPQA": 0.2961, + "MUSR": 0.4217, + "MMLU-PRO": 0.3648 + } + }, + { + "model_id": "Groq/Llama-3-Groq-8B-Tool-Use", + "name": "Llama-3-Groq-8B-Tool-Use", + "developer": "Groq", + "scores": { + "IFEval": 0.6098, + "BBH": 0.4863, + "MATH Level 5": 0.0604, + "GPQA": 0.2676, + "MUSR": 0.366, + "MMLU-PRO": 0.3399 + } + }, + { + "model_id": "Gryphe/Pantheon-RP-1.0-8b-Llama-3", + "name": "Pantheon-RP-1.0-8b-Llama-3", + "developer": "Gryphe", + "scores": { + "IFEval": 0.3933, + "BBH": 0.4539, + "MATH Level 5": 0.0634, + "GPQA": 0.276, + "MUSR": 0.3832, + "MMLU-PRO": 0.3067 + } + }, + { + "model_id": "Gryphe/Pantheon-RP-1.5-12b-Nemo", + "name": "Pantheon-RP-1.5-12b-Nemo", + "developer": "Gryphe", + "scores": { + "IFEval": 0.4763, + "BBH": 0.5196, + "MATH Level 5": 0.0491, + "GPQA": 0.2727, + "MUSR": 0.442, + "MMLU-PRO": 0.3302 + } + }, + { + "model_id": "Gryphe/Pantheon-RP-1.6-12b-Nemo", + "name": "Pantheon-RP-1.6-12b-Nemo", + "developer": "Gryphe", + "scores": { + "IFEval": 0.4481, + "BBH": 0.5204, + "MATH Level 5": 0.0461, + "GPQA": 0.2777, + "MUSR": 0.4288, + "MMLU-PRO": 0.3311 + } + }, + { + "model_id": "Gryphe/Pantheon-RP-1.6-12b-Nemo-KTO", + "name": "Pantheon-RP-1.6-12b-Nemo-KTO", + "developer": "Gryphe", + "scores": { + "IFEval": 0.4636, + "BBH": 0.5277, + "MATH Level 5": 0.0529, + "GPQA": 0.2953, + "MUSR": 0.4248, + "MMLU-PRO": 0.3382 + } + }, + { + "model_id": "Gryphe/Pantheon-RP-Pure-1.6.2-22b-Small", + "name": "Pantheon-RP-Pure-1.6.2-22b-Small", + "developer": "Gryphe", + "scores": { + "IFEval": 0.6931, + "BBH": 0.5305, + "MATH Level 5": 0.2024, + "GPQA": 0.3289, + "MUSR": 0.3765, + "MMLU-PRO": 0.3942 + } + }, + { + "model_id": "GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall", + "name": "Nature-Reason-1.2-reallysmall", + "developer": "GuilhermeNaturaUmana", + "scores": { + "IFEval": 0.4985, + "BBH": 0.5645, + "MATH Level 5": 0.2576, + "GPQA": 0.3003, + "MUSR": 0.4373, + "MMLU-PRO": 0.4429 + } + }, + { + "model_id": "Gunulhona/Gemma-Ko-Merge", + "name": "Gemma-Ko-Merge", + "developer": "Gunulhona", + "scores": { + "IFEval": 0.6416, + "BBH": 0.5813, + "MATH Level 5": 0.1881, + "GPQA": 0.3356, + "MUSR": 0.4047, + "MMLU-PRO": 0.3879 + } + }, + { + "model_id": "Gunulhona/Gemma-Ko-Merge-PEFT", + "name": "Gemma-Ko-Merge-PEFT", + "developer": "Gunulhona", + "scores": { + "IFEval": 0.4441, + "BBH": 0.4863, + "MATH Level 5": 0.0, + "GPQA": 0.307, + "MUSR": 0.3986, + "MMLU-PRO": 0.3098 + } + }, + { + "model_id": "HPAI-BSC/Llama3-Aloe-8B-Alpha", + "name": "Llama3-Aloe-8B-Alpha", + "developer": "HPAI-BSC", + "scores": { + "IFEval": 0.5081, + "BBH": 0.4831, + "MATH Level 5": 0.0612, + "GPQA": 0.2945, + "MUSR": 0.3673, + "MMLU-PRO": 0.3295 + } + }, + { + "model_id": "HPAI-BSC/Llama3.1-Aloe-Beta-8B", + "name": "Llama3.1-Aloe-Beta-8B", + "developer": "HPAI-BSC", + "scores": { + "IFEval": 0.7253, + "BBH": 0.5093, + "MATH Level 5": 0.1828, + "GPQA": 0.2685, + "MUSR": 0.3835, + "MMLU-PRO": 0.358 + } + }, + { + "model_id": "HPAI-BSC/Qwen2.5-Aloe-Beta-7B", + "name": "Qwen2.5-Aloe-Beta-7B", + "developer": "HPAI-BSC", + "scores": { + "IFEval": 0.4554, + "BBH": 0.5049, + "MATH Level 5": 0.3542, + "GPQA": 0.2911, + "MUSR": 0.426, + "MMLU-PRO": 0.4354 + } + }, + { + "model_id": "HarbingerX/Zeitgeist-3b-V1", + "name": "Zeitgeist-3b-V1", + "developer": "HarbingerX", + "scores": { + "IFEval": 0.6712, + "BBH": 0.4441, + "MATH Level 5": 0.1035, + "GPQA": 0.2819, + "MUSR": 0.3579, + "MMLU-PRO": 0.3009 + } + }, + { + "model_id": "HarbingerX/Zeitgeist-3b-V1.2", + "name": "Zeitgeist-3b-V1.2", + "developer": "HarbingerX", + "scores": { + "IFEval": 0.6754, + "BBH": 0.4441, + "MATH Level 5": 0.1012, + "GPQA": 0.2777, + "MUSR": 0.3579, + "MMLU-PRO": 0.3056 + } + }, + { + "model_id": "Hastagaras/L3.2-JametMini-3B-MK.III", + "name": "L3.2-JametMini-3B-MK.III", + "developer": "Hastagaras", + "scores": { + "IFEval": 0.6183, + "BBH": 0.4539, + "MATH Level 5": 0.1458, + "GPQA": 0.2827, + "MUSR": 0.3686, + "MMLU-PRO": 0.2983 + } + }, + { + "model_id": "Hastagaras/Llama-3.1-Jamet-8B-MK.I", + "name": "Llama-3.1-Jamet-8B-MK.I", + "developer": "Hastagaras", + "scores": { + "IFEval": 0.7338, + "BBH": 0.5049, + "MATH Level 5": 0.1269, + "GPQA": 0.2743, + "MUSR": 0.3726, + "MMLU-PRO": 0.3482 + } + }, + { + "model_id": "Hastagaras/Zabuza-8B-Llama-3.1", + "name": "Zabuza-8B-Llama-3.1", + "developer": "Hastagaras", + "scores": { + "IFEval": 0.6265, + "BBH": 0.4539, + "MATH Level 5": 0.0551, + "GPQA": 0.2643, + "MUSR": 0.3568, + "MMLU-PRO": 0.2923 + } + }, + { + "model_id": "HelpingAI/Cipher-20B", + "name": "Cipher-20B", + "developer": "HelpingAI", + "scores": { + "IFEval": 0.5378, + "BBH": 0.6032, + "MATH Level 5": 0.1994, + "GPQA": 0.2953, + "MUSR": 0.4003, + "MMLU-PRO": 0.3744 + } + }, + { + "model_id": "HelpingAI/Dhanishtha-Large", + "name": "Dhanishtha-Large", + "developer": "HelpingAI", + "scores": { + "IFEval": 0.2457, + "BBH": 0.4604, + "MATH Level 5": 0.3852, + "GPQA": 0.3029, + "MUSR": 0.3845, + "MMLU-PRO": 0.2755 + } + }, + { + "model_id": "HelpingAI/Priya-10B", + "name": "Priya-10B", + "developer": "HelpingAI", + "scores": { + "IFEval": 0.4043, + "BBH": 0.4441, + "MATH Level 5": 0.0189, + "GPQA": 0.2559, + "MUSR": 0.3793, + "MMLU-PRO": 0.2493 + } + }, + { + "model_id": "HelpingAI/Priya-3B", + "name": "Priya-3B", + "developer": "HelpingAI", + "scores": { + "IFEval": 0.4526, + "BBH": 0.3961, + "MATH Level 5": 0.0144, + "GPQA": 0.2567, + "MUSR": 0.3713, + "MMLU-PRO": 0.2339 + } + }, + { + "model_id": "HeraiHench/DeepSeek-R1-Qwen-Coder-8B", + "name": "DeepSeek-R1-Qwen-Coder-8B", + "developer": "HeraiHench", + "scores": { + "IFEval": 0.1869, + "BBH": 0.2913, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.3738, + "MMLU-PRO": 0.1123 + } + }, + { + "model_id": "HeraiHench/Double-Down-Qwen-Math-7B", + "name": "Double-Down-Qwen-Math-7B", + "developer": "HeraiHench", + "scores": { + "IFEval": 0.167, + "BBH": 0.2845, + "MATH Level 5": 0.0008, + "GPQA": 0.2651, + "MUSR": 0.3737, + "MMLU-PRO": 0.1112 + } + }, + { + "model_id": "HeraiHench/Marge-Qwen-Math-7B", + "name": "Marge-Qwen-Math-7B", + "developer": "HeraiHench", + "scores": { + "IFEval": 0.1262, + "BBH": 0.3069, + "MATH Level 5": 0.0053, + "GPQA": 0.2391, + "MUSR": 0.3939, + "MMLU-PRO": 0.1056 + } + }, + { + "model_id": "HeraiHench/Phi-4-slerp-ReasoningRP-14B", + "name": "Phi-4-slerp-ReasoningRP-14B", + "developer": "HeraiHench", + "scores": { + "IFEval": 0.1575, + "BBH": 0.4196, + "MATH Level 5": 0.0, + "GPQA": 0.2936, + "MUSR": 0.3116, + "MMLU-PRO": 0.19 + } + }, + { + "model_id": "HiroseKoichi/Llama-Salad-4x8B-V3", + "name": "Llama-Salad-4x8B-V3", + "developer": "HiroseKoichi", + "scores": { + "IFEval": 0.6654, + "BBH": 0.5245, + "MATH Level 5": 0.0959, + "GPQA": 0.3029, + "MUSR": 0.374, + "MMLU-PRO": 0.3518 + } + }, + { + "model_id": "HoangHa/Pensez-Llama3.1-8B", + "name": "Pensez-Llama3.1-8B", + "developer": "HoangHa", + "scores": { + "IFEval": 0.3887, + "BBH": 0.4669, + "MATH Level 5": 0.1148, + "GPQA": 0.2886, + "MUSR": 0.3597, + "MMLU-PRO": 0.3126 + } + }, + { + "model_id": "HuggingFaceH4/zephyr-7b-alpha", + "name": "zephyr-7b-alpha", + "developer": "HuggingFaceH4", + "scores": { + "IFEval": 0.5191, + "BBH": 0.4583, + "MATH Level 5": 0.0196, + "GPQA": 0.2978, + "MUSR": 0.395, + "MMLU-PRO": 0.2795 + } + }, + { + "model_id": "HuggingFaceH4/zephyr-7b-beta", + "name": "zephyr-7b-beta", + "developer": "HuggingFaceH4", + "scores": { + "IFEval": 0.495, + "BBH": 0.4316, + "MATH Level 5": 0.0287, + "GPQA": 0.2903, + "MUSR": 0.3925, + "MMLU-PRO": 0.2781 + } + }, + { + "model_id": "HuggingFaceH4/zephyr-7b-gemma-v0.1", + "name": "zephyr-7b-gemma-v0.1", + "developer": "HuggingFaceH4", + "scores": { + "IFEval": 0.3364, + "BBH": 0.4624, + "MATH Level 5": 0.0816, + "GPQA": 0.2945, + "MUSR": 0.374, + "MMLU-PRO": 0.2847 + } + }, + { + "model_id": "HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1", + "name": "zephyr-orpo-141b-A35b-v0.1", + "developer": "HuggingFaceH4", + "scores": { + "IFEval": 0.6511, + "BBH": 0.629, + "MATH Level 5": 0.2047, + "GPQA": 0.3784, + "MUSR": 0.4465, + "MMLU-PRO": 0.4586 + } + }, + { + "model_id": "HuggingFaceTB/SmolLM-1.7B", + "name": "SmolLM-1.7B", + "developer": "HuggingFaceTB", + "scores": { + "IFEval": 0.2362, + "BBH": 0.3181, + "MATH Level 5": 0.0166, + "GPQA": 0.2416, + "MUSR": 0.3421, + "MMLU-PRO": 0.1148 + } + }, + { + "model_id": "HuggingFaceTB/SmolLM-1.7B-Instruct", + "name": "SmolLM-1.7B-Instruct", + "developer": "HuggingFaceTB", + "scores": { + "IFEval": 0.2348, + "BBH": 0.2885, + "MATH Level 5": 0.0211, + "GPQA": 0.2601, + "MUSR": 0.3487, + "MMLU-PRO": 0.1166 + } + }, + { + "model_id": "HuggingFaceTB/SmolLM-135M", + "name": "SmolLM-135M", + "developer": "HuggingFaceTB", + "scores": { + "IFEval": 0.2125, + "BBH": 0.3046, + "MATH Level 5": 0.0136, + "GPQA": 0.2584, + "MUSR": 0.4366, + "MMLU-PRO": 0.1122 + } + }, + { + "model_id": "HuggingFaceTB/SmolLM-135M-Instruct", + "name": "SmolLM-135M-Instruct", + "developer": "HuggingFaceTB", + "scores": { + "IFEval": 0.1214, + "BBH": 0.3015, + "MATH Level 5": 0.0053, + "GPQA": 0.2592, + "MUSR": 0.3635, + "MMLU-PRO": 0.1176 + } + }, + { + "model_id": "HuggingFaceTB/SmolLM-360M", + "name": "SmolLM-360M", + "developer": "HuggingFaceTB", + "scores": { + "IFEval": 0.2134, + "BBH": 0.3065, + "MATH Level 5": 0.0113, + "GPQA": 0.2676, + "MUSR": 0.4018, + "MMLU-PRO": 0.1124 + } + }, + { + "model_id": "HuggingFaceTB/SmolLM-360M-Instruct", + "name": "SmolLM-360M-Instruct", + "developer": "HuggingFaceTB", + "scores": { + "IFEval": 0.1952, + "BBH": 0.2885, + "MATH Level 5": 0.0181, + "GPQA": 0.2643, + "MUSR": 0.3472, + "MMLU-PRO": 0.1166 + } + }, + { + "model_id": "HuggingFaceTB/SmolLM2-1.7B", + "name": "SmolLM2-1.7B", + "developer": "HuggingFaceTB", + "scores": { + "IFEval": 0.244, + "BBH": 0.3453, + "MATH Level 5": 0.0264, + "GPQA": 0.2794, + "MUSR": 0.3485, + "MMLU-PRO": 0.2138 + } + }, + { + "model_id": "HuggingFaceTB/SmolLM2-1.7B-Instruct", + "name": "SmolLM2-1.7B-Instruct", + "developer": "HuggingFaceTB", + "scores": { + "IFEval": 0.5368, + "BBH": 0.3599, + "MATH Level 5": 0.0582, + "GPQA": 0.2794, + "MUSR": 0.3421, + "MMLU-PRO": 0.2054 + } + }, + { + "model_id": "HuggingFaceTB/SmolLM2-135M", + "name": "SmolLM2-135M", + "developer": "HuggingFaceTB", + "scores": { + "IFEval": 0.1818, + "BBH": 0.3044, + "MATH Level 5": 0.0121, + "GPQA": 0.2483, + "MUSR": 0.4112, + "MMLU-PRO": 0.1095 + } + }, + { + "model_id": "HuggingFaceTB/SmolLM2-135M-Instruct", + "name": "SmolLM2-135M-Instruct", + "developer": "HuggingFaceTB", + "scores": { + "IFEval": 0.2883, + "BBH": 0.3124, + "MATH Level 5": 0.003, + "GPQA": 0.2357, + "MUSR": 0.3662, + "MMLU-PRO": 0.1115 + } + }, + { + "model_id": "HuggingFaceTB/SmolLM2-360M", + "name": "SmolLM2-360M", + "developer": "HuggingFaceTB", + "scores": { + "IFEval": 0.2115, + "BBH": 0.3233, + "MATH Level 5": 0.0121, + "GPQA": 0.2458, + "MUSR": 0.3954, + "MMLU-PRO": 0.1169 + } + }, + { + "model_id": "HuggingFaceTB/SmolLM2-360M-Instruct", + "name": "SmolLM2-360M-Instruct", + "developer": "HuggingFaceTB", + "scores": { + "IFEval": 0.3842, + "BBH": 0.3144, + "MATH Level 5": 0.0151, + "GPQA": 0.255, + "MUSR": 0.3461, + "MMLU-PRO": 0.1117 + } + }, + { + "model_id": "HumanLLMs/Humanish-LLama3-8B-Instruct", + "name": "Humanish-LLama3-8B-Instruct", + "developer": "HumanLLMs", + "scores": { + "IFEval": 0.6498, + "BBH": 0.4968, + "MATH Level 5": 0.1027, + "GPQA": 0.2559, + "MUSR": 0.3582, + "MMLU-PRO": 0.3702 + } + }, + { + "model_id": "HumanLLMs/Humanish-Mistral-Nemo-Instruct-2407", + "name": "Humanish-Mistral-Nemo-Instruct-2407", + "developer": "HumanLLMs", + "scores": { + "IFEval": 0.5451, + "BBH": 0.5262, + "MATH Level 5": 0.1367, + "GPQA": 0.2878, + "MUSR": 0.3968, + "MMLU-PRO": 0.3521 + } + }, + { + "model_id": "HumanLLMs/Humanish-Qwen2.5-7B-Instruct", + "name": "Humanish-Qwen2.5-7B-Instruct", + "developer": "HumanLLMs", + "scores": { + "IFEval": 0.7284, + "BBH": 0.5364, + "MATH Level 5": 0.5, + "GPQA": 0.2987, + "MUSR": 0.3981, + "MMLU-PRO": 0.4398 + } + }, + { + "model_id": "IDEA-CCNL/Ziya-LLaMA-13B-v1", + "name": "Ziya-LLaMA-13B-v1", + "developer": "IDEA-CCNL", + "scores": { + "IFEval": 0.1697, + "BBH": 0.2877, + "MATH Level 5": 0.0, + "GPQA": 0.2492, + "MUSR": 0.3751, + "MMLU-PRO": 0.1101 + } + }, + { + "model_id": "INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0", + "name": "BgGPT-Gemma-2-27B-IT-v1.0", + "developer": "INSAIT-Institute", + "scores": { + "IFEval": 0.0, + "BBH": 0.2912, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.3575, + "MMLU-PRO": 0.1167 + } + }, + { + "model_id": "IlyaGusev/gemma-2-2b-it-abliterated", + "name": "gemma-2-2b-it-abliterated", + "developer": "IlyaGusev", + "scores": { + "IFEval": 0.5331, + "BBH": 0.4119, + "MATH Level 5": 0.0612, + "GPQA": 0.2651, + "MUSR": 0.3782, + "MMLU-PRO": 0.2538 + } + }, + { + "model_id": "IlyaGusev/gemma-2-9b-it-abliterated", + "name": "gemma-2-9b-it-abliterated", + "developer": "IlyaGusev", + "scores": { + "IFEval": 0.7473, + "BBH": 0.5906, + "MATH Level 5": 0.1775, + "GPQA": 0.3456, + "MUSR": 0.4034, + "MMLU-PRO": 0.3915 + } + }, + { + "model_id": "Infinirc/Infinirc-Llama3-8B-2G-Release-v1.0", + "name": "Infinirc-Llama3-8B-2G-Release-v1.0", + "developer": "Infinirc", + "scores": { + "IFEval": 0.2024, + "BBH": 0.4351, + "MATH Level 5": 0.0166, + "GPQA": 0.2995, + "MUSR": 0.4609, + "MMLU-PRO": 0.216 + } + }, + { + "model_id": "Intel/neural-chat-7b-v3", + "name": "neural-chat-7b-v3", + "developer": "Intel", + "scores": { + "IFEval": 0.2778, + "BBH": 0.5048, + "MATH Level 5": 0.0295, + "GPQA": 0.2919, + "MUSR": 0.5055, + "MMLU-PRO": 0.2699 + } + }, + { + "model_id": "Intel/neural-chat-7b-v3-1", + "name": "neural-chat-7b-v3-1", + "developer": "Intel", + "scores": { + "IFEval": 0.4687, + "BBH": 0.5052, + "MATH Level 5": 0.0355, + "GPQA": 0.2903, + "MUSR": 0.4979, + "MMLU-PRO": 0.2678 + } + }, + { + "model_id": "Intel/neural-chat-7b-v3-2", + "name": "neural-chat-7b-v3-2", + "developer": "Intel", + "scores": { + "IFEval": 0.4988, + "BBH": 0.5032, + "MATH Level 5": 0.0476, + "GPQA": 0.2903, + "MUSR": 0.4895, + "MMLU-PRO": 0.2667 + } + }, + { + "model_id": "Intel/neural-chat-7b-v3-3", + "name": "neural-chat-7b-v3-3", + "developer": "Intel", + "scores": { + "IFEval": 0.4763, + "BBH": 0.4877, + "MATH Level 5": 0.0408, + "GPQA": 0.2894, + "MUSR": 0.486, + "MMLU-PRO": 0.2625 + } + }, + { + "model_id": "IntervitensInc/internlm2_5-20b-llamafied", + "name": "internlm2_5-20b-llamafied", + "developer": "IntervitensInc", + "scores": { + "IFEval": 0.341, + "BBH": 0.7478, + "MATH Level 5": 0.1715, + "GPQA": 0.3381, + "MUSR": 0.4475, + "MMLU-PRO": 0.4051 + } + }, + { + "model_id": "Invalid-Null/PeiYangMe-0.5", + "name": "PeiYangMe-0.5", + "developer": "Invalid-Null", + "scores": { + "IFEval": 0.1409, + "BBH": 0.2791, + "MATH Level 5": 0.0, + "GPQA": 0.2441, + "MUSR": 0.3738, + "MMLU-PRO": 0.1109 + } + }, + { + "model_id": "Invalid-Null/PeiYangMe-0.7", + "name": "PeiYangMe-0.7", + "developer": "Invalid-Null", + "scores": { + "IFEval": 0.1491, + "BBH": 0.3028, + "MATH Level 5": 0.0113, + "GPQA": 0.2332, + "MUSR": 0.3857, + "MMLU-PRO": 0.1101 + } + }, + { + "model_id": "Isaak-Carter/JOSIEv4o-8b-stage1-v4", + "name": "JOSIEv4o-8b-stage1-v4", + "developer": "Isaak-Carter", + "scores": { + "IFEval": 0.2477, + "BBH": 0.4758, + "MATH Level 5": 0.0453, + "GPQA": 0.2911, + "MUSR": 0.3641, + "MMLU-PRO": 0.3292 + } + }, + { + "model_id": "Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated", + "name": "Josiefied-Qwen2.5-7B-Instruct-abliterated", + "developer": "Isaak-Carter", + "scores": { + "IFEval": 0.7317, + "BBH": 0.5396, + "MATH Level 5": 0.4924, + "GPQA": 0.3029, + "MUSR": 0.4087, + "MMLU-PRO": 0.4276 + } + }, + { + "model_id": "Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", + "name": "Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", + "developer": "Isaak-Carter", + "scores": { + "IFEval": 0.7841, + "BBH": 0.5311, + "MATH Level 5": 0.4721, + "GPQA": 0.2987, + "MUSR": 0.4354, + "MMLU-PRO": 0.4128 + } + }, + { + "model_id": "J-LAB/Thynk_orpo", + "name": "Thynk_orpo", + "developer": "J-LAB", + "scores": { + "IFEval": 0.2102, + "BBH": 0.4463, + "MATH Level 5": 0.148, + "GPQA": 0.2928, + "MUSR": 0.4515, + "MMLU-PRO": 0.3231 + } + }, + { + "model_id": "JackFram/llama-160m", + "name": "llama-160m", + "developer": "JackFram", + "scores": { + "IFEval": 0.1791, + "BBH": 0.2888, + "MATH Level 5": 0.0083, + "GPQA": 0.2617, + "MUSR": 0.3792, + "MMLU-PRO": 0.1128 + } + }, + { + "model_id": "JackFram/llama-68m", + "name": "llama-68m", + "developer": "JackFram", + "scores": { + "IFEval": 0.1726, + "BBH": 0.2936, + "MATH Level 5": 0.006, + "GPQA": 0.2584, + "MUSR": 0.391, + "MMLU-PRO": 0.1144 + } + }, + { + "model_id": "Jacoby746/Casual-Magnum-34B", + "name": "Casual-Magnum-34B", + "developer": "Jacoby746", + "scores": { + "IFEval": 0.193, + "BBH": 0.6032, + "MATH Level 5": 0.0921, + "GPQA": 0.3725, + "MUSR": 0.4078, + "MMLU-PRO": 0.5184 + } + }, + { + "model_id": "Jacoby746/Inf-Silent-Kunoichi-v0.1-2x7B", + "name": "Inf-Silent-Kunoichi-v0.1-2x7B", + "developer": "Jacoby746", + "scores": { + "IFEval": 0.388, + "BBH": 0.5185, + "MATH Level 5": 0.071, + "GPQA": 0.2894, + "MUSR": 0.428, + "MMLU-PRO": 0.3271 + } + }, + { + "model_id": "Jacoby746/Inf-Silent-Kunoichi-v0.2-2x7B", + "name": "Inf-Silent-Kunoichi-v0.2-2x7B", + "developer": "Jacoby746", + "scores": { + "IFEval": 0.3636, + "BBH": 0.5209, + "MATH Level 5": 0.0627, + "GPQA": 0.3003, + "MUSR": 0.432, + "MMLU-PRO": 0.3272 + } + }, + { + "model_id": "Jacoby746/Proto-Athena-4x7B", + "name": "Proto-Athena-4x7B", + "developer": "Jacoby746", + "scores": { + "IFEval": 0.3703, + "BBH": 0.5107, + "MATH Level 5": 0.065, + "GPQA": 0.2945, + "MUSR": 0.4348, + "MMLU-PRO": 0.3206 + } + }, + { + "model_id": "Jacoby746/Proto-Athena-v0.2-4x7B", + "name": "Proto-Athena-v0.2-4x7B", + "developer": "Jacoby746", + "scores": { + "IFEval": 0.3752, + "BBH": 0.5068, + "MATH Level 5": 0.0634, + "GPQA": 0.2987, + "MUSR": 0.4213, + "MMLU-PRO": 0.3197 + } + }, + { + "model_id": "Jacoby746/Proto-Harpy-Blazing-Light-v0.1-2x7B", + "name": "Proto-Harpy-Blazing-Light-v0.1-2x7B", + "developer": "Jacoby746", + "scores": { + "IFEval": 0.4905, + "BBH": 0.5187, + "MATH Level 5": 0.0748, + "GPQA": 0.2953, + "MUSR": 0.445, + "MMLU-PRO": 0.3301 + } + }, + { + "model_id": "Jacoby746/Proto-Harpy-Spark-v0.1-7B", + "name": "Proto-Harpy-Spark-v0.1-7B", + "developer": "Jacoby746", + "scores": { + "IFEval": 0.4333, + "BBH": 0.4736, + "MATH Level 5": 0.0619, + "GPQA": 0.3054, + "MUSR": 0.4317, + "MMLU-PRO": 0.3069 + } + }, + { + "model_id": "JayHyeon/Qwen-0.5B-DPO-1epoch", + "name": "Qwen-0.5B-DPO-1epoch", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2647, + "BBH": 0.3191, + "MATH Level 5": 0.0287, + "GPQA": 0.2525, + "MUSR": 0.3352, + "MMLU-PRO": 0.1558 + } + }, + { + "model_id": "JayHyeon/Qwen-0.5B-DPO-5epoch", + "name": "Qwen-0.5B-DPO-5epoch", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.257, + "BBH": 0.3112, + "MATH Level 5": 0.04, + "GPQA": 0.2433, + "MUSR": 0.338, + "MMLU-PRO": 0.1533 + } + }, + { + "model_id": "JayHyeon/Qwen-0.5B-IRPO-1epoch", + "name": "Qwen-0.5B-IRPO-1epoch", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2589, + "BBH": 0.3164, + "MATH Level 5": 0.0317, + "GPQA": 0.2466, + "MUSR": 0.3286, + "MMLU-PRO": 0.15 + } + }, + { + "model_id": "JayHyeon/Qwen-0.5B-IRPO-5epoch", + "name": "Qwen-0.5B-IRPO-5epoch", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2487, + "BBH": 0.3189, + "MATH Level 5": 0.0325, + "GPQA": 0.2399, + "MUSR": 0.3287, + "MMLU-PRO": 0.1507 + } + }, + { + "model_id": "JayHyeon/Qwen-0.5B-eDPO-1epoch", + "name": "Qwen-0.5B-eDPO-1epoch", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2623, + "BBH": 0.3181, + "MATH Level 5": 0.0347, + "GPQA": 0.2424, + "MUSR": 0.3327, + "MMLU-PRO": 0.1553 + } + }, + { + "model_id": "JayHyeon/Qwen-0.5B-eDPO-5epoch", + "name": "Qwen-0.5B-eDPO-5epoch", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2477, + "BBH": 0.3096, + "MATH Level 5": 0.0234, + "GPQA": 0.2492, + "MUSR": 0.3326, + "MMLU-PRO": 0.1523 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT", + "name": "Qwen2.5-0.5B-Instruct-SFT", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2768, + "BBH": 0.3254, + "MATH Level 5": 0.0393, + "GPQA": 0.2827, + "MUSR": 0.3342, + "MMLU-PRO": 0.152 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1", + "name": "Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2469, + "BBH": 0.326, + "MATH Level 5": 0.065, + "GPQA": 0.2727, + "MUSR": 0.3434, + "MMLU-PRO": 0.1575 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1", + "name": "Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2606, + "BBH": 0.3308, + "MATH Level 5": 0.0498, + "GPQA": 0.2802, + "MUSR": 0.3288, + "MMLU-PRO": 0.1626 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1", + "name": "Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2529, + "BBH": 0.3262, + "MATH Level 5": 0.0566, + "GPQA": 0.2685, + "MUSR": 0.3301, + "MMLU-PRO": 0.1576 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT", + "name": "Qwen2.5-0.5B-SFT", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.1964, + "BBH": 0.3121, + "MATH Level 5": 0.0272, + "GPQA": 0.2785, + "MUSR": 0.3394, + "MMLU-PRO": 0.1673 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4", + "name": "Qwen2.5-0.5B-SFT-1e-4", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.202, + "BBH": 0.3017, + "MATH Level 5": 0.0189, + "GPQA": 0.2508, + "MUSR": 0.3446, + "MMLU-PRO": 0.1619 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4-2ep", + "name": "Qwen2.5-0.5B-SFT-1e-4-2ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.214, + "BBH": 0.3172, + "MATH Level 5": 0.0264, + "GPQA": 0.2466, + "MUSR": 0.3473, + "MMLU-PRO": 0.1537 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4-3ep", + "name": "Qwen2.5-0.5B-SFT-1e-4-3ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2257, + "BBH": 0.3064, + "MATH Level 5": 0.0264, + "GPQA": 0.2483, + "MUSR": 0.3661, + "MMLU-PRO": 0.1532 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4-5ep", + "name": "Qwen2.5-0.5B-SFT-1e-4-5ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.1987, + "BBH": 0.3104, + "MATH Level 5": 0.0196, + "GPQA": 0.2534, + "MUSR": 0.3407, + "MMLU-PRO": 0.1558 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5", + "name": "Qwen2.5-0.5B-SFT-1e-5", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.1986, + "BBH": 0.314, + "MATH Level 5": 0.0378, + "GPQA": 0.2685, + "MUSR": 0.346, + "MMLU-PRO": 0.1698 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5-2ep", + "name": "Qwen2.5-0.5B-SFT-1e-5-2ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.1971, + "BBH": 0.3225, + "MATH Level 5": 0.0529, + "GPQA": 0.2693, + "MUSR": 0.3368, + "MMLU-PRO": 0.1651 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5-3ep", + "name": "Qwen2.5-0.5B-SFT-1e-5-3ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2241, + "BBH": 0.3247, + "MATH Level 5": 0.0536, + "GPQA": 0.2701, + "MUSR": 0.3353, + "MMLU-PRO": 0.1689 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5-5ep", + "name": "Qwen2.5-0.5B-SFT-1e-5-5ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2292, + "BBH": 0.3259, + "MATH Level 5": 0.0521, + "GPQA": 0.2794, + "MUSR": 0.3235, + "MMLU-PRO": 0.1688 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4", + "name": "Qwen2.5-0.5B-SFT-2e-4", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2034, + "BBH": 0.2936, + "MATH Level 5": 0.0242, + "GPQA": 0.2576, + "MUSR": 0.3434, + "MMLU-PRO": 0.1413 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4-2ep", + "name": "Qwen2.5-0.5B-SFT-2e-4-2ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.1831, + "BBH": 0.2984, + "MATH Level 5": 0.0249, + "GPQA": 0.2424, + "MUSR": 0.3568, + "MMLU-PRO": 0.1484 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4-3ep", + "name": "Qwen2.5-0.5B-SFT-2e-4-3ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.199, + "BBH": 0.311, + "MATH Level 5": 0.0151, + "GPQA": 0.2609, + "MUSR": 0.3449, + "MMLU-PRO": 0.1416 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4-5ep", + "name": "Qwen2.5-0.5B-SFT-2e-4-5ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.1897, + "BBH": 0.2936, + "MATH Level 5": 0.0181, + "GPQA": 0.2693, + "MUSR": 0.3874, + "MMLU-PRO": 0.1336 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5", + "name": "Qwen2.5-0.5B-SFT-2e-5", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2068, + "BBH": 0.3204, + "MATH Level 5": 0.037, + "GPQA": 0.2693, + "MUSR": 0.3487, + "MMLU-PRO": 0.1678 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2201, + "BBH": 0.3217, + "MATH Level 5": 0.0408, + "GPQA": 0.2777, + "MUSR": 0.3367, + "MMLU-PRO": 0.171 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2411, + "BBH": 0.3167, + "MATH Level 5": 0.0347, + "GPQA": 0.271, + "MUSR": 0.3301, + "MMLU-PRO": 0.1562 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2369, + "BBH": 0.326, + "MATH Level 5": 0.0453, + "GPQA": 0.276, + "MUSR": 0.3355, + "MMLU-PRO": 0.157 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2262, + "BBH": 0.3262, + "MATH Level 5": 0.0347, + "GPQA": 0.2794, + "MUSR": 0.3408, + "MMLU-PRO": 0.1541 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2508, + "BBH": 0.3199, + "MATH Level 5": 0.0408, + "GPQA": 0.276, + "MUSR": 0.3355, + "MMLU-PRO": 0.1555 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.239, + "BBH": 0.3182, + "MATH Level 5": 0.04, + "GPQA": 0.2676, + "MUSR": 0.3328, + "MMLU-PRO": 0.156 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2423, + "BBH": 0.3154, + "MATH Level 5": 0.0347, + "GPQA": 0.2676, + "MUSR": 0.3328, + "MMLU-PRO": 0.1548 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2493, + "BBH": 0.319, + "MATH Level 5": 0.0438, + "GPQA": 0.2651, + "MUSR": 0.3341, + "MMLU-PRO": 0.1561 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2542, + "BBH": 0.3167, + "MATH Level 5": 0.0408, + "GPQA": 0.2718, + "MUSR": 0.3289, + "MMLU-PRO": 0.158 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2451, + "BBH": 0.316, + "MATH Level 5": 0.0408, + "GPQA": 0.2743, + "MUSR": 0.3302, + "MMLU-PRO": 0.1561 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2557, + "BBH": 0.3142, + "MATH Level 5": 0.04, + "GPQA": 0.2743, + "MUSR": 0.3315, + "MMLU-PRO": 0.1575 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2605, + "BBH": 0.3167, + "MATH Level 5": 0.0363, + "GPQA": 0.2701, + "MUSR": 0.3341, + "MMLU-PRO": 0.1577 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2578, + "BBH": 0.3173, + "MATH Level 5": 0.0355, + "GPQA": 0.2634, + "MUSR": 0.3288, + "MMLU-PRO": 0.1583 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2335, + "BBH": 0.3198, + "MATH Level 5": 0.0385, + "GPQA": 0.2752, + "MUSR": 0.3276, + "MMLU-PRO": 0.1581 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2472, + "BBH": 0.3226, + "MATH Level 5": 0.0506, + "GPQA": 0.276, + "MUSR": 0.3262, + "MMLU-PRO": 0.1538 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2474, + "BBH": 0.3229, + "MATH Level 5": 0.0415, + "GPQA": 0.2727, + "MUSR": 0.3275, + "MMLU-PRO": 0.1539 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2403, + "BBH": 0.3245, + "MATH Level 5": 0.0431, + "GPQA": 0.2819, + "MUSR": 0.3262, + "MMLU-PRO": 0.1573 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2368, + "BBH": 0.3224, + "MATH Level 5": 0.0461, + "GPQA": 0.2743, + "MUSR": 0.3355, + "MMLU-PRO": 0.1516 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2372, + "BBH": 0.3248, + "MATH Level 5": 0.0476, + "GPQA": 0.2701, + "MUSR": 0.3394, + "MMLU-PRO": 0.155 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2499, + "BBH": 0.3181, + "MATH Level 5": 0.0415, + "GPQA": 0.2651, + "MUSR": 0.3288, + "MMLU-PRO": 0.1574 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2381, + "BBH": 0.3242, + "MATH Level 5": 0.0498, + "GPQA": 0.2743, + "MUSR": 0.3328, + "MMLU-PRO": 0.1572 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2421, + "BBH": 0.3225, + "MATH Level 5": 0.04, + "GPQA": 0.2802, + "MUSR": 0.3408, + "MMLU-PRO": 0.1496 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2381, + "BBH": 0.3265, + "MATH Level 5": 0.0446, + "GPQA": 0.276, + "MUSR": 0.3408, + "MMLU-PRO": 0.1499 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2526, + "BBH": 0.3177, + "MATH Level 5": 0.0438, + "GPQA": 0.2735, + "MUSR": 0.3342, + "MMLU-PRO": 0.1572 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2457, + "BBH": 0.316, + "MATH Level 5": 0.0446, + "GPQA": 0.2727, + "MUSR": 0.3302, + "MMLU-PRO": 0.1572 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2442, + "BBH": 0.3194, + "MATH Level 5": 0.0483, + "GPQA": 0.2735, + "MUSR": 0.3315, + "MMLU-PRO": 0.1567 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2604, + "BBH": 0.3178, + "MATH Level 5": 0.0355, + "GPQA": 0.276, + "MUSR": 0.3288, + "MMLU-PRO": 0.1567 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.249, + "BBH": 0.3173, + "MATH Level 5": 0.0393, + "GPQA": 0.271, + "MUSR": 0.3302, + "MMLU-PRO": 0.1569 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2604, + "BBH": 0.315, + "MATH Level 5": 0.0378, + "GPQA": 0.2743, + "MUSR": 0.3342, + "MMLU-PRO": 0.1566 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.255, + "BBH": 0.3211, + "MATH Level 5": 0.0491, + "GPQA": 0.2701, + "MUSR": 0.3288, + "MMLU-PRO": 0.1571 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2478, + "BBH": 0.3198, + "MATH Level 5": 0.0423, + "GPQA": 0.2668, + "MUSR": 0.3315, + "MMLU-PRO": 0.1587 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2475, + "BBH": 0.3225, + "MATH Level 5": 0.04, + "GPQA": 0.271, + "MUSR": 0.3301, + "MMLU-PRO": 0.1556 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.259, + "BBH": 0.3185, + "MATH Level 5": 0.0363, + "GPQA": 0.2727, + "MUSR": 0.3275, + "MMLU-PRO": 0.1586 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2323, + "BBH": 0.3179, + "MATH Level 5": 0.0453, + "GPQA": 0.2827, + "MUSR": 0.3262, + "MMLU-PRO": 0.1548 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2315, + "BBH": 0.326, + "MATH Level 5": 0.0415, + "GPQA": 0.2701, + "MUSR": 0.3383, + "MMLU-PRO": 0.1521 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2298, + "BBH": 0.332, + "MATH Level 5": 0.0431, + "GPQA": 0.2659, + "MUSR": 0.3329, + "MMLU-PRO": 0.1567 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2469, + "BBH": 0.3179, + "MATH Level 5": 0.0415, + "GPQA": 0.2794, + "MUSR": 0.3302, + "MMLU-PRO": 0.1575 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.252, + "BBH": 0.3168, + "MATH Level 5": 0.037, + "GPQA": 0.2752, + "MUSR": 0.3328, + "MMLU-PRO": 0.1576 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2666, + "BBH": 0.3191, + "MATH Level 5": 0.0347, + "GPQA": 0.2718, + "MUSR": 0.3289, + "MMLU-PRO": 0.1567 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2499, + "BBH": 0.3178, + "MATH Level 5": 0.037, + "GPQA": 0.2651, + "MUSR": 0.3341, + "MMLU-PRO": 0.1562 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2417, + "BBH": 0.3178, + "MATH Level 5": 0.04, + "GPQA": 0.2685, + "MUSR": 0.3328, + "MMLU-PRO": 0.1575 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2562, + "BBH": 0.319, + "MATH Level 5": 0.0423, + "GPQA": 0.2659, + "MUSR": 0.3341, + "MMLU-PRO": 0.1576 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2408, + "BBH": 0.3165, + "MATH Level 5": 0.0431, + "GPQA": 0.2735, + "MUSR": 0.3315, + "MMLU-PRO": 0.1557 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2481, + "BBH": 0.3204, + "MATH Level 5": 0.0476, + "GPQA": 0.276, + "MUSR": 0.3302, + "MMLU-PRO": 0.1592 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2545, + "BBH": 0.3186, + "MATH Level 5": 0.0498, + "GPQA": 0.2718, + "MUSR": 0.3289, + "MMLU-PRO": 0.1561 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.252, + "BBH": 0.3204, + "MATH Level 5": 0.0393, + "GPQA": 0.2727, + "MUSR": 0.3262, + "MMLU-PRO": 0.1538 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2315, + "BBH": 0.3213, + "MATH Level 5": 0.0453, + "GPQA": 0.2802, + "MUSR": 0.3222, + "MMLU-PRO": 0.1582 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2515, + "BBH": 0.3187, + "MATH Level 5": 0.0431, + "GPQA": 0.2718, + "MUSR": 0.3289, + "MMLU-PRO": 0.1539 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2472, + "BBH": 0.3213, + "MATH Level 5": 0.0347, + "GPQA": 0.2727, + "MUSR": 0.3262, + "MMLU-PRO": 0.1588 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.246, + "BBH": 0.3234, + "MATH Level 5": 0.0378, + "GPQA": 0.2794, + "MUSR": 0.3302, + "MMLU-PRO": 0.1533 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2524, + "BBH": 0.3256, + "MATH Level 5": 0.0536, + "GPQA": 0.2777, + "MUSR": 0.3368, + "MMLU-PRO": 0.1531 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2265, + "BBH": 0.3252, + "MATH Level 5": 0.0476, + "GPQA": 0.2735, + "MUSR": 0.3262, + "MMLU-PRO": 0.1568 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2302, + "BBH": 0.3224, + "MATH Level 5": 0.0438, + "GPQA": 0.2768, + "MUSR": 0.3408, + "MMLU-PRO": 0.15 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2524, + "BBH": 0.3278, + "MATH Level 5": 0.0408, + "GPQA": 0.2777, + "MUSR": 0.3395, + "MMLU-PRO": 0.1521 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2658, + "BBH": 0.3175, + "MATH Level 5": 0.0363, + "GPQA": 0.2617, + "MUSR": 0.3302, + "MMLU-PRO": 0.1575 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2487, + "BBH": 0.3189, + "MATH Level 5": 0.0378, + "GPQA": 0.2718, + "MUSR": 0.3275, + "MMLU-PRO": 0.1595 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.256, + "BBH": 0.3159, + "MATH Level 5": 0.0378, + "GPQA": 0.2768, + "MUSR": 0.3275, + "MMLU-PRO": 0.1562 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2499, + "BBH": 0.3156, + "MATH Level 5": 0.04, + "GPQA": 0.2701, + "MUSR": 0.3302, + "MMLU-PRO": 0.1556 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2496, + "BBH": 0.3177, + "MATH Level 5": 0.0453, + "GPQA": 0.2626, + "MUSR": 0.3315, + "MMLU-PRO": 0.1567 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2515, + "BBH": 0.3172, + "MATH Level 5": 0.0438, + "GPQA": 0.2701, + "MUSR": 0.3275, + "MMLU-PRO": 0.1553 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-3ep", + "name": "Qwen2.5-0.5B-SFT-2e-5-3ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2281, + "BBH": 0.324, + "MATH Level 5": 0.0453, + "GPQA": 0.2617, + "MUSR": 0.3301, + "MMLU-PRO": 0.1746 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep", + "name": "Qwen2.5-0.5B-SFT-2e-5-5ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2348, + "BBH": 0.3308, + "MATH Level 5": 0.0506, + "GPQA": 0.2643, + "MUSR": 0.3409, + "MMLU-PRO": 0.1695 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2526, + "BBH": 0.3238, + "MATH Level 5": 0.0393, + "GPQA": 0.2676, + "MUSR": 0.3528, + "MMLU-PRO": 0.1574 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep", + "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2481, + "BBH": 0.3175, + "MATH Level 5": 0.0385, + "GPQA": 0.2626, + "MUSR": 0.3475, + "MMLU-PRO": 0.1597 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep", + "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2548, + "BBH": 0.3199, + "MATH Level 5": 0.0385, + "GPQA": 0.2651, + "MUSR": 0.3435, + "MMLU-PRO": 0.1562 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2423, + "BBH": 0.3219, + "MATH Level 5": 0.034, + "GPQA": 0.2701, + "MUSR": 0.3515, + "MMLU-PRO": 0.1563 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep", + "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2493, + "BBH": 0.3191, + "MATH Level 5": 0.0393, + "GPQA": 0.2685, + "MUSR": 0.3475, + "MMLU-PRO": 0.1592 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep", + "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2478, + "BBH": 0.3218, + "MATH Level 5": 0.0415, + "GPQA": 0.2693, + "MUSR": 0.3515, + "MMLU-PRO": 0.1556 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5", + "name": "Qwen2.5-0.5B-SFT-5e-5", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.201, + "BBH": 0.3109, + "MATH Level 5": 0.034, + "GPQA": 0.2676, + "MUSR": 0.3381, + "MMLU-PRO": 0.1672 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5-2ep", + "name": "Qwen2.5-0.5B-SFT-5e-5-2ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2175, + "BBH": 0.318, + "MATH Level 5": 0.0378, + "GPQA": 0.2601, + "MUSR": 0.3368, + "MMLU-PRO": 0.1627 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5-3ep", + "name": "Qwen2.5-0.5B-SFT-5e-5-3ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2199, + "BBH": 0.3297, + "MATH Level 5": 0.0302, + "GPQA": 0.2534, + "MUSR": 0.3593, + "MMLU-PRO": 0.1651 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5-5ep", + "name": "Qwen2.5-0.5B-SFT-5e-5-5ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2077, + "BBH": 0.3276, + "MATH Level 5": 0.0272, + "GPQA": 0.2685, + "MUSR": 0.3766, + "MMLU-PRO": 0.1587 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5", + "name": "Qwen2.5-0.5B-SFT-7e-5", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2093, + "BBH": 0.3158, + "MATH Level 5": 0.0302, + "GPQA": 0.2567, + "MUSR": 0.3367, + "MMLU-PRO": 0.1622 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5-2ep", + "name": "Qwen2.5-0.5B-SFT-7e-5-2ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2156, + "BBH": 0.31, + "MATH Level 5": 0.0393, + "GPQA": 0.2424, + "MUSR": 0.3367, + "MMLU-PRO": 0.1567 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5-3ep", + "name": "Qwen2.5-0.5B-SFT-7e-5-3ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2381, + "BBH": 0.3199, + "MATH Level 5": 0.0332, + "GPQA": 0.2366, + "MUSR": 0.3554, + "MMLU-PRO": 0.1522 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5-5ep", + "name": "Qwen2.5-0.5B-SFT-7e-5-5ep", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.212, + "BBH": 0.32, + "MATH Level 5": 0.0219, + "GPQA": 0.2458, + "MUSR": 0.3713, + "MMLU-PRO": 0.1628 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-DPO-1epoch_v1", + "name": "Qwen2.5-0.5B-SFT-DPO-1epoch_v1", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2025, + "BBH": 0.3268, + "MATH Level 5": 0.0363, + "GPQA": 0.2727, + "MUSR": 0.3209, + "MMLU-PRO": 0.133 + } + }, + { + "model_id": "JayHyeon/Qwen2.5-0.5B-SFT-MDPO-1epoch_v1", + "name": "Qwen2.5-0.5B-SFT-MDPO-1epoch_v1", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.1964, + "BBH": 0.3293, + "MATH Level 5": 0.0468, + "GPQA": 0.276, + "MUSR": 0.3262, + "MMLU-PRO": 0.1337 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2532, + "BBH": 0.314, + "MATH Level 5": 0.0491, + "GPQA": 0.2743, + "MUSR": 0.3315, + "MMLU-PRO": 0.1566 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.267, + "BBH": 0.3189, + "MATH Level 5": 0.0408, + "GPQA": 0.2668, + "MUSR": 0.3288, + "MMLU-PRO": 0.1562 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2481, + "BBH": 0.3261, + "MATH Level 5": 0.0438, + "GPQA": 0.2601, + "MUSR": 0.3368, + "MMLU-PRO": 0.1565 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2383, + "BBH": 0.3218, + "MATH Level 5": 0.0431, + "GPQA": 0.2794, + "MUSR": 0.3342, + "MMLU-PRO": 0.1503 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2471, + "BBH": 0.3224, + "MATH Level 5": 0.04, + "GPQA": 0.2701, + "MUSR": 0.3328, + "MMLU-PRO": 0.1533 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2447, + "BBH": 0.3181, + "MATH Level 5": 0.0438, + "GPQA": 0.2617, + "MUSR": 0.3341, + "MMLU-PRO": 0.1565 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2551, + "BBH": 0.3194, + "MATH Level 5": 0.0446, + "GPQA": 0.2617, + "MUSR": 0.3262, + "MMLU-PRO": 0.1567 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2538, + "BBH": 0.3153, + "MATH Level 5": 0.0415, + "GPQA": 0.2676, + "MUSR": 0.3261, + "MMLU-PRO": 0.1583 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2402, + "BBH": 0.3168, + "MATH Level 5": 0.0378, + "GPQA": 0.2718, + "MUSR": 0.3328, + "MMLU-PRO": 0.1568 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2484, + "BBH": 0.3211, + "MATH Level 5": 0.0438, + "GPQA": 0.2701, + "MUSR": 0.3288, + "MMLU-PRO": 0.1573 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2578, + "BBH": 0.3203, + "MATH Level 5": 0.0423, + "GPQA": 0.271, + "MUSR": 0.3289, + "MMLU-PRO": 0.1583 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPO_1e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-DPO_1e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2316, + "BBH": 0.3258, + "MATH Level 5": 0.0529, + "GPQA": 0.2693, + "MUSR": 0.3221, + "MMLU-PRO": 0.158 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPO_1e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-DPO_1e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.236, + "BBH": 0.3225, + "MATH Level 5": 0.0438, + "GPQA": 0.271, + "MUSR": 0.3222, + "MMLU-PRO": 0.1596 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPO_3e-6-1ep_0alp_0lam", + "name": "Qwen_0.5-DPO_3e-6-1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2337, + "BBH": 0.3132, + "MATH Level 5": 0.0347, + "GPQA": 0.2609, + "MUSR": 0.3235, + "MMLU-PRO": 0.1533 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPO_3e-6-2ep_0alp_0lam", + "name": "Qwen_0.5-DPO_3e-6-2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2569, + "BBH": 0.3276, + "MATH Level 5": 0.0544, + "GPQA": 0.2718, + "MUSR": 0.3156, + "MMLU-PRO": 0.1565 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPO_3e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-DPO_3e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.246, + "BBH": 0.3267, + "MATH Level 5": 0.0431, + "GPQA": 0.2685, + "MUSR": 0.3209, + "MMLU-PRO": 0.1543 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPO_3e-7-1ep_0alp_0lam", + "name": "Qwen_0.5-DPO_3e-7-1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2529, + "BBH": 0.3229, + "MATH Level 5": 0.0551, + "GPQA": 0.2676, + "MUSR": 0.3195, + "MMLU-PRO": 0.1597 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPO_3e-7-2ep_0alp_0lam", + "name": "Qwen_0.5-DPO_3e-7-2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2505, + "BBH": 0.3256, + "MATH Level 5": 0.0476, + "GPQA": 0.2718, + "MUSR": 0.3195, + "MMLU-PRO": 0.1599 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPO_3e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-DPO_3e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2387, + "BBH": 0.3258, + "MATH Level 5": 0.0446, + "GPQA": 0.2743, + "MUSR": 0.3169, + "MMLU-PRO": 0.1589 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPO_5e-7-1ep_0alp_0lam", + "name": "Qwen_0.5-DPO_5e-7-1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2532, + "BBH": 0.3218, + "MATH Level 5": 0.0634, + "GPQA": 0.2685, + "MUSR": 0.3209, + "MMLU-PRO": 0.1593 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPO_5e-7-2ep_0alp_0lam", + "name": "Qwen_0.5-DPO_5e-7-2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2456, + "BBH": 0.3299, + "MATH Level 5": 0.0536, + "GPQA": 0.271, + "MUSR": 0.3181, + "MMLU-PRO": 0.1602 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-DPO_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-DPO_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2423, + "BBH": 0.3271, + "MATH Level 5": 0.0514, + "GPQA": 0.2743, + "MUSR": 0.3181, + "MMLU-PRO": 0.1595 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-IPO_5e-7-1ep_0alp_0lam", + "name": "Qwen_0.5-IPO_5e-7-1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2574, + "BBH": 0.3279, + "MATH Level 5": 0.0559, + "GPQA": 0.2693, + "MUSR": 0.3169, + "MMLU-PRO": 0.1651 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-IPO_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-IPO_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.3072, + "BBH": 0.3264, + "MATH Level 5": 0.0582, + "GPQA": 0.2567, + "MUSR": 0.3156, + "MMLU-PRO": 0.1624 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2551, + "BBH": 0.3242, + "MATH Level 5": 0.0468, + "GPQA": 0.2668, + "MUSR": 0.3182, + "MMLU-PRO": 0.1574 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2636, + "BBH": 0.3198, + "MATH Level 5": 0.0514, + "GPQA": 0.276, + "MUSR": 0.3262, + "MMLU-PRO": 0.1586 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2323, + "BBH": 0.3255, + "MATH Level 5": 0.037, + "GPQA": 0.2508, + "MUSR": 0.3169, + "MMLU-PRO": 0.1612 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2414, + "BBH": 0.3314, + "MATH Level 5": 0.0347, + "GPQA": 0.2517, + "MUSR": 0.3342, + "MMLU-PRO": 0.1532 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2678, + "BBH": 0.3362, + "MATH Level 5": 0.0514, + "GPQA": 0.2542, + "MUSR": 0.3382, + "MMLU-PRO": 0.1561 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2561, + "BBH": 0.3231, + "MATH Level 5": 0.0536, + "GPQA": 0.2718, + "MUSR": 0.3196, + "MMLU-PRO": 0.1589 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2639, + "BBH": 0.3257, + "MATH Level 5": 0.0476, + "GPQA": 0.2701, + "MUSR": 0.3209, + "MMLU-PRO": 0.1587 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2518, + "BBH": 0.3214, + "MATH Level 5": 0.0574, + "GPQA": 0.2735, + "MUSR": 0.3169, + "MMLU-PRO": 0.1585 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2438, + "BBH": 0.3266, + "MATH Level 5": 0.0619, + "GPQA": 0.2727, + "MUSR": 0.3196, + "MMLU-PRO": 0.1554 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2465, + "BBH": 0.3246, + "MATH Level 5": 0.0529, + "GPQA": 0.2718, + "MUSR": 0.3182, + "MMLU-PRO": 0.1563 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2506, + "BBH": 0.3261, + "MATH Level 5": 0.0498, + "GPQA": 0.2819, + "MUSR": 0.3382, + "MMLU-PRO": 0.1522 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2457, + "BBH": 0.318, + "MATH Level 5": 0.0347, + "GPQA": 0.2634, + "MUSR": 0.3315, + "MMLU-PRO": 0.1566 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2454, + "BBH": 0.3216, + "MATH Level 5": 0.0506, + "GPQA": 0.2802, + "MUSR": 0.3382, + "MMLU-PRO": 0.1544 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2342, + "BBH": 0.3189, + "MATH Level 5": 0.04, + "GPQA": 0.2701, + "MUSR": 0.3302, + "MMLU-PRO": 0.158 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.232, + "BBH": 0.3234, + "MATH Level 5": 0.0393, + "GPQA": 0.2743, + "MUSR": 0.3369, + "MMLU-PRO": 0.1543 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2418, + "BBH": 0.3175, + "MATH Level 5": 0.0423, + "GPQA": 0.2626, + "MUSR": 0.3288, + "MMLU-PRO": 0.158 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2493, + "BBH": 0.3197, + "MATH Level 5": 0.0423, + "GPQA": 0.2701, + "MUSR": 0.3315, + "MMLU-PRO": 0.1571 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.252, + "BBH": 0.3198, + "MATH Level 5": 0.0423, + "GPQA": 0.2634, + "MUSR": 0.3262, + "MMLU-PRO": 0.1551 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.258, + "BBH": 0.3248, + "MATH Level 5": 0.0476, + "GPQA": 0.2752, + "MUSR": 0.3422, + "MMLU-PRO": 0.1539 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.232, + "BBH": 0.3265, + "MATH Level 5": 0.0385, + "GPQA": 0.271, + "MUSR": 0.3395, + "MMLU-PRO": 0.1537 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2488, + "BBH": 0.3273, + "MATH Level 5": 0.0461, + "GPQA": 0.2718, + "MUSR": 0.3342, + "MMLU-PRO": 0.1531 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2524, + "BBH": 0.313, + "MATH Level 5": 0.0446, + "GPQA": 0.271, + "MUSR": 0.3289, + "MMLU-PRO": 0.1564 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2514, + "BBH": 0.3221, + "MATH Level 5": 0.0438, + "GPQA": 0.2752, + "MUSR": 0.3315, + "MMLU-PRO": 0.1538 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2457, + "BBH": 0.318, + "MATH Level 5": 0.0385, + "GPQA": 0.2668, + "MUSR": 0.3275, + "MMLU-PRO": 0.1572 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2636, + "BBH": 0.3181, + "MATH Level 5": 0.0476, + "GPQA": 0.2659, + "MUSR": 0.3235, + "MMLU-PRO": 0.1574 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VDPO_3e-6-1ep_3vpo_const", + "name": "Qwen_0.5-VDPO_3e-6-1ep_3vpo_const", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2483, + "BBH": 0.3174, + "MATH Level 5": 0.0378, + "GPQA": 0.2542, + "MUSR": 0.3328, + "MMLU-PRO": 0.1558 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam", + "name": "Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2518, + "BBH": 0.3218, + "MATH Level 5": 0.0529, + "GPQA": 0.2718, + "MUSR": 0.3235, + "MMLU-PRO": 0.1595 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_10vpo_const", + "name": "Qwen_0.5-VDPO_5e-7-1ep_10vpo_const", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2536, + "BBH": 0.3234, + "MATH Level 5": 0.0491, + "GPQA": 0.276, + "MUSR": 0.3236, + "MMLU-PRO": 0.1597 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_1vpo_const", + "name": "Qwen_0.5-VDPO_5e-7-1ep_1vpo_const", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2448, + "BBH": 0.324, + "MATH Level 5": 0.0604, + "GPQA": 0.2752, + "MUSR": 0.3249, + "MMLU-PRO": 0.1587 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_3vpo_const", + "name": "Qwen_0.5-VDPO_5e-7-1ep_3vpo_const", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2505, + "BBH": 0.3227, + "MATH Level 5": 0.0468, + "GPQA": 0.271, + "MUSR": 0.3209, + "MMLU-PRO": 0.1589 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2472, + "BBH": 0.3255, + "MATH Level 5": 0.0498, + "GPQA": 0.2752, + "MUSR": 0.3208, + "MMLU-PRO": 0.1587 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_1vpo_const", + "name": "Qwen_0.5-VDPO_5e-7-3ep_1vpo_const", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2417, + "BBH": 0.3256, + "MATH Level 5": 0.0582, + "GPQA": 0.2727, + "MUSR": 0.3275, + "MMLU-PRO": 0.1562 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_3vpo_const", + "name": "Qwen_0.5-VDPO_5e-7-3ep_3vpo_const", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2527, + "BBH": 0.3235, + "MATH Level 5": 0.0536, + "GPQA": 0.2785, + "MUSR": 0.3235, + "MMLU-PRO": 0.158 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam", + "name": "Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2669, + "BBH": 0.3314, + "MATH Level 5": 0.071, + "GPQA": 0.2676, + "MUSR": 0.3168, + "MMLU-PRO": 0.1634 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_10vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-1ep_10vpo_const", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2702, + "BBH": 0.33, + "MATH Level 5": 0.074, + "GPQA": 0.2752, + "MUSR": 0.3208, + "MMLU-PRO": 0.1635 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_1vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-1ep_1vpo_const", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.248, + "BBH": 0.3309, + "MATH Level 5": 0.068, + "GPQA": 0.2643, + "MUSR": 0.3208, + "MMLU-PRO": 0.1649 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_30vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-1ep_30vpo_const", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2622, + "BBH": 0.3282, + "MATH Level 5": 0.074, + "GPQA": 0.2693, + "MUSR": 0.3221, + "MMLU-PRO": 0.1634 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_3vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-1ep_3vpo_const", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2609, + "BBH": 0.3298, + "MATH Level 5": 0.065, + "GPQA": 0.2701, + "MUSR": 0.3168, + "MMLU-PRO": 0.1651 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.293, + "BBH": 0.322, + "MATH Level 5": 0.0627, + "GPQA": 0.2685, + "MUSR": 0.3116, + "MMLU-PRO": 0.1591 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_10vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-3ep_10vpo_const", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2881, + "BBH": 0.3255, + "MATH Level 5": 0.0725, + "GPQA": 0.2752, + "MUSR": 0.3102, + "MMLU-PRO": 0.1582 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_1vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-3ep_1vpo_const", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2887, + "BBH": 0.3237, + "MATH Level 5": 0.0748, + "GPQA": 0.2802, + "MUSR": 0.3142, + "MMLU-PRO": 0.1609 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_30vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-3ep_30vpo_const", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2905, + "BBH": 0.3254, + "MATH Level 5": 0.077, + "GPQA": 0.2735, + "MUSR": 0.3129, + "MMLU-PRO": 0.1574 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_3vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-3ep_3vpo_const", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2905, + "BBH": 0.3238, + "MATH Level 5": 0.0702, + "GPQA": 0.2735, + "MUSR": 0.3089, + "MMLU-PRO": 0.1592 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1", + "name": "Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2393, + "BBH": 0.3244, + "MATH Level 5": 0.0514, + "GPQA": 0.2777, + "MUSR": 0.3222, + "MMLU-PRO": 0.1573 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3", + "name": "Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2475, + "BBH": 0.3209, + "MATH Level 5": 0.0461, + "GPQA": 0.281, + "MUSR": 0.3275, + "MMLU-PRO": 0.1567 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1", + "name": "Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2321, + "BBH": 0.3278, + "MATH Level 5": 0.0476, + "GPQA": 0.2576, + "MUSR": 0.3022, + "MMLU-PRO": 0.1496 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1", + "name": "Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2542, + "BBH": 0.3253, + "MATH Level 5": 0.0529, + "GPQA": 0.271, + "MUSR": 0.3181, + "MMLU-PRO": 0.1609 + } + }, + { + "model_id": "JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3", + "name": "Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3", + "developer": "JayHyeon", + "scores": { + "IFEval": 0.2739, + "BBH": 0.3245, + "MATH Level 5": 0.0461, + "GPQA": 0.2508, + "MUSR": 0.3089, + "MMLU-PRO": 0.1597 + } + }, + { + "model_id": "Jimmy19991222/Llama-3-Instruct-8B-SimPO-v0.2", + "name": "Llama-3-Instruct-8B-SimPO-v0.2", + "developer": "Jimmy19991222", + "scores": { + "IFEval": 0.654, + "BBH": 0.4984, + "MATH Level 5": 0.0619, + "GPQA": 0.3146, + "MUSR": 0.4013, + "MMLU-PRO": 0.3686 + } + }, + { + "model_id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun", + "name": "llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun", + "developer": "Jimmy19991222", + "scores": { + "IFEval": 0.6717, + "BBH": 0.488, + "MATH Level 5": 0.0604, + "GPQA": 0.2945, + "MUSR": 0.4041, + "MMLU-PRO": 0.3634 + } + }, + { + "model_id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log", + "name": "llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log", + "developer": "Jimmy19991222", + "scores": { + "IFEval": 0.6556, + "BBH": 0.4935, + "MATH Level 5": 0.0544, + "GPQA": 0.3045, + "MUSR": 0.4, + "MMLU-PRO": 0.3658 + } + }, + { + "model_id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log", + "name": "llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log", + "developer": "Jimmy19991222", + "scores": { + "IFEval": 0.6315, + "BBH": 0.4916, + "MATH Level 5": 0.065, + "GPQA": 0.2861, + "MUSR": 0.3935, + "MMLU-PRO": 0.3611 + } + }, + { + "model_id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4", + "name": "llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4", + "developer": "Jimmy19991222", + "scores": { + "IFEval": 0.6285, + "BBH": 0.4986, + "MATH Level 5": 0.0514, + "GPQA": 0.2928, + "MUSR": 0.4014, + "MMLU-PRO": 0.3545 + } + }, + { + "model_id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun", + "name": "llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun", + "developer": "Jimmy19991222", + "scores": { + "IFEval": 0.6678, + "BBH": 0.494, + "MATH Level 5": 0.0612, + "GPQA": 0.3062, + "MUSR": 0.3987, + "MMLU-PRO": 0.3658 + } + }, + { + "model_id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log", + "name": "llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log", + "developer": "Jimmy19991222", + "scores": { + "IFEval": 0.6605, + "BBH": 0.4916, + "MATH Level 5": 0.0657, + "GPQA": 0.3037, + "MUSR": 0.4, + "MMLU-PRO": 0.3664 + } + }, + { + "model_id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log", + "name": "llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log", + "developer": "Jimmy19991222", + "scores": { + "IFEval": 0.6492, + "BBH": 0.4952, + "MATH Level 5": 0.0642, + "GPQA": 0.302, + "MUSR": 0.3961, + "MMLU-PRO": 0.3711 + } + }, + { + "model_id": "Joseph717171/Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32", + "name": "Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32", + "developer": "Joseph717171", + "scores": { + "IFEval": 0.6185, + "BBH": 0.5177, + "MATH Level 5": 0.0514, + "GPQA": 0.2827, + "MUSR": 0.4369, + "MMLU-PRO": 0.3144 + } + }, + { + "model_id": "Joseph717171/Llama-3.1-SuperNova-8B-Lite_TIES_with_Base", + "name": "Llama-3.1-SuperNova-8B-Lite_TIES_with_Base", + "developer": "Joseph717171", + "scores": { + "IFEval": 0.8096, + "BBH": 0.5147, + "MATH Level 5": 0.1835, + "GPQA": 0.3096, + "MUSR": 0.411, + "MMLU-PRO": 0.388 + } + }, + { + "model_id": "Josephgflowers/Cinder-Phi-2-V1-F16-gguf", + "name": "Cinder-Phi-2-V1-F16-gguf", + "developer": "Josephgflowers", + "scores": { + "IFEval": 0.2357, + "BBH": 0.4397, + "MATH Level 5": 0.0242, + "GPQA": 0.2819, + "MUSR": 0.3435, + "MMLU-PRO": 0.2161 + } + }, + { + "model_id": "Josephgflowers/Differential-Attention-Liquid-Metal-Tinyllama", + "name": "Differential-Attention-Liquid-Metal-Tinyllama", + "developer": "Josephgflowers", + "scores": { + "IFEval": 0.2227, + "BBH": 0.2926, + "MATH Level 5": 0.0325, + "GPQA": 0.2508, + "MUSR": 0.3356, + "MMLU-PRO": 0.1214 + } + }, + { + "model_id": "Josephgflowers/TinyLlama-Cinder-Agent-v1", + "name": "TinyLlama-Cinder-Agent-v1", + "developer": "Josephgflowers", + "scores": { + "IFEval": 0.267, + "BBH": 0.3116, + "MATH Level 5": 0.0347, + "GPQA": 0.2441, + "MUSR": 0.3395, + "MMLU-PRO": 0.1161 + } + }, + { + "model_id": "Josephgflowers/TinyLlama-v1.1-Cinders-World", + "name": "TinyLlama-v1.1-Cinders-World", + "developer": "Josephgflowers", + "scores": { + "IFEval": 0.2469, + "BBH": 0.2998, + "MATH Level 5": 0.0347, + "GPQA": 0.2441, + "MUSR": 0.3356, + "MMLU-PRO": 0.1198 + } + }, + { + "model_id": "Josephgflowers/TinyLlama_v1.1_math_code-world-test-1", + "name": "TinyLlama_v1.1_math_code-world-test-1", + "developer": "Josephgflowers", + "scores": { + "IFEval": 0.0078, + "BBH": 0.3146, + "MATH Level 5": 0.0196, + "GPQA": 0.2341, + "MUSR": 0.3499, + "MMLU-PRO": 0.1132 + } + }, + { + "model_id": "Josephgflowers/Tinyllama-STEM-Cinder-Agent-v1", + "name": "Tinyllama-STEM-Cinder-Agent-v1", + "developer": "Josephgflowers", + "scores": { + "IFEval": 0.2126, + "BBH": 0.3084, + "MATH Level 5": 0.0672, + "GPQA": 0.2349, + "MUSR": 0.3341, + "MMLU-PRO": 0.1086 + } + }, + { + "model_id": "Josephgflowers/Tinyllama-r1", + "name": "Tinyllama-r1", + "developer": "Josephgflowers", + "scores": { + "IFEval": 0.2119, + "BBH": 0.3015, + "MATH Level 5": 0.0325, + "GPQA": 0.2567, + "MUSR": 0.3315, + "MMLU-PRO": 0.1134 + } + }, + { + "model_id": "JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3", + "name": "T3Q-Qwen2.5-14B-Instruct-1M-e3", + "developer": "JungZoona", + "scores": { + "IFEval": 0.7324, + "BBH": 0.7586, + "MATH Level 5": 0.2863, + "GPQA": 0.4169, + "MUSR": 0.5911, + "MMLU-PRO": 0.5884 + } + }, + { + "model_id": "JungZoona/T3Q-qwen2.5-14b-v1.0-e3", + "name": "T3Q-qwen2.5-14b-v1.0-e3", + "developer": "JungZoona", + "scores": { + "IFEval": 0.7324, + "BBH": 0.7586, + "MATH Level 5": 0.2863, + "GPQA": 0.4169, + "MUSR": 0.5911, + "MMLU-PRO": 0.5884 + } + }, + { + "model_id": "Junhoee/Qwen-Megumin", + "name": "Qwen-Megumin", + "developer": "Junhoee", + "scores": { + "IFEval": 0.7141, + "BBH": 0.5285, + "MATH Level 5": 0.4902, + "GPQA": 0.2961, + "MUSR": 0.398, + "MMLU-PRO": 0.4199 + } + }, + { + "model_id": "KSU-HW-SEC/Llama3-70b-SVA-FT-1415", + "name": "Llama3-70b-SVA-FT-1415", + "developer": "KSU-HW-SEC", + "scores": { + "IFEval": 0.618, + "BBH": 0.665, + "MATH Level 5": 0.2198, + "GPQA": 0.375, + "MUSR": 0.4565, + "MMLU-PRO": 0.5243 + } + }, + { + "model_id": "KSU-HW-SEC/Llama3-70b-SVA-FT-500", + "name": "Llama3-70b-SVA-FT-500", + "developer": "KSU-HW-SEC", + "scores": { + "IFEval": 0.6105, + "BBH": 0.6692, + "MATH Level 5": 0.2137, + "GPQA": 0.3809, + "MUSR": 0.4511, + "MMLU-PRO": 0.5227 + } + }, + { + "model_id": "KSU-HW-SEC/Llama3-70b-SVA-FT-final", + "name": "Llama3-70b-SVA-FT-final", + "developer": "KSU-HW-SEC", + "scores": { + "IFEval": 0.6165, + "BBH": 0.665, + "MATH Level 5": 0.2198, + "GPQA": 0.375, + "MUSR": 0.4565, + "MMLU-PRO": 0.5243 + } + }, + { + "model_id": "KSU-HW-SEC/Llama3.1-70b-SVA-FT-1000step", + "name": "Llama3.1-70b-SVA-FT-1000step", + "developer": "KSU-HW-SEC", + "scores": { + "IFEval": 0.7238, + "BBH": 0.6903, + "MATH Level 5": 0.321, + "GPQA": 0.396, + "MUSR": 0.4592, + "MMLU-PRO": 0.5252 + } + }, + { + "model_id": "Khetterman/DarkAtom-12B-v3", + "name": "DarkAtom-12B-v3", + "developer": "Khetterman", + "scores": { + "IFEval": 0.6173, + "BBH": 0.5154, + "MATH Level 5": 0.111, + "GPQA": 0.2978, + "MUSR": 0.4468, + "MMLU-PRO": 0.3546 + } + }, + { + "model_id": "Khetterman/Kosmos-8B-v1", + "name": "Kosmos-8B-v1", + "developer": "Khetterman", + "scores": { + "IFEval": 0.4129, + "BBH": 0.5234, + "MATH Level 5": 0.0989, + "GPQA": 0.2987, + "MUSR": 0.3919, + "MMLU-PRO": 0.3669 + } + }, + { + "model_id": "Kimargin/GPT-NEO-1.3B-wiki", + "name": "GPT-NEO-1.3B-wiki", + "developer": "Kimargin", + "scores": { + "IFEval": 0.1921, + "BBH": 0.3026, + "MATH Level 5": 0.0144, + "GPQA": 0.245, + "MUSR": 0.3883, + "MMLU-PRO": 0.1099 + } + }, + { + "model_id": "KingNish/Qwen2.5-0.5b-Test-ft", + "name": "Qwen2.5-0.5b-Test-ft", + "developer": "KingNish", + "scores": { + "IFEval": 0.2671, + "BBH": 0.3232, + "MATH Level 5": 0.0355, + "GPQA": 0.2634, + "MUSR": 0.3421, + "MMLU-PRO": 0.1689 + } + }, + { + "model_id": "KingNish/Reasoning-0.5b", + "name": "Reasoning-0.5b", + "developer": "KingNish", + "scores": { + "IFEval": 0.2174, + "BBH": 0.3354, + "MATH Level 5": 0.0219, + "GPQA": 0.2676, + "MUSR": 0.3513, + "MMLU-PRO": 0.1641 + } + }, + { + "model_id": "KingNish/Reasoning-Llama-3b-v0.1", + "name": "Reasoning-Llama-3b-v0.1", + "developer": "KingNish", + "scores": { + "IFEval": 0.6225, + "BBH": 0.4343, + "MATH Level 5": 0.1299, + "GPQA": 0.2592, + "MUSR": 0.3168, + "MMLU-PRO": 0.3029 + } + }, + { + "model_id": "KingNish/qwen-1b-continued", + "name": "qwen-1b-continued", + "developer": "KingNish", + "scores": { + "IFEval": 0.1255, + "BBH": 0.2991, + "MATH Level 5": 0.0091, + "GPQA": 0.2676, + "MUSR": 0.3859, + "MMLU-PRO": 0.1261 + } + }, + { + "model_id": "KingNish/qwen-1b-continued-v2", + "name": "qwen-1b-continued-v2", + "developer": "KingNish", + "scores": { + "IFEval": 0.1579, + "BBH": 0.3119, + "MATH Level 5": 0.0106, + "GPQA": 0.25, + "MUSR": 0.3393, + "MMLU-PRO": 0.1193 + } + }, + { + "model_id": "KingNish/qwen-1b-continued-v2.1", + "name": "qwen-1b-continued-v2.1", + "developer": "KingNish", + "scores": { + "IFEval": 0.1127, + "BBH": 0.3042, + "MATH Level 5": 0.0091, + "GPQA": 0.2676, + "MUSR": 0.4154, + "MMLU-PRO": 0.1278 + } + }, + { + "model_id": "KingNish/qwen-1b-continued-v2.2", + "name": "qwen-1b-continued-v2.2", + "developer": "KingNish", + "scores": { + "IFEval": 0.1413, + "BBH": 0.3059, + "MATH Level 5": 0.0151, + "GPQA": 0.2567, + "MUSR": 0.3513, + "MMLU-PRO": 0.1262 + } + }, + { + "model_id": "Kquant03/CognitiveFusion2-4x7B-BF16", + "name": "CognitiveFusion2-4x7B-BF16", + "developer": "Kquant03", + "scores": { + "IFEval": 0.3567, + "BBH": 0.4108, + "MATH Level 5": 0.0574, + "GPQA": 0.2861, + "MUSR": 0.4146, + "MMLU-PRO": 0.2793 + } + }, + { + "model_id": "Kquant03/L3-Pneuma-8B", + "name": "L3-Pneuma-8B", + "developer": "Kquant03", + "scores": { + "IFEval": 0.2374, + "BBH": 0.4955, + "MATH Level 5": 0.0506, + "GPQA": 0.307, + "MUSR": 0.4172, + "MMLU-PRO": 0.3184 + } + }, + { + "model_id": "Krystalan/DRT-o1-14B", + "name": "DRT-o1-14B", + "developer": "Krystalan", + "scores": { + "IFEval": 0.4068, + "BBH": 0.6379, + "MATH Level 5": 0.4826, + "GPQA": 0.3523, + "MUSR": 0.4795, + "MMLU-PRO": 0.5179 + } + }, + { + "model_id": "Krystalan/DRT-o1-7B", + "name": "DRT-o1-7B", + "developer": "Krystalan", + "scores": { + "IFEval": 0.3928, + "BBH": 0.5468, + "MATH Level 5": 0.4479, + "GPQA": 0.3213, + "MUSR": 0.5087, + "MMLU-PRO": 0.4151 + } + }, + { + "model_id": "Kukedlc/NeuralExperiment-7b-MagicCoder-v7.5", + "name": "NeuralExperiment-7b-MagicCoder-v7.5", + "developer": "Kukedlc", + "scores": { + "IFEval": 0.4553, + "BBH": 0.3988, + "MATH Level 5": 0.0665, + "GPQA": 0.2961, + "MUSR": 0.4282, + "MMLU-PRO": 0.2824 + } + }, + { + "model_id": "Kukedlc/NeuralLLaMa-3-8b-DT-v0.1", + "name": "NeuralLLaMa-3-8b-DT-v0.1", + "developer": "Kukedlc", + "scores": { + "IFEval": 0.4371, + "BBH": 0.4987, + "MATH Level 5": 0.0808, + "GPQA": 0.3029, + "MUSR": 0.4071, + "MMLU-PRO": 0.3792 + } + }, + { + "model_id": "Kukedlc/NeuralLLaMa-3-8b-ORPO-v0.3", + "name": "NeuralLLaMa-3-8b-ORPO-v0.3", + "developer": "Kukedlc", + "scores": { + "IFEval": 0.5276, + "BBH": 0.4557, + "MATH Level 5": 0.0483, + "GPQA": 0.2391, + "MUSR": 0.37, + "MMLU-PRO": 0.3057 + } + }, + { + "model_id": "Kukedlc/NeuralSynthesis-7B-v0.1", + "name": "NeuralSynthesis-7B-v0.1", + "developer": "Kukedlc", + "scores": { + "IFEval": 0.4185, + "BBH": 0.5145, + "MATH Level 5": 0.0634, + "GPQA": 0.281, + "MUSR": 0.4333, + "MMLU-PRO": 0.3049 + } + }, + { + "model_id": "Kukedlc/NeuralSynthesis-7B-v0.3", + "name": "NeuralSynthesis-7B-v0.3", + "developer": "Kukedlc", + "scores": { + "IFEval": 0.4078, + "BBH": 0.5138, + "MATH Level 5": 0.0778, + "GPQA": 0.2802, + "MUSR": 0.4346, + "MMLU-PRO": 0.305 + } + }, + { + "model_id": "Kukedlc/NeuralSynthesis-7b-v0.4-slerp", + "name": "NeuralSynthesis-7b-v0.4-slerp", + "developer": "Kukedlc", + "scores": { + "IFEval": 0.3947, + "BBH": 0.5143, + "MATH Level 5": 0.0627, + "GPQA": 0.2777, + "MUSR": 0.4332, + "MMLU-PRO": 0.3043 + } + }, + { + "model_id": "Kukedlc/Qwen-2.5-7b-Spanish-o1-CoT", + "name": "Qwen-2.5-7b-Spanish-o1-CoT", + "developer": "Kukedlc", + "scores": { + "IFEval": 0.421, + "BBH": 0.5602, + "MATH Level 5": 0.2727, + "GPQA": 0.3205, + "MUSR": 0.4777, + "MMLU-PRO": 0.4363 + } + }, + { + "model_id": "Kumar955/Hemanth-llm", + "name": "Hemanth-llm", + "developer": "Kumar955", + "scores": { + "IFEval": 0.5045, + "BBH": 0.5225, + "MATH Level 5": 0.0702, + "GPQA": 0.2827, + "MUSR": 0.4486, + "MMLU-PRO": 0.3113 + } + }, + { + "model_id": "L-RAGE/3_PRYMMAL-ECE-7B-SLERP-V1", + "name": "3_PRYMMAL-ECE-7B-SLERP-V1", + "developer": "L-RAGE", + "scores": { + "IFEval": 0.2742, + "BBH": 0.4228, + "MATH Level 5": 0.108, + "GPQA": 0.2819, + "MUSR": 0.3841, + "MMLU-PRO": 0.2925 + } + }, + { + "model_id": "LEESM/llama-2-7b-hf-lora-oki100p", + "name": "llama-2-7b-hf-lora-oki100p", + "developer": "LEESM", + "scores": { + "IFEval": 0.2513, + "BBH": 0.3492, + "MATH Level 5": 0.0166, + "GPQA": 0.2693, + "MUSR": 0.3687, + "MMLU-PRO": 0.1856 + } + }, + { + "model_id": "LEESM/llama-2-7b-hf-lora-oki10p", + "name": "llama-2-7b-hf-lora-oki10p", + "developer": "LEESM", + "scores": { + "IFEval": 0.227, + "BBH": 0.3531, + "MATH Level 5": 0.0166, + "GPQA": 0.2542, + "MUSR": 0.3475, + "MMLU-PRO": 0.1679 + } + }, + { + "model_id": "LEESM/llama-3-8b-bnb-4b-kowiki231101", + "name": "llama-3-8b-bnb-4b-kowiki231101", + "developer": "LEESM", + "scores": { + "IFEval": 0.1685, + "BBH": 0.4131, + "MATH Level 5": 0.0136, + "GPQA": 0.271, + "MUSR": 0.3551, + "MMLU-PRO": 0.2425 + } + }, + { + "model_id": "LEESM/llama-3-Korean-Bllossom-8B-trexlab-oki10p", + "name": "llama-3-Korean-Bllossom-8B-trexlab-oki10p", + "developer": "LEESM", + "scores": { + "IFEval": 0.2137, + "BBH": 0.4343, + "MATH Level 5": 0.0468, + "GPQA": 0.2752, + "MUSR": 0.3869, + "MMLU-PRO": 0.3177 + } + }, + { + "model_id": "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", + "name": "EXAONE-3.0-7.8B-Instruct", + "developer": "LGAI-EXAONE", + "scores": { + "IFEval": 0.7193, + "BBH": 0.4174, + "MATH Level 5": 0.3044, + "GPQA": 0.2659, + "MUSR": 0.3661, + "MMLU-PRO": 0.3577 + } + }, + { + "model_id": "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct", + "name": "EXAONE-3.5-2.4B-Instruct", + "developer": "LGAI-EXAONE", + "scores": { + "IFEval": 0.795, + "BBH": 0.4092, + "MATH Level 5": 0.3678, + "GPQA": 0.2659, + "MUSR": 0.3661, + "MMLU-PRO": 0.328 + } + }, + { + "model_id": "LGAI-EXAONE/EXAONE-3.5-32B-Instruct", + "name": "EXAONE-3.5-32B-Instruct", + "developer": "LGAI-EXAONE", + "scores": { + "IFEval": 0.8392, + "BBH": 0.5761, + "MATH Level 5": 0.5128, + "GPQA": 0.2878, + "MUSR": 0.3807, + "MMLU-PRO": 0.4637 + } + }, + { + "model_id": "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct", + "name": "EXAONE-3.5-7.8B-Instruct", + "developer": "LGAI-EXAONE", + "scores": { + "IFEval": 0.8136, + "BBH": 0.4728, + "MATH Level 5": 0.4751, + "GPQA": 0.2576, + "MUSR": 0.3779, + "MMLU-PRO": 0.4133 + } + }, + { + "model_id": "LLM360/K2", + "name": "K2", + "developer": "LLM360", + "scores": { + "IFEval": 0.2252, + "BBH": 0.4972, + "MATH Level 5": 0.0272, + "GPQA": 0.2768, + "MUSR": 0.398, + "MMLU-PRO": 0.3004 + } + }, + { + "model_id": "LLM360/K2-Chat", + "name": "K2-Chat", + "developer": "LLM360", + "scores": { + "IFEval": 0.5152, + "BBH": 0.5358, + "MATH Level 5": 0.1035, + "GPQA": 0.3062, + "MUSR": 0.457, + "MMLU-PRO": 0.3371 + } + }, + { + "model_id": "LLM4Binary/llm4decompile-1.3b-v2", + "name": "llm4decompile-1.3b-v2", + "developer": "LLM4Binary", + "scores": { + "IFEval": 0.2268, + "BBH": 0.3272, + "MATH Level 5": 0.0128, + "GPQA": 0.2357, + "MUSR": 0.4072, + "MMLU-PRO": 0.1209 + } + }, + { + "model_id": "Lambent/qwen2.5-reinstruct-alternate-lumen-14B", + "name": "qwen2.5-reinstruct-alternate-lumen-14B", + "developer": "Lambent", + "scores": { + "IFEval": 0.4794, + "BBH": 0.6459, + "MATH Level 5": 0.4622, + "GPQA": 0.3767, + "MUSR": 0.477, + "MMLU-PRO": 0.5388 + } + }, + { + "model_id": "Langboat/Mengzi3-8B-Chat", + "name": "Mengzi3-8B-Chat", + "developer": "Langboat", + "scores": { + "IFEval": 0.514, + "BBH": 0.4684, + "MATH Level 5": 0.0906, + "GPQA": 0.2743, + "MUSR": 0.4078, + "MMLU-PRO": 0.3142 + } + }, + { + "model_id": "Lawnakk/BBA100", + "name": "BBA100", + "developer": "Lawnakk", + "scores": { + "IFEval": 0.2076, + "BBH": 0.2826, + "MATH Level 5": 0.0098, + "GPQA": 0.2441, + "MUSR": 0.402, + "MMLU-PRO": 0.1122 + } + }, + { + "model_id": "Lawnakk/BBALAW1", + "name": "BBALAW1", + "developer": "Lawnakk", + "scores": { + "IFEval": 0.1905, + "BBH": 0.2872, + "MATH Level 5": 0.0098, + "GPQA": 0.2433, + "MUSR": 0.4153, + "MMLU-PRO": 0.1121 + } + }, + { + "model_id": "Lawnakk/BBALAW1.0", + "name": "BBALAW1.0", + "developer": "Lawnakk", + "scores": { + "IFEval": 0.1351, + "BBH": 0.2828, + "MATH Level 5": 0.0, + "GPQA": 0.2559, + "MUSR": 0.3526, + "MMLU-PRO": 0.1128 + } + }, + { + "model_id": "Lawnakk/BBALAW1.2", + "name": "BBALAW1.2", + "developer": "Lawnakk", + "scores": { + "IFEval": 0.1354, + "BBH": 0.2811, + "MATH Level 5": 0.0, + "GPQA": 0.2643, + "MUSR": 0.3579, + "MMLU-PRO": 0.1105 + } + }, + { + "model_id": "Lawnakk/BBALAW1.3", + "name": "BBALAW1.3", + "developer": "Lawnakk", + "scores": { + "IFEval": 0.1354, + "BBH": 0.2827, + "MATH Level 5": 0.0, + "GPQA": 0.2609, + "MUSR": 0.3619, + "MMLU-PRO": 0.1094 + } + }, + { + "model_id": "Lawnakk/BBALAW1.6", + "name": "BBALAW1.6", + "developer": "Lawnakk", + "scores": { + "IFEval": 0.5245, + "BBH": 0.5554, + "MATH Level 5": 0.3603, + "GPQA": 0.3238, + "MUSR": 0.4368, + "MMLU-PRO": 0.4507 + } + }, + { + "model_id": "Lawnakk/BBALAW1.61", + "name": "BBALAW1.61", + "developer": "Lawnakk", + "scores": { + "IFEval": 0.5771, + "BBH": 0.5549, + "MATH Level 5": 0.3663, + "GPQA": 0.3171, + "MUSR": 0.4355, + "MMLU-PRO": 0.4471 + } + }, + { + "model_id": "Lawnakk/BBALAW1.62", + "name": "BBALAW1.62", + "developer": "Lawnakk", + "scores": { + "IFEval": 0.5046, + "BBH": 0.5581, + "MATH Level 5": 0.2825, + "GPQA": 0.3196, + "MUSR": 0.4343, + "MMLU-PRO": 0.4545 + } + }, + { + "model_id": "Lawnakk/BBALAW1.63", + "name": "BBALAW1.63", + "developer": "Lawnakk", + "scores": { + "IFEval": 0.4407, + "BBH": 0.5541, + "MATH Level 5": 0.3701, + "GPQA": 0.3121, + "MUSR": 0.4303, + "MMLU-PRO": 0.4471 + } + }, + { + "model_id": "Lawnakk/BBALAW1.64", + "name": "BBALAW1.64", + "developer": "Lawnakk", + "scores": { + "IFEval": 0.1395, + "BBH": 0.2779, + "MATH Level 5": 0.0, + "GPQA": 0.2483, + "MUSR": 0.3447, + "MMLU-PRO": 0.1115 + } + }, + { + "model_id": "LenguajeNaturalAI/leniachat-gemma-2b-v0", + "name": "leniachat-gemma-2b-v0", + "developer": "LenguajeNaturalAI", + "scores": { + "IFEval": 0.215, + "BBH": 0.3074, + "MATH Level 5": 0.0113, + "GPQA": 0.2659, + "MUSR": 0.3659, + "MMLU-PRO": 0.117 + } + }, + { + "model_id": "LenguajeNaturalAI/leniachat-qwen2-1.5B-v0", + "name": "leniachat-qwen2-1.5B-v0", + "developer": "LenguajeNaturalAI", + "scores": { + "IFEval": 0.2221, + "BBH": 0.3684, + "MATH Level 5": 0.0128, + "GPQA": 0.2617, + "MUSR": 0.375, + "MMLU-PRO": 0.188 + } + }, + { + "model_id": "LeroyDyer/CheckPoint_A", + "name": "CheckPoint_A", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.4513, + "BBH": 0.4748, + "MATH Level 5": 0.0589, + "GPQA": 0.2836, + "MUSR": 0.4231, + "MMLU-PRO": 0.288 + } + }, + { + "model_id": "LeroyDyer/CheckPoint_B", + "name": "CheckPoint_B", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.444, + "BBH": 0.478, + "MATH Level 5": 0.0718, + "GPQA": 0.2903, + "MUSR": 0.3898, + "MMLU-PRO": 0.2907 + } + }, + { + "model_id": "LeroyDyer/CheckPoint_C", + "name": "CheckPoint_C", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.3477, + "BBH": 0.4586, + "MATH Level 5": 0.0551, + "GPQA": 0.271, + "MUSR": 0.4346, + "MMLU-PRO": 0.3021 + } + }, + { + "model_id": "LeroyDyer/CheckPoint_R1", + "name": "CheckPoint_R1", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.1728, + "BBH": 0.4225, + "MATH Level 5": 0.0431, + "GPQA": 0.2743, + "MUSR": 0.4031, + "MMLU-PRO": 0.2205 + } + }, + { + "model_id": "LeroyDyer/LCARS_AI_001", + "name": "LCARS_AI_001", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.3109, + "BBH": 0.4258, + "MATH Level 5": 0.0234, + "GPQA": 0.2634, + "MUSR": 0.4384, + "MMLU-PRO": 0.267 + } + }, + { + "model_id": "LeroyDyer/LCARS_AI_1x4_003_SuperAI", + "name": "LCARS_AI_1x4_003_SuperAI", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.4111, + "BBH": 0.492, + "MATH Level 5": 0.0574, + "GPQA": 0.2827, + "MUSR": 0.4506, + "MMLU-PRO": 0.2972 + } + }, + { + "model_id": "LeroyDyer/LCARS_AI_StarTrek_Computer", + "name": "LCARS_AI_StarTrek_Computer", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.3583, + "BBH": 0.4446, + "MATH Level 5": 0.0408, + "GPQA": 0.2676, + "MUSR": 0.395, + "MMLU-PRO": 0.2458 + } + }, + { + "model_id": "LeroyDyer/LCARS_TOP_SCORE", + "name": "LCARS_TOP_SCORE", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.4371, + "BBH": 0.5127, + "MATH Level 5": 0.0672, + "GPQA": 0.2861, + "MUSR": 0.4293, + "MMLU-PRO": 0.3031 + } + }, + { + "model_id": "LeroyDyer/Mixtral_AI_SwahiliTron_7b", + "name": "Mixtral_AI_SwahiliTron_7b", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.1534, + "BBH": 0.3055, + "MATH Level 5": 0.0136, + "GPQA": 0.2651, + "MUSR": 0.342, + "MMLU-PRO": 0.1208 + } + }, + { + "model_id": "LeroyDyer/SpydazWebAI_Human_AGI", + "name": "SpydazWebAI_Human_AGI", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.3388, + "BBH": 0.3375, + "MATH Level 5": 0.0144, + "GPQA": 0.2827, + "MUSR": 0.3966, + "MMLU-PRO": 0.1479 + } + }, + { + "model_id": "LeroyDyer/SpydazWebAI_Human_AGI_001", + "name": "SpydazWebAI_Human_AGI_001", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.3118, + "BBH": 0.3433, + "MATH Level 5": 0.0196, + "GPQA": 0.2987, + "MUSR": 0.3994, + "MMLU-PRO": 0.1426 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_CyberTron_Ultra_7b", + "name": "SpydazWeb_AI_CyberTron_Ultra_7b", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.1556, + "BBH": 0.4811, + "MATH Level 5": 0.0136, + "GPQA": 0.2928, + "MUSR": 0.4136, + "MMLU-PRO": 0.2866 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAGI_001_M2", + "name": "SpydazWeb_AI_HumanAGI_001_M2", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.394, + "BBH": 0.4888, + "MATH Level 5": 0.0385, + "GPQA": 0.2894, + "MUSR": 0.4503, + "MMLU-PRO": 0.3005 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAGI_002", + "name": "SpydazWeb_AI_HumanAGI_002", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.4088, + "BBH": 0.5044, + "MATH Level 5": 0.0665, + "GPQA": 0.2869, + "MUSR": 0.4865, + "MMLU-PRO": 0.3059 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAI_001", + "name": "SpydazWeb_AI_HumanAI_001", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.2252, + "BBH": 0.3344, + "MATH Level 5": 0.0166, + "GPQA": 0.2886, + "MUSR": 0.386, + "MMLU-PRO": 0.1271 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAI_006", + "name": "SpydazWeb_AI_HumanAI_006", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.143, + "BBH": 0.3302, + "MATH Level 5": 0.0106, + "GPQA": 0.2802, + "MUSR": 0.3568, + "MMLU-PRO": 0.1135 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAI_007", + "name": "SpydazWeb_AI_HumanAI_007", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.3352, + "BBH": 0.3416, + "MATH Level 5": 0.0227, + "GPQA": 0.2886, + "MUSR": 0.4096, + "MMLU-PRO": 0.1352 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAI_009_CHAT", + "name": "SpydazWeb_AI_HumanAI_009_CHAT", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.2973, + "BBH": 0.3307, + "MATH Level 5": 0.0166, + "GPQA": 0.281, + "MUSR": 0.4138, + "MMLU-PRO": 0.1433 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAI_010_CHAT", + "name": "SpydazWeb_AI_HumanAI_010_CHAT", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.2507, + "BBH": 0.3336, + "MATH Level 5": 0.0181, + "GPQA": 0.2592, + "MUSR": 0.4137, + "MMLU-PRO": 0.143 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT", + "name": "SpydazWeb_AI_HumanAI_011_INSTRUCT", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.3149, + "BBH": 0.3523, + "MATH Level 5": 0.0144, + "GPQA": 0.2794, + "MUSR": 0.3831, + "MMLU-PRO": 0.1595 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML", + "name": "SpydazWeb_AI_HumanAI_011_INSTRUCT_ML", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.3752, + "BBH": 0.3984, + "MATH Level 5": 0.0257, + "GPQA": 0.2928, + "MUSR": 0.4239, + "MMLU-PRO": 0.2019 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1", + "name": "SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.405, + "BBH": 0.4858, + "MATH Level 5": 0.0551, + "GPQA": 0.2928, + "MUSR": 0.3921, + "MMLU-PRO": 0.2956 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA", + "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_IA", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.3036, + "BBH": 0.4575, + "MATH Level 5": 0.0446, + "GPQA": 0.3012, + "MUSR": 0.4253, + "MMLU-PRO": 0.2329 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_MX", + "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_MX", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.3066, + "BBH": 0.3158, + "MATH Level 5": 0.0151, + "GPQA": 0.2911, + "MUSR": 0.3444, + "MMLU-PRO": 0.1107 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA", + "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_XA", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.3579, + "BBH": 0.4477, + "MATH Level 5": 0.0423, + "GPQA": 0.3096, + "MUSR": 0.4134, + "MMLU-PRO": 0.2376 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAI_RP", + "name": "SpydazWeb_AI_HumanAI_RP", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.2541, + "BBH": 0.3323, + "MATH Level 5": 0.0128, + "GPQA": 0.2752, + "MUSR": 0.3883, + "MMLU-PRO": 0.1324 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_AI_HumanAI_TextVision", + "name": "SpydazWeb_AI_HumanAI_TextVision", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.3063, + "BBH": 0.3354, + "MATH Level 5": 0.0144, + "GPQA": 0.2919, + "MUSR": 0.3938, + "MMLU-PRO": 0.1387 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_HumanAI_M1", + "name": "SpydazWeb_HumanAI_M1", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.3582, + "BBH": 0.3563, + "MATH Level 5": 0.0249, + "GPQA": 0.2676, + "MUSR": 0.3671, + "MMLU-PRO": 0.1663 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_HumanAI_M2", + "name": "SpydazWeb_HumanAI_M2", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.375, + "BBH": 0.3931, + "MATH Level 5": 0.0287, + "GPQA": 0.2794, + "MUSR": 0.3751, + "MMLU-PRO": 0.201 + } + }, + { + "model_id": "LeroyDyer/SpydazWeb_HumanAI_M3", + "name": "SpydazWeb_HumanAI_M3", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.1579, + "BBH": 0.3127, + "MATH Level 5": 0.0091, + "GPQA": 0.271, + "MUSR": 0.3914, + "MMLU-PRO": 0.1149 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_12", + "name": "_Spydaz_Web_AI_12", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.2765, + "BBH": 0.3163, + "MATH Level 5": 0.0136, + "GPQA": 0.2685, + "MUSR": 0.3582, + "MMLU-PRO": 0.1137 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_14", + "name": "_Spydaz_Web_AI_14", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.1812, + "BBH": 0.2989, + "MATH Level 5": 0.0121, + "GPQA": 0.2659, + "MUSR": 0.3395, + "MMLU-PRO": 0.1139 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_001", + "name": "_Spydaz_Web_AI_AGI_R1_001", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.4505, + "BBH": 0.4609, + "MATH Level 5": 0.0634, + "GPQA": 0.2676, + "MUSR": 0.4256, + "MMLU-PRO": 0.2734 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_002", + "name": "_Spydaz_Web_AI_AGI_R1_002", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.5307, + "BBH": 0.4683, + "MATH Level 5": 0.0582, + "GPQA": 0.2685, + "MUSR": 0.4255, + "MMLU-PRO": 0.2894 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_MUSR", + "name": "_Spydaz_Web_AI_AGI_R1_MUSR", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.4786, + "BBH": 0.4672, + "MATH Level 5": 0.0604, + "GPQA": 0.2844, + "MUSR": 0.4869, + "MMLU-PRO": 0.2828 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_MasterCoder", + "name": "_Spydaz_Web_AI_AGI_R1_MasterCoder", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.4143, + "BBH": 0.4689, + "MATH Level 5": 0.0612, + "GPQA": 0.276, + "MUSR": 0.472, + "MMLU-PRO": 0.2719 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_001", + "name": "_Spydaz_Web_AI_AGI_R1_Math_001", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.4571, + "BBH": 0.4818, + "MATH Level 5": 0.0695, + "GPQA": 0.2768, + "MUSR": 0.4778, + "MMLU-PRO": 0.2681 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_003", + "name": "_Spydaz_Web_AI_AGI_R1_Math_003", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.62, + "BBH": 0.4756, + "MATH Level 5": 0.0695, + "GPQA": 0.281, + "MUSR": 0.4202, + "MMLU-PRO": 0.2999 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent", + "name": "_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.5951, + "BBH": 0.4927, + "MATH Level 5": 0.0544, + "GPQA": 0.2919, + "MUSR": 0.5198, + "MMLU-PRO": 0.3 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Student", + "name": "_Spydaz_Web_AI_AGI_R1_Math_Student", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.5736, + "BBH": 0.4881, + "MATH Level 5": 0.0514, + "GPQA": 0.2903, + "MUSR": 0.5098, + "MMLU-PRO": 0.2927 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Teacher", + "name": "_Spydaz_Web_AI_AGI_R1_Math_Teacher", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.5772, + "BBH": 0.4805, + "MATH Level 5": 0.0544, + "GPQA": 0.2861, + "MUSR": 0.5222, + "MMLU-PRO": 0.2956 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_001", + "name": "_Spydaz_Web_AI_AGI_R1_OmG_001", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.5818, + "BBH": 0.4908, + "MATH Level 5": 0.0506, + "GPQA": 0.3003, + "MUSR": 0.4486, + "MMLU-PRO": 0.2906 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_002", + "name": "_Spydaz_Web_AI_AGI_R1_OmG_002", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.5462, + "BBH": 0.4655, + "MATH Level 5": 0.0498, + "GPQA": 0.2785, + "MUSR": 0.4511, + "MMLU-PRO": 0.2867 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Coder", + "name": "_Spydaz_Web_AI_AGI_R1_OmG_Coder", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.4924, + "BBH": 0.4638, + "MATH Level 5": 0.0544, + "GPQA": 0.2735, + "MUSR": 0.5625, + "MMLU-PRO": 0.289 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Math", + "name": "_Spydaz_Web_AI_AGI_R1_OmG_Math", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.5033, + "BBH": 0.4677, + "MATH Level 5": 0.0476, + "GPQA": 0.2827, + "MUSR": 0.4326, + "MMLU-PRO": 0.2913 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_MathMaster", + "name": "_Spydaz_Web_AI_AGI_R1_OmG_MathMaster", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.5558, + "BBH": 0.4742, + "MATH Level 5": 0.0536, + "GPQA": 0.2878, + "MUSR": 0.451, + "MMLU-PRO": 0.2672 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Student_Coder", + "name": "_Spydaz_Web_AI_AGI_R1_Student_Coder", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.545, + "BBH": 0.4651, + "MATH Level 5": 0.0657, + "GPQA": 0.2844, + "MUSR": 0.4388, + "MMLU-PRO": 0.2768 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Teacher_Coder", + "name": "_Spydaz_Web_AI_AGI_R1_Teacher_Coder", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.5082, + "BBH": 0.4797, + "MATH Level 5": 0.065, + "GPQA": 0.2911, + "MUSR": 0.4338, + "MMLU-PRO": 0.2845 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Top_Student", + "name": "_Spydaz_Web_AI_AGI_R1_Top_Student", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.604, + "BBH": 0.4988, + "MATH Level 5": 0.0725, + "GPQA": 0.2727, + "MUSR": 0.5398, + "MMLU-PRO": 0.3024 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_X1", + "name": "_Spydaz_Web_AI_AGI_R1_X1", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.4273, + "BBH": 0.4759, + "MATH Level 5": 0.0566, + "GPQA": 0.2601, + "MUSR": 0.4232, + "MMLU-PRO": 0.2891 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_X2", + "name": "_Spydaz_Web_AI_AGI_R1_X2", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.5434, + "BBH": 0.4786, + "MATH Level 5": 0.0612, + "GPQA": 0.2978, + "MUSR": 0.4695, + "MMLU-PRO": 0.2921 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_AGI_RP_R1", + "name": "_Spydaz_Web_AI_AGI_RP_R1", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.5426, + "BBH": 0.4701, + "MATH Level 5": 0.0604, + "GPQA": 0.2693, + "MUSR": 0.4201, + "MMLU-PRO": 0.2894 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_BIBLE_002", + "name": "_Spydaz_Web_AI_BIBLE_002", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.2195, + "BBH": 0.3289, + "MATH Level 5": 0.0174, + "GPQA": 0.2844, + "MUSR": 0.3407, + "MMLU-PRO": 0.1368 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_ChatML_002", + "name": "_Spydaz_Web_AI_ChatML_002", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.2412, + "BBH": 0.3106, + "MATH Level 5": 0.0113, + "GPQA": 0.2576, + "MUSR": 0.3623, + "MMLU-PRO": 0.1095 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_ChatQA", + "name": "_Spydaz_Web_AI_ChatQA", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.1415, + "BBH": 0.3236, + "MATH Level 5": 0.0098, + "GPQA": 0.2659, + "MUSR": 0.3447, + "MMLU-PRO": 0.1475 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_ChatQA_003", + "name": "_Spydaz_Web_AI_ChatQA_003", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.2209, + "BBH": 0.3172, + "MATH Level 5": 0.0106, + "GPQA": 0.271, + "MUSR": 0.3818, + "MMLU-PRO": 0.1133 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_TEMP_", + "name": "_Spydaz_Web_AI_TEMP_", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.4795, + "BBH": 0.4957, + "MATH Level 5": 0.1239, + "GPQA": 0.2794, + "MUSR": 0.4218, + "MMLU-PRO": 0.3121 + } + }, + { + "model_id": "LeroyDyer/_Spydaz_Web_AI_Top_Teacher_", + "name": "_Spydaz_Web_AI_Top_Teacher_", + "developer": "LeroyDyer", + "scores": { + "IFEval": 0.4404, + "BBH": 0.4891, + "MATH Level 5": 0.1156, + "GPQA": 0.2777, + "MUSR": 0.4366, + "MMLU-PRO": 0.315 + } + }, + { + "model_id": "LightningRodLabs/Flashlight-v1.0", + "name": "Flashlight-v1.0", + "developer": "LightningRodLabs", + "scores": { + "IFEval": 0.6745, + "BBH": 0.6877, + "MATH Level 5": 0.497, + "GPQA": 0.3423, + "MUSR": 0.4101, + "MMLU-PRO": 0.5402 + } + }, + { + "model_id": "LightningRodLabs/Flashlight-v1.1", + "name": "Flashlight-v1.1", + "developer": "LightningRodLabs", + "scores": { + "IFEval": 0.6721, + "BBH": 0.6901, + "MATH Level 5": 0.5325, + "GPQA": 0.3398, + "MUSR": 0.4048, + "MMLU-PRO": 0.5416 + } + }, + { + "model_id": "LightningRodLabs/Flashlight-v1.2", + "name": "Flashlight-v1.2", + "developer": "LightningRodLabs", + "scores": { + "IFEval": 0.436, + "BBH": 0.3265, + "MATH Level 5": 0.1556, + "GPQA": 0.2357, + "MUSR": 0.4554, + "MMLU-PRO": 0.2485 + } + }, + { + "model_id": "Lil-R/2_PRYMMAL-ECE-2B-SLERP-V1", + "name": "2_PRYMMAL-ECE-2B-SLERP-V1", + "developer": "Lil-R", + "scores": { + "IFEval": 0.5823, + "BBH": 0.4287, + "MATH Level 5": 0.0914, + "GPQA": 0.3062, + "MUSR": 0.4375, + "MMLU-PRO": 0.2678 + } + }, + { + "model_id": "Lil-R/2_PRYMMAL-ECE-2B-SLERP-V2", + "name": "2_PRYMMAL-ECE-2B-SLERP-V2", + "developer": "Lil-R", + "scores": { + "IFEval": 0.5543, + "BBH": 0.4376, + "MATH Level 5": 0.0944, + "GPQA": 0.2978, + "MUSR": 0.4482, + "MMLU-PRO": 0.2744 + } + }, + { + "model_id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP", + "name": "2_PRYMMAL-ECE-7B-SLERP", + "developer": "Lil-R", + "scores": { + "IFEval": 0.5577, + "BBH": 0.5557, + "MATH Level 5": 0.3633, + "GPQA": 0.3104, + "MUSR": 0.4396, + "MMLU-PRO": 0.4507 + } + }, + { + "model_id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP-V1", + "name": "2_PRYMMAL-ECE-7B-SLERP-V1", + "developer": "Lil-R", + "scores": { + "IFEval": 0.1073, + "BBH": 0.3053, + "MATH Level 5": 0.0008, + "GPQA": 0.2508, + "MUSR": 0.3911, + "MMLU-PRO": 0.1124 + } + }, + { + "model_id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP-V2", + "name": "2_PRYMMAL-ECE-7B-SLERP-V2", + "developer": "Lil-R", + "scores": { + "IFEval": 0.1073, + "BBH": 0.3053, + "MATH Level 5": 0.0008, + "GPQA": 0.2508, + "MUSR": 0.3911, + "MMLU-PRO": 0.1124 + } + }, + { + "model_id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP-V3", + "name": "2_PRYMMAL-ECE-7B-SLERP-V3", + "developer": "Lil-R", + "scores": { + "IFEval": 0.2235, + "BBH": 0.3578, + "MATH Level 5": 0.006, + "GPQA": 0.2567, + "MUSR": 0.4107, + "MMLU-PRO": 0.1817 + } + }, + { + "model_id": "Lil-R/PRYMMAL-ECE-1B-SLERP-V1", + "name": "PRYMMAL-ECE-1B-SLERP-V1", + "developer": "Lil-R", + "scores": { + "IFEval": 0.2874, + "BBH": 0.419, + "MATH Level 5": 0.1035, + "GPQA": 0.276, + "MUSR": 0.3974, + "MMLU-PRO": 0.2926 + } + }, + { + "model_id": "Lil-R/PRYMMAL-ECE-7B-SLERP-V8", + "name": "PRYMMAL-ECE-7B-SLERP-V8", + "developer": "Lil-R", + "scores": { + "IFEval": 0.1258, + "BBH": 0.2955, + "MATH Level 5": 0.0098, + "GPQA": 0.25, + "MUSR": 0.3631, + "MMLU-PRO": 0.1128 + } + }, + { + "model_id": "LilRg/10PRYMMAL-3B-slerp", + "name": "10PRYMMAL-3B-slerp", + "developer": "LilRg", + "scores": { + "IFEval": 0.1946, + "BBH": 0.532, + "MATH Level 5": 0.1495, + "GPQA": 0.3213, + "MUSR": 0.4529, + "MMLU-PRO": 0.3881 + } + }, + { + "model_id": "LilRg/ECE-1B-merge-PRYMMAL", + "name": "ECE-1B-merge-PRYMMAL", + "developer": "LilRg", + "scores": { + "IFEval": 0.2712, + "BBH": 0.4235, + "MATH Level 5": 0.1012, + "GPQA": 0.281, + "MUSR": 0.3801, + "MMLU-PRO": 0.2906 + } + }, + { + "model_id": "LilRg/ECE_Finetunning", + "name": "ECE_Finetunning", + "developer": "LilRg", + "scores": { + "IFEval": 0.0445, + "BBH": 0.4732, + "MATH Level 5": 0.0453, + "GPQA": 0.2827, + "MUSR": 0.3839, + "MMLU-PRO": 0.3191 + } + }, + { + "model_id": "LilRg/PRYMMAL-6B-slerp", + "name": "PRYMMAL-6B-slerp", + "developer": "LilRg", + "scores": { + "IFEval": 0.1153, + "BBH": 0.2868, + "MATH Level 5": 0.0, + "GPQA": 0.2458, + "MUSR": 0.3698, + "MMLU-PRO": 0.1108 + } + }, + { + "model_id": "LilRg/PRYMMAL-ECE-7B-SLERP-V3", + "name": "PRYMMAL-ECE-7B-SLERP-V3", + "developer": "LilRg", + "scores": { + "IFEval": 0.1243, + "BBH": 0.2957, + "MATH Level 5": 0.0098, + "GPQA": 0.2567, + "MUSR": 0.3671, + "MMLU-PRO": 0.1127 + } + }, + { + "model_id": "LilRg/PRYMMAL-ECE-7B-SLERP-V4", + "name": "PRYMMAL-ECE-7B-SLERP-V4", + "developer": "LilRg", + "scores": { + "IFEval": 0.1249, + "BBH": 0.2957, + "MATH Level 5": 0.0098, + "GPQA": 0.2567, + "MUSR": 0.3671, + "MMLU-PRO": 0.1127 + } + }, + { + "model_id": "LilRg/PRYMMAL-ECE-7B-SLERP-V5", + "name": "PRYMMAL-ECE-7B-SLERP-V5", + "developer": "LilRg", + "scores": { + "IFEval": 0.1249, + "BBH": 0.2957, + "MATH Level 5": 0.0098, + "GPQA": 0.2567, + "MUSR": 0.3671, + "MMLU-PRO": 0.1127 + } + }, + { + "model_id": "LilRg/PRYMMAL-ECE-7B-SLERP-V6", + "name": "PRYMMAL-ECE-7B-SLERP-V6", + "developer": "LilRg", + "scores": { + "IFEval": 0.1243, + "BBH": 0.2957, + "MATH Level 5": 0.0098, + "GPQA": 0.2567, + "MUSR": 0.3671, + "MMLU-PRO": 0.1127 + } + }, + { + "model_id": "LilRg/PRYMMAL-ECE-7B-SLERP-V7", + "name": "PRYMMAL-ECE-7B-SLERP-V7", + "developer": "LilRg", + "scores": { + "IFEval": 0.1249, + "BBH": 0.2957, + "MATH Level 5": 0.0098, + "GPQA": 0.2567, + "MUSR": 0.3671, + "MMLU-PRO": 0.1127 + } + }, + { + "model_id": "LilRg/PRYMMAL-slerp-Merge", + "name": "PRYMMAL-slerp-Merge", + "developer": "LilRg", + "scores": { + "IFEval": 0.3044, + "BBH": 0.5364, + "MATH Level 5": 0.1616, + "GPQA": 0.3205, + "MUSR": 0.4635, + "MMLU-PRO": 0.3863 + } + }, + { + "model_id": "LimYeri/CodeMind-Llama3-8B-unsloth_v2-merged", + "name": "CodeMind-Llama3-8B-unsloth_v2-merged", + "developer": "LimYeri", + "scores": { + "IFEval": 0.6946, + "BBH": 0.486, + "MATH Level 5": 0.0665, + "GPQA": 0.2651, + "MUSR": 0.3316, + "MMLU-PRO": 0.3506 + } + }, + { + "model_id": "LimYeri/CodeMind-Llama3-8B-unsloth_v3-merged", + "name": "CodeMind-Llama3-8B-unsloth_v3-merged", + "developer": "LimYeri", + "scores": { + "IFEval": 0.6763, + "BBH": 0.4908, + "MATH Level 5": 0.068, + "GPQA": 0.2584, + "MUSR": 0.3356, + "MMLU-PRO": 0.3496 + } + }, + { + "model_id": "LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged", + "name": "CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged", + "developer": "LimYeri", + "scores": { + "IFEval": 0.6492, + "BBH": 0.4853, + "MATH Level 5": 0.068, + "GPQA": 0.2685, + "MUSR": 0.3608, + "MMLU-PRO": 0.3354 + } + }, + { + "model_id": "LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged", + "name": "CodeMind-Llama3-8B-unsloth_v4-one-merged", + "developer": "LimYeri", + "scores": { + "IFEval": 0.3211, + "BBH": 0.4739, + "MATH Level 5": 0.0551, + "GPQA": 0.3096, + "MUSR": 0.4069, + "MMLU-PRO": 0.3353 + } + }, + { + "model_id": "LimYeri/CodeMind-Llama3.1-8B-unsloth-merged", + "name": "CodeMind-Llama3.1-8B-unsloth-merged", + "developer": "LimYeri", + "scores": { + "IFEval": 0.649, + "BBH": 0.4695, + "MATH Level 5": 0.1088, + "GPQA": 0.2643, + "MUSR": 0.3752, + "MMLU-PRO": 0.334 + } + }, + { + "model_id": "Locutusque/CollectiveLM-Falcon-3-7B", + "name": "CollectiveLM-Falcon-3-7B", + "developer": "Locutusque", + "scores": { + "IFEval": 0.3918, + "BBH": 0.5105, + "MATH Level 5": 0.2183, + "GPQA": 0.3255, + "MUSR": 0.3887, + "MMLU-PRO": 0.3599 + } + }, + { + "model_id": "Locutusque/Hercules-6.0-Llama-3.1-8B", + "name": "Hercules-6.0-Llama-3.1-8B", + "developer": "Locutusque", + "scores": { + "IFEval": 0.663, + "BBH": 0.4813, + "MATH Level 5": 0.1669, + "GPQA": 0.2643, + "MUSR": 0.3621, + "MMLU-PRO": 0.3615 + } + }, + { + "model_id": "Locutusque/Hercules-6.1-Llama-3.1-8B", + "name": "Hercules-6.1-Llama-3.1-8B", + "developer": "Locutusque", + "scores": { + "IFEval": 0.6007, + "BBH": 0.4656, + "MATH Level 5": 0.176, + "GPQA": 0.2609, + "MUSR": 0.3553, + "MMLU-PRO": 0.3669 + } + }, + { + "model_id": "Locutusque/Llama-3-NeuralHercules-5.0-8B", + "name": "Llama-3-NeuralHercules-5.0-8B", + "developer": "Locutusque", + "scores": { + "IFEval": 0.4489, + "BBH": 0.394, + "MATH Level 5": 0.0431, + "GPQA": 0.2685, + "MUSR": 0.3881, + "MMLU-PRO": 0.2933 + } + }, + { + "model_id": "Locutusque/Llama-3-Yggdrasil-2.0-8B", + "name": "Llama-3-Yggdrasil-2.0-8B", + "developer": "Locutusque", + "scores": { + "IFEval": 0.5371, + "BBH": 0.4772, + "MATH Level 5": 0.0831, + "GPQA": 0.2626, + "MUSR": 0.3977, + "MMLU-PRO": 0.3167 + } + }, + { + "model_id": "Locutusque/TinyMistral-248M-v2.5", + "name": "TinyMistral-248M-v2.5", + "developer": "Locutusque", + "scores": { + "IFEval": 0.1336, + "BBH": 0.3039, + "MATH Level 5": 0.0098, + "GPQA": 0.2508, + "MUSR": 0.3782, + "MMLU-PRO": 0.1135 + } + }, + { + "model_id": "Luni/StarDust-12b-v1", + "name": "StarDust-12b-v1", + "developer": "Luni", + "scores": { + "IFEval": 0.5459, + "BBH": 0.5366, + "MATH Level 5": 0.0763, + "GPQA": 0.276, + "MUSR": 0.4324, + "MMLU-PRO": 0.3412 + } + }, + { + "model_id": "Luni/StarDust-12b-v2", + "name": "StarDust-12b-v2", + "developer": "Luni", + "scores": { + "IFEval": 0.5629, + "BBH": 0.5419, + "MATH Level 5": 0.0687, + "GPQA": 0.2936, + "MUSR": 0.4338, + "MMLU-PRO": 0.3439 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v3", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v3", + "developer": "Lunzima", + "scores": { + "IFEval": 0.7049, + "BBH": 0.6478, + "MATH Level 5": 0.4162, + "GPQA": 0.3817, + "MUSR": 0.4808, + "MMLU-PRO": 0.5394 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v4", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v4", + "developer": "Lunzima", + "scores": { + "IFEval": 0.6943, + "BBH": 0.642, + "MATH Level 5": 0.3467, + "GPQA": 0.3716, + "MUSR": 0.4769, + "MMLU-PRO": 0.5252 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v5", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v5", + "developer": "Lunzima", + "scores": { + "IFEval": 0.7485, + "BBH": 0.6467, + "MATH Level 5": 0.4358, + "GPQA": 0.3624, + "MUSR": 0.4473, + "MMLU-PRO": 0.514 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v6", + "developer": "Lunzima", + "scores": { + "IFEval": 0.7043, + "BBH": 0.6458, + "MATH Level 5": 0.3958, + "GPQA": 0.3775, + "MUSR": 0.4768, + "MMLU-PRO": 0.5392 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt", + "developer": "Lunzima", + "scores": { + "IFEval": 0.4663, + "BBH": 0.6215, + "MATH Level 5": 0.3316, + "GPQA": 0.3758, + "MUSR": 0.4937, + "MMLU-PRO": 0.5204 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v7", + "developer": "Lunzima", + "scores": { + "IFEval": 0.6794, + "BBH": 0.6531, + "MATH Level 5": 0.4101, + "GPQA": 0.3792, + "MUSR": 0.4834, + "MMLU-PRO": 0.5376 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase", + "developer": "Lunzima", + "scores": { + "IFEval": 0.6931, + "BBH": 0.6423, + "MATH Level 5": 0.3406, + "GPQA": 0.375, + "MUSR": 0.4888, + "MMLU-PRO": 0.5277 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8", + "developer": "Lunzima", + "scores": { + "IFEval": 0.7875, + "BBH": 0.6419, + "MATH Level 5": 0.5559, + "GPQA": 0.3356, + "MUSR": 0.4394, + "MMLU-PRO": 0.5206 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.5", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.5", + "developer": "Lunzima", + "scores": { + "IFEval": 0.5929, + "BBH": 0.6451, + "MATH Level 5": 0.3656, + "GPQA": 0.38, + "MUSR": 0.477, + "MMLU-PRO": 0.529 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.6", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.6", + "developer": "Lunzima", + "scores": { + "IFEval": 0.5919, + "BBH": 0.6457, + "MATH Level 5": 0.4071, + "GPQA": 0.3842, + "MUSR": 0.4953, + "MMLU-PRO": 0.54 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.7", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.7", + "developer": "Lunzima", + "scores": { + "IFEval": 0.7875, + "BBH": 0.6483, + "MATH Level 5": 0.5408, + "GPQA": 0.3515, + "MUSR": 0.4381, + "MMLU-PRO": 0.5242 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.8", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.8", + "developer": "Lunzima", + "scores": { + "IFEval": 0.7028, + "BBH": 0.6566, + "MATH Level 5": 0.4237, + "GPQA": 0.3758, + "MUSR": 0.4912, + "MMLU-PRO": 0.5323 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.9", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.9", + "developer": "Lunzima", + "scores": { + "IFEval": 0.7993, + "BBH": 0.6483, + "MATH Level 5": 0.537, + "GPQA": 0.3297, + "MUSR": 0.4328, + "MMLU-PRO": 0.5199 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9", + "developer": "Lunzima", + "scores": { + "IFEval": 0.5235, + "BBH": 0.6546, + "MATH Level 5": 0.4366, + "GPQA": 0.3884, + "MUSR": 0.4806, + "MMLU-PRO": 0.5422 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9-stock", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9-stock", + "developer": "Lunzima", + "scores": { + "IFEval": 0.6514, + "BBH": 0.6571, + "MATH Level 5": 0.4184, + "GPQA": 0.3842, + "MUSR": 0.482, + "MMLU-PRO": 0.5412 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.1", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9.1", + "developer": "Lunzima", + "scores": { + "IFEval": 0.8003, + "BBH": 0.6555, + "MATH Level 5": 0.5468, + "GPQA": 0.3431, + "MUSR": 0.4354, + "MMLU-PRO": 0.5251 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.2", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9.2", + "developer": "Lunzima", + "scores": { + "IFEval": 0.7862, + "BBH": 0.6538, + "MATH Level 5": 0.5332, + "GPQA": 0.3557, + "MUSR": 0.4381, + "MMLU-PRO": 0.5283 + } + }, + { + "model_id": "Lunzima/NQLSG-Qwen2.5-14B-OriginalFusion", + "name": "NQLSG-Qwen2.5-14B-OriginalFusion", + "developer": "Lunzima", + "scores": { + "IFEval": 0.6142, + "BBH": 0.6592, + "MATH Level 5": 0.4275, + "GPQA": 0.3809, + "MUSR": 0.5122, + "MMLU-PRO": 0.5239 + } + }, + { + "model_id": "Lyte/Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3", + "name": "Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3", + "developer": "Lyte", + "scores": { + "IFEval": 0.7098, + "BBH": 0.495, + "MATH Level 5": 0.1903, + "GPQA": 0.2701, + "MUSR": 0.3461, + "MMLU-PRO": 0.3618 + } + }, + { + "model_id": "Lyte/Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04", + "name": "Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04", + "developer": "Lyte", + "scores": { + "IFEval": 0.5774, + "BBH": 0.3515, + "MATH Level 5": 0.0801, + "GPQA": 0.2601, + "MUSR": 0.3236, + "MMLU-PRO": 0.1843 + } + }, + { + "model_id": "Lyte/Llama-3.2-3B-Overthinker", + "name": "Llama-3.2-3B-Overthinker", + "developer": "Lyte", + "scores": { + "IFEval": 0.6408, + "BBH": 0.432, + "MATH Level 5": 0.1563, + "GPQA": 0.2592, + "MUSR": 0.3419, + "MMLU-PRO": 0.2985 + } + }, + { + "model_id": "M4-ai/TinyMistral-248M-v3", + "name": "TinyMistral-248M-v3", + "developer": "M4-ai", + "scores": { + "IFEval": 0.1639, + "BBH": 0.2885, + "MATH Level 5": 0.0045, + "GPQA": 0.2408, + "MUSR": 0.3793, + "MMLU-PRO": 0.1132 + } + }, + { + "model_id": "MEscriva/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", + "name": "ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", + "developer": "MEscriva", + "scores": { + "IFEval": 0.0866, + "BBH": 0.3057, + "MATH Level 5": 0.0106, + "GPQA": 0.2517, + "MUSR": 0.4017, + "MMLU-PRO": 0.1154 + } + }, + { + "model_id": "MLP-KTLim/llama-3-Korean-Bllossom-8B", + "name": "llama-3-Korean-Bllossom-8B", + "developer": "MLP-KTLim", + "scores": { + "IFEval": 0.5113, + "BBH": 0.49, + "MATH Level 5": 0.102, + "GPQA": 0.2626, + "MUSR": 0.3675, + "MMLU-PRO": 0.3594 + } + }, + { + "model_id": "MTSAIR/Cotype-Nano", + "name": "Cotype-Nano", + "developer": "MTSAIR", + "scores": { + "IFEval": 0.3748, + "BBH": 0.3865, + "MATH Level 5": 0.0974, + "GPQA": 0.2701, + "MUSR": 0.3289, + "MMLU-PRO": 0.2477 + } + }, + { + "model_id": "MTSAIR/MultiVerse_70B", + "name": "MultiVerse_70B", + "developer": "MTSAIR", + "scores": { + "IFEval": 0.5249, + "BBH": 0.6183, + "MATH Level 5": 0.1926, + "GPQA": 0.354, + "MUSR": 0.474, + "MMLU-PRO": 0.486 + } + }, + { + "model_id": "Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.1", + "name": "Llama-3-8B-Magpie-Align-SFT-v0.1", + "developer": "Magpie-Align", + "scores": { + "IFEval": 0.4361, + "BBH": 0.4615, + "MATH Level 5": 0.0574, + "GPQA": 0.2626, + "MUSR": 0.3277, + "MMLU-PRO": 0.2863 + } + }, + { + "model_id": "Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.3", + "name": "Llama-3-8B-Magpie-Align-SFT-v0.3", + "developer": "Magpie-Align", + "scores": { + "IFEval": 0.5064, + "BBH": 0.4572, + "MATH Level 5": 0.0733, + "GPQA": 0.2659, + "MUSR": 0.3424, + "MMLU-PRO": 0.2902 + } + }, + { + "model_id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.1", + "name": "Llama-3-8B-Magpie-Align-v0.1", + "developer": "Magpie-Align", + "scores": { + "IFEval": 0.4027, + "BBH": 0.4789, + "MATH Level 5": 0.0461, + "GPQA": 0.2768, + "MUSR": 0.3087, + "MMLU-PRO": 0.3001 + } + }, + { + "model_id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.3", + "name": "Llama-3-8B-Magpie-Align-v0.3", + "developer": "Magpie-Align", + "scores": { + "IFEval": 0.4497, + "BBH": 0.457, + "MATH Level 5": 0.0566, + "GPQA": 0.2651, + "MUSR": 0.3406, + "MMLU-PRO": 0.3134 + } + }, + { + "model_id": "Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.1", + "name": "Llama-3.1-8B-Magpie-Align-SFT-v0.1", + "developer": "Magpie-Align", + "scores": { + "IFEval": 0.4782, + "BBH": 0.4764, + "MATH Level 5": 0.0899, + "GPQA": 0.2609, + "MUSR": 0.3397, + "MMLU-PRO": 0.2943 + } + }, + { + "model_id": "Magpie-Align/Llama-3.1-8B-Magpie-Align-v0.1", + "name": "Llama-3.1-8B-Magpie-Align-v0.1", + "developer": "Magpie-Align", + "scores": { + "IFEval": 0.4458, + "BBH": 0.4622, + "MATH Level 5": 0.0665, + "GPQA": 0.2634, + "MUSR": 0.3141, + "MMLU-PRO": 0.3262 + } + }, + { + "model_id": "Magpie-Align/MagpieLM-8B-Chat-v0.1", + "name": "MagpieLM-8B-Chat-v0.1", + "developer": "Magpie-Align", + "scores": { + "IFEval": 0.3701, + "BBH": 0.4172, + "MATH Level 5": 0.0612, + "GPQA": 0.2617, + "MUSR": 0.3501, + "MMLU-PRO": 0.3195 + } + }, + { + "model_id": "Magpie-Align/MagpieLM-8B-SFT-v0.1", + "name": "MagpieLM-8B-SFT-v0.1", + "developer": "Magpie-Align", + "scores": { + "IFEval": 0.4721, + "BBH": 0.4553, + "MATH Level 5": 0.0755, + "GPQA": 0.2676, + "MUSR": 0.3649, + "MMLU-PRO": 0.299 + } + }, + { + "model_id": "MagusCorp/grpo_lora_enem_llama3_7b", + "name": "grpo_lora_enem_llama3_7b", + "developer": "MagusCorp", + "scores": { + "IFEval": 0.4724, + "BBH": 0.4801, + "MATH Level 5": 0.1216, + "GPQA": 0.3096, + "MUSR": 0.3971, + "MMLU-PRO": 0.3574 + } + }, + { + "model_id": "ManoloPueblo/ContentCuisine_1-7B-slerp", + "name": "ContentCuisine_1-7B-slerp", + "developer": "ManoloPueblo", + "scores": { + "IFEval": 0.3907, + "BBH": 0.5188, + "MATH Level 5": 0.0733, + "GPQA": 0.3029, + "MUSR": 0.4672, + "MMLU-PRO": 0.3054 + } + }, + { + "model_id": "ManoloPueblo/LLM_MERGE_CC2", + "name": "LLM_MERGE_CC2", + "developer": "ManoloPueblo", + "scores": { + "IFEval": 0.3853, + "BBH": 0.5209, + "MATH Level 5": 0.0642, + "GPQA": 0.3045, + "MUSR": 0.4593, + "MMLU-PRO": 0.3032 + } + }, + { + "model_id": "ManoloPueblo/LLM_MERGE_CC3", + "name": "LLM_MERGE_CC3", + "developer": "ManoloPueblo", + "scores": { + "IFEval": 0.3959, + "BBH": 0.5246, + "MATH Level 5": 0.0793, + "GPQA": 0.3096, + "MUSR": 0.4672, + "MMLU-PRO": 0.3156 + } + }, + { + "model_id": "MarinaraSpaghetti/NemoReRemix-12B", + "name": "NemoReRemix-12B", + "developer": "MarinaraSpaghetti", + "scores": { + "IFEval": 0.3343, + "BBH": 0.5537, + "MATH Level 5": 0.0906, + "GPQA": 0.318, + "MUSR": 0.4501, + "MMLU-PRO": 0.3598 + } + }, + { + "model_id": "MarinaraSpaghetti/Nemomix-v4.0-12B", + "name": "Nemomix-v4.0-12B", + "developer": "MarinaraSpaghetti", + "scores": { + "IFEval": 0.5575, + "BBH": 0.5275, + "MATH Level 5": 0.108, + "GPQA": 0.2919, + "MUSR": 0.4244, + "MMLU-PRO": 0.3613 + } + }, + { + "model_id": "Marsouuu/MiniMathExpert-2_61B-ECE-PRYMMAL-Martial", + "name": "MiniMathExpert-2_61B-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "scores": { + "IFEval": 0.2548, + "BBH": 0.3953, + "MATH Level 5": 0.074, + "GPQA": 0.2752, + "MUSR": 0.4083, + "MMLU-PRO": 0.2274 + } + }, + { + "model_id": "Marsouuu/MiniQwenMathExpert-ECE-PRYMMAL-Martial", + "name": "MiniQwenMathExpert-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "scores": { + "IFEval": 0.2795, + "BBH": 0.423, + "MATH Level 5": 0.114, + "GPQA": 0.2819, + "MUSR": 0.3867, + "MMLU-PRO": 0.2922 + } + }, + { + "model_id": "Marsouuu/MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial", + "name": "MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "scores": { + "IFEval": 0.1697, + "BBH": 0.3464, + "MATH Level 5": 0.0144, + "GPQA": 0.2592, + "MUSR": 0.3991, + "MMLU-PRO": 0.1379 + } + }, + { + "model_id": "Marsouuu/general3B-ECE-PRYMMAL-Martial", + "name": "general3B-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "scores": { + "IFEval": 0.2722, + "BBH": 0.5394, + "MATH Level 5": 0.1548, + "GPQA": 0.3196, + "MUSR": 0.4701, + "MMLU-PRO": 0.3876 + } + }, + { + "model_id": "Marsouuu/general3Bv2-ECE-PRYMMAL-Martial", + "name": "general3Bv2-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "scores": { + "IFEval": 0.5693, + "BBH": 0.5637, + "MATH Level 5": 0.3671, + "GPQA": 0.3104, + "MUSR": 0.4396, + "MMLU-PRO": 0.4498 + } + }, + { + "model_id": "Marsouuu/lareneg1_78B-ECE-PRYMMAL-Martial", + "name": "lareneg1_78B-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "scores": { + "IFEval": 0.2795, + "BBH": 0.423, + "MATH Level 5": 0.114, + "GPQA": 0.2819, + "MUSR": 0.3867, + "MMLU-PRO": 0.2922 + } + }, + { + "model_id": "Marsouuu/lareneg3B-ECE-PRYMMAL-Martial", + "name": "lareneg3B-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "scores": { + "IFEval": 0.3303, + "BBH": 0.5453, + "MATH Level 5": 0.1518, + "GPQA": 0.3247, + "MUSR": 0.4725, + "MMLU-PRO": 0.3767 + } + }, + { + "model_id": "Marsouuu/lareneg3Bv2-ECE-PRYMMAL-Martial", + "name": "lareneg3Bv2-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "scores": { + "IFEval": 0.5753, + "BBH": 0.5623, + "MATH Level 5": 0.3656, + "GPQA": 0.3196, + "MUSR": 0.4369, + "MMLU-PRO": 0.4511 + } + }, + { + "model_id": "MaziyarPanahi/Calme-4x7B-MoE-v0.1", + "name": "Calme-4x7B-MoE-v0.1", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.4315, + "BBH": 0.5103, + "MATH Level 5": 0.0801, + "GPQA": 0.2819, + "MUSR": 0.4199, + "MMLU-PRO": 0.3057 + } + }, + { + "model_id": "MaziyarPanahi/Calme-4x7B-MoE-v0.2", + "name": "Calme-4x7B-MoE-v0.2", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.4294, + "BBH": 0.5111, + "MATH Level 5": 0.074, + "GPQA": 0.2794, + "MUSR": 0.4318, + "MMLU-PRO": 0.3058 + } + }, + { + "model_id": "MaziyarPanahi/Llama-3-70B-Instruct-v0.1", + "name": "Llama-3-70B-Instruct-v0.1", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.4714, + "BBH": 0.5366, + "MATH Level 5": 0.1805, + "GPQA": 0.2844, + "MUSR": 0.4433, + "MMLU-PRO": 0.4618 + } + }, + { + "model_id": "MaziyarPanahi/Llama-3-8B-Instruct-v0.10", + "name": "Llama-3-8B-Instruct-v0.10", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.7667, + "BBH": 0.4924, + "MATH Level 5": 0.0574, + "GPQA": 0.3087, + "MUSR": 0.4214, + "MMLU-PRO": 0.3862 + } + }, + { + "model_id": "MaziyarPanahi/Llama-3-8B-Instruct-v0.8", + "name": "Llama-3-8B-Instruct-v0.8", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.7528, + "BBH": 0.4963, + "MATH Level 5": 0.0778, + "GPQA": 0.3054, + "MUSR": 0.4202, + "MMLU-PRO": 0.3853 + } + }, + { + "model_id": "MaziyarPanahi/Llama-3-8B-Instruct-v0.9", + "name": "Llama-3-8B-Instruct-v0.9", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.763, + "BBH": 0.4936, + "MATH Level 5": 0.0733, + "GPQA": 0.3079, + "MUSR": 0.4148, + "MMLU-PRO": 0.3846 + } + }, + { + "model_id": "MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow", + "name": "Qwen1.5-MoE-A2.7B-Wikihow", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.2954, + "BBH": 0.392, + "MATH Level 5": 0.0823, + "GPQA": 0.2752, + "MUSR": 0.3502, + "MMLU-PRO": 0.238 + } + }, + { + "model_id": "MaziyarPanahi/Qwen2-7B-Instruct-v0.1", + "name": "Qwen2-7B-Instruct-v0.1", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.3352, + "BBH": 0.5123, + "MATH Level 5": 0.2213, + "GPQA": 0.2852, + "MUSR": 0.4435, + "MMLU-PRO": 0.3857 + } + }, + { + "model_id": "MaziyarPanahi/Qwen2-7B-Instruct-v0.8", + "name": "Qwen2-7B-Instruct-v0.8", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.2775, + "BBH": 0.4637, + "MATH Level 5": 0.1767, + "GPQA": 0.2936, + "MUSR": 0.4293, + "MMLU-PRO": 0.3566 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.1-llama3.1-70b", + "name": "calme-2.1-llama3.1-70b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.8434, + "BBH": 0.6448, + "MATH Level 5": 0.4101, + "GPQA": 0.328, + "MUSR": 0.438, + "MMLU-PRO": 0.5283 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.1-phi3-4b", + "name": "calme-2.1-phi3-4b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.5525, + "BBH": 0.5595, + "MATH Level 5": 0.1314, + "GPQA": 0.3297, + "MUSR": 0.4015, + "MMLU-PRO": 0.3746 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.1-phi3.5-4b", + "name": "calme-2.1-phi3.5-4b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.5659, + "BBH": 0.5484, + "MATH Level 5": 0.2039, + "GPQA": 0.344, + "MUSR": 0.3995, + "MMLU-PRO": 0.3935 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.1-qwen2-72b", + "name": "calme-2.1-qwen2-72b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.8163, + "BBH": 0.6966, + "MATH Level 5": 0.4079, + "GPQA": 0.3809, + "MUSR": 0.4732, + "MMLU-PRO": 0.5415 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.1-qwen2-7b", + "name": "calme-2.1-qwen2-7b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.3816, + "BBH": 0.5046, + "MATH Level 5": 0.2311, + "GPQA": 0.2894, + "MUSR": 0.4437, + "MMLU-PRO": 0.3693 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.1-qwen2.5-72b", + "name": "calme-2.1-qwen2.5-72b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.8662, + "BBH": 0.7262, + "MATH Level 5": 0.5914, + "GPQA": 0.3633, + "MUSR": 0.4298, + "MMLU-PRO": 0.5619 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.1-rys-78b", + "name": "calme-2.1-rys-78b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.8136, + "BBH": 0.7098, + "MATH Level 5": 0.3943, + "GPQA": 0.3943, + "MUSR": 0.4693, + "MMLU-PRO": 0.5444 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.2-llama3-70b", + "name": "calme-2.2-llama3-70b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.8208, + "BBH": 0.6435, + "MATH Level 5": 0.2394, + "GPQA": 0.3414, + "MUSR": 0.4446, + "MMLU-PRO": 0.5207 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.2-llama3.1-70b", + "name": "calme-2.2-llama3.1-70b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.8593, + "BBH": 0.6793, + "MATH Level 5": 0.4366, + "GPQA": 0.3247, + "MUSR": 0.4542, + "MMLU-PRO": 0.5415 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.2-phi3-4b", + "name": "calme-2.2-phi3-4b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.5069, + "BBH": 0.553, + "MATH Level 5": 0.145, + "GPQA": 0.3213, + "MUSR": 0.3976, + "MMLU-PRO": 0.3814 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.2-qwen2-72b", + "name": "calme-2.2-qwen2-72b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.8008, + "BBH": 0.694, + "MATH Level 5": 0.4532, + "GPQA": 0.3742, + "MUSR": 0.4508, + "MMLU-PRO": 0.5435 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.2-qwen2-7b", + "name": "calme-2.2-qwen2-7b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.3597, + "BBH": 0.5215, + "MATH Level 5": 0.2145, + "GPQA": 0.2911, + "MUSR": 0.4358, + "MMLU-PRO": 0.3899 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.2-qwen2.5-72b", + "name": "calme-2.2-qwen2.5-72b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.8477, + "BBH": 0.7276, + "MATH Level 5": 0.5891, + "GPQA": 0.3591, + "MUSR": 0.4207, + "MMLU-PRO": 0.5618 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.2-rys-78b", + "name": "calme-2.2-rys-78b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.7986, + "BBH": 0.7081, + "MATH Level 5": 0.4071, + "GPQA": 0.4069, + "MUSR": 0.4536, + "MMLU-PRO": 0.5386 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.3-llama3-70b", + "name": "calme-2.3-llama3-70b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.801, + "BBH": 0.6399, + "MATH Level 5": 0.2326, + "GPQA": 0.3381, + "MUSR": 0.4261, + "MMLU-PRO": 0.5204 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.3-llama3.1-70b", + "name": "calme-2.3-llama3.1-70b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.8605, + "BBH": 0.6872, + "MATH Level 5": 0.3927, + "GPQA": 0.344, + "MUSR": 0.4568, + "MMLU-PRO": 0.5363 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.3-phi3-4b", + "name": "calme-2.3-phi3-4b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.4926, + "BBH": 0.5538, + "MATH Level 5": 0.1473, + "GPQA": 0.318, + "MUSR": 0.3988, + "MMLU-PRO": 0.3828 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.3-qwen2-72b", + "name": "calme-2.3-qwen2-72b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.385, + "BBH": 0.6576, + "MATH Level 5": 0.3172, + "GPQA": 0.3716, + "MUSR": 0.4112, + "MMLU-PRO": 0.5419 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.3-qwen2-7b", + "name": "calme-2.3-qwen2-7b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.3825, + "BBH": 0.5064, + "MATH Level 5": 0.2069, + "GPQA": 0.297, + "MUSR": 0.4422, + "MMLU-PRO": 0.3611 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.3-rys-78b", + "name": "calme-2.3-rys-78b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.8066, + "BBH": 0.7108, + "MATH Level 5": 0.398, + "GPQA": 0.4044, + "MUSR": 0.4549, + "MMLU-PRO": 0.5475 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.4-llama3-70b", + "name": "calme-2.4-llama3-70b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.5027, + "BBH": 0.6418, + "MATH Level 5": 0.2447, + "GPQA": 0.3398, + "MUSR": 0.4288, + "MMLU-PRO": 0.5204 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.4-qwen2-7b", + "name": "calme-2.4-qwen2-7b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.33, + "BBH": 0.5101, + "MATH Level 5": 0.2032, + "GPQA": 0.2836, + "MUSR": 0.4453, + "MMLU-PRO": 0.3977 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.4-rys-78b", + "name": "calme-2.4-rys-78b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.8011, + "BBH": 0.728, + "MATH Level 5": 0.4071, + "GPQA": 0.4027, + "MUSR": 0.5771, + "MMLU-PRO": 0.7002 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.5-qwen2-7b", + "name": "calme-2.5-qwen2-7b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.3145, + "BBH": 0.4887, + "MATH Level 5": 0.2258, + "GPQA": 0.3104, + "MUSR": 0.4565, + "MMLU-PRO": 0.3682 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.6-qwen2-7b", + "name": "calme-2.6-qwen2-7b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.3443, + "BBH": 0.493, + "MATH Level 5": 0.1216, + "GPQA": 0.2844, + "MUSR": 0.4586, + "MMLU-PRO": 0.3732 + } + }, + { + "model_id": "MaziyarPanahi/calme-2.7-qwen2-7b", + "name": "calme-2.7-qwen2-7b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.3592, + "BBH": 0.4883, + "MATH Level 5": 0.1382, + "GPQA": 0.2911, + "MUSR": 0.4824, + "MMLU-PRO": 0.3705 + } + }, + { + "model_id": "MaziyarPanahi/calme-3.1-baguette-3b", + "name": "calme-3.1-baguette-3b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.6234, + "BBH": 0.4683, + "MATH Level 5": 0.256, + "GPQA": 0.2861, + "MUSR": 0.4008, + "MMLU-PRO": 0.3399 + } + }, + { + "model_id": "MaziyarPanahi/calme-3.1-instruct-3b", + "name": "calme-3.1-instruct-3b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.4336, + "BBH": 0.4813, + "MATH Level 5": 0.1775, + "GPQA": 0.2861, + "MUSR": 0.3952, + "MMLU-PRO": 0.3557 + } + }, + { + "model_id": "MaziyarPanahi/calme-3.1-instruct-78b", + "name": "calme-3.1-instruct-78b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.8136, + "BBH": 0.7305, + "MATH Level 5": 0.3927, + "GPQA": 0.396, + "MUSR": 0.5891, + "MMLU-PRO": 0.7185 + } + }, + { + "model_id": "MaziyarPanahi/calme-3.1-llamaloi-3b", + "name": "calme-3.1-llamaloi-3b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.7375, + "BBH": 0.4587, + "MATH Level 5": 0.173, + "GPQA": 0.281, + "MUSR": 0.3515, + "MMLU-PRO": 0.3205 + } + }, + { + "model_id": "MaziyarPanahi/calme-3.2-baguette-3b", + "name": "calme-3.2-baguette-3b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.6338, + "BBH": 0.4709, + "MATH Level 5": 0.2825, + "GPQA": 0.2945, + "MUSR": 0.4021, + "MMLU-PRO": 0.3338 + } + }, + { + "model_id": "MaziyarPanahi/calme-3.2-instruct-3b", + "name": "calme-3.2-instruct-3b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.5533, + "BBH": 0.4866, + "MATH Level 5": 0.2168, + "GPQA": 0.2836, + "MUSR": 0.4047, + "MMLU-PRO": 0.3653 + } + }, + { + "model_id": "MaziyarPanahi/calme-3.2-instruct-78b", + "name": "calme-3.2-instruct-78b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.8063, + "BBH": 0.7319, + "MATH Level 5": 0.4033, + "GPQA": 0.4027, + "MUSR": 0.6024, + "MMLU-PRO": 0.7303 + } + }, + { + "model_id": "MaziyarPanahi/calme-3.3-baguette-3b", + "name": "calme-3.3-baguette-3b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.636, + "BBH": 0.4678, + "MATH Level 5": 0.3807, + "GPQA": 0.2802, + "MUSR": 0.3928, + "MMLU-PRO": 0.3342 + } + }, + { + "model_id": "MaziyarPanahi/calme-3.3-instruct-3b", + "name": "calme-3.3-instruct-3b", + "developer": "MaziyarPanahi", + "scores": { + "IFEval": 0.6423, + "BBH": 0.4693, + "MATH Level 5": 0.3739, + "GPQA": 0.2827, + "MUSR": 0.4074, + "MMLU-PRO": 0.3305 + } + }, + { + "model_id": "Minami-su/Amara-o1-7B-Qwen", + "name": "Amara-o1-7B-Qwen", + "developer": "Minami-su", + "scores": { + "IFEval": 0.739, + "BBH": 0.5199, + "MATH Level 5": 0.5181, + "GPQA": 0.2936, + "MUSR": 0.4007, + "MMLU-PRO": 0.4083 + } + }, + { + "model_id": "Minami-su/Amara-o2-7B-Qwen", + "name": "Amara-o2-7B-Qwen", + "developer": "Minami-su", + "scores": { + "IFEval": 0.7147, + "BBH": 0.5173, + "MATH Level 5": 0.4086, + "GPQA": 0.2634, + "MUSR": 0.3781, + "MMLU-PRO": 0.4165 + } + }, + { + "model_id": "Minami-su/test-7B-00", + "name": "test-7B-00", + "developer": "Minami-su", + "scores": { + "IFEval": 0.669, + "BBH": 0.4466, + "MATH Level 5": 0.4517, + "GPQA": 0.3029, + "MUSR": 0.4126, + "MMLU-PRO": 0.3588 + } + }, + { + "model_id": "Minami-su/test-7B-01", + "name": "test-7B-01", + "developer": "Minami-su", + "scores": { + "IFEval": 0.6736, + "BBH": 0.4422, + "MATH Level 5": 0.4554, + "GPQA": 0.307, + "MUSR": 0.4153, + "MMLU-PRO": 0.3536 + } + }, + { + "model_id": "Minami-su/test-v2-7B-00", + "name": "test-v2-7B-00", + "developer": "Minami-su", + "scores": { + "IFEval": 0.6747, + "BBH": 0.4416, + "MATH Level 5": 0.4418, + "GPQA": 0.2919, + "MUSR": 0.4154, + "MMLU-PRO": 0.3472 + } + }, + { + "model_id": "ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1", + "name": "Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1", + "developer": "ModelCloud", + "scores": { + "IFEval": 0.5269, + "BBH": 0.3253, + "MATH Level 5": 0.0604, + "GPQA": 0.2534, + "MUSR": 0.3249, + "MMLU-PRO": 0.1764 + } + }, + { + "model_id": "ModelSpace/GemmaX2-28-9B-v0.1", + "name": "GemmaX2-28-9B-v0.1", + "developer": "ModelSpace", + "scores": { + "IFEval": 0.0039, + "BBH": 0.3687, + "MATH Level 5": 0.0272, + "GPQA": 0.2768, + "MUSR": 0.3537, + "MMLU-PRO": 0.2231 + } + }, + { + "model_id": "MoonRide/Llama-3.2-3B-Khelavaster", + "name": "Llama-3.2-3B-Khelavaster", + "developer": "MoonRide", + "scores": { + "IFEval": 0.4925, + "BBH": 0.4516, + "MATH Level 5": 0.1616, + "GPQA": 0.2777, + "MUSR": 0.3699, + "MMLU-PRO": 0.3122 + } + }, + { + "model_id": "Mostafa8Mehrabi/llama-3.2-1b-Insomnia-ChatBot-merged", + "name": "llama-3.2-1b-Insomnia-ChatBot-merged", + "developer": "Mostafa8Mehrabi", + "scores": { + "IFEval": 0.1321, + "BBH": 0.3004, + "MATH Level 5": 0.0076, + "GPQA": 0.2366, + "MUSR": 0.3382, + "MMLU-PRO": 0.1131 + } + }, + { + "model_id": "MrRobotoAI/MrRoboto-ProLong-8b-v4i", + "name": "MrRoboto-ProLong-8b-v4i", + "developer": "MrRobotoAI", + "scores": { + "IFEval": 0.3835, + "BBH": 0.4585, + "MATH Level 5": 0.0551, + "GPQA": 0.2894, + "MUSR": 0.4014, + "MMLU-PRO": 0.3068 + } + }, + { + "model_id": "MrRobotoAI/MrRoboto-ProLongBASE-pt8-unaligned-8b", + "name": "MrRoboto-ProLongBASE-pt8-unaligned-8b", + "developer": "MrRobotoAI", + "scores": { + "IFEval": 0.3475, + "BBH": 0.4515, + "MATH Level 5": 0.0423, + "GPQA": 0.281, + "MUSR": 0.4279, + "MMLU-PRO": 0.2566 + } + }, + { + "model_id": "MultivexAI/Gladiator-Mini-Exp-1211-3B", + "name": "Gladiator-Mini-Exp-1211-3B", + "developer": "MultivexAI", + "scores": { + "IFEval": 0.6876, + "BBH": 0.4484, + "MATH Level 5": 0.1375, + "GPQA": 0.2727, + "MUSR": 0.326, + "MMLU-PRO": 0.3152 + } + }, + { + "model_id": "MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct", + "name": "Gladiator-Mini-Exp-1221-3B-Instruct", + "developer": "MultivexAI", + "scores": { + "IFEval": 0.6079, + "BBH": 0.437, + "MATH Level 5": 0.1352, + "GPQA": 0.2634, + "MUSR": 0.3115, + "MMLU-PRO": 0.3049 + } + }, + { + "model_id": "MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct-V2", + "name": "Gladiator-Mini-Exp-1221-3B-Instruct-V2", + "developer": "MultivexAI", + "scores": { + "IFEval": 0.6215, + "BBH": 0.4389, + "MATH Level 5": 0.1412, + "GPQA": 0.2634, + "MUSR": 0.3008, + "MMLU-PRO": 0.3025 + } + }, + { + "model_id": "MultivexAI/Gladiator-Mini-Exp-1222-3B-Instruct", + "name": "Gladiator-Mini-Exp-1222-3B-Instruct", + "developer": "MultivexAI", + "scores": { + "IFEval": 0.6163, + "BBH": 0.4373, + "MATH Level 5": 0.1412, + "GPQA": 0.2634, + "MUSR": 0.3128, + "MMLU-PRO": 0.3017 + } + }, + { + "model_id": "MultivexAI/Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF", + "name": "Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF", + "developer": "MultivexAI", + "scores": { + "IFEval": 0.144, + "BBH": 0.2908, + "MATH Level 5": 0.006, + "GPQA": 0.255, + "MUSR": 0.3642, + "MMLU-PRO": 0.1109 + } + }, + { + "model_id": "Mxode/NanoLM-0.3B-Instruct-v1", + "name": "NanoLM-0.3B-Instruct-v1", + "developer": "Mxode", + "scores": { + "IFEval": 0.1537, + "BBH": 0.3028, + "MATH Level 5": 0.0144, + "GPQA": 0.2718, + "MUSR": 0.4155, + "MMLU-PRO": 0.1105 + } + }, + { + "model_id": "Mxode/NanoLM-0.3B-Instruct-v1.1", + "name": "NanoLM-0.3B-Instruct-v1.1", + "developer": "Mxode", + "scores": { + "IFEval": 0.1783, + "BBH": 0.3014, + "MATH Level 5": 0.0136, + "GPQA": 0.25, + "MUSR": 0.4273, + "MMLU-PRO": 0.1121 + } + }, + { + "model_id": "Mxode/NanoLM-0.3B-Instruct-v2", + "name": "NanoLM-0.3B-Instruct-v2", + "developer": "Mxode", + "scores": { + "IFEval": 0.1668, + "BBH": 0.2921, + "MATH Level 5": 0.0068, + "GPQA": 0.2609, + "MUSR": 0.3955, + "MMLU-PRO": 0.1134 + } + }, + { + "model_id": "Mxode/NanoLM-1B-Instruct-v1.1", + "name": "NanoLM-1B-Instruct-v1.1", + "developer": "Mxode", + "scores": { + "IFEval": 0.2395, + "BBH": 0.3184, + "MATH Level 5": 0.0363, + "GPQA": 0.2634, + "MUSR": 0.3433, + "MMLU-PRO": 0.1215 + } + }, + { + "model_id": "Mxode/NanoLM-1B-Instruct-v2", + "name": "NanoLM-1B-Instruct-v2", + "developer": "Mxode", + "scores": { + "IFEval": 0.263, + "BBH": 0.3123, + "MATH Level 5": 0.0415, + "GPQA": 0.2634, + "MUSR": 0.3552, + "MMLU-PRO": 0.1238 + } + }, + { + "model_id": "NAPS-ai/naps-gemma-2-27b-v-0.1.0", + "name": "naps-gemma-2-27b-v-0.1.0", + "developer": "NAPS-ai", + "scores": { + "IFEval": 0.0, + "BBH": 0.2912, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.3575, + "MMLU-PRO": 0.1168 + } + }, + { + "model_id": "NAPS-ai/naps-gemma-2-27b-v0.1.0", + "name": "naps-gemma-2-27b-v0.1.0", + "developer": "NAPS-ai", + "scores": { + "IFEval": 0.0, + "BBH": 0.2912, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.3575, + "MMLU-PRO": 0.1168 + } + }, + { + "model_id": "NAPS-ai/naps-llama-3_1-8b-instruct-v0.3", + "name": "naps-llama-3_1-8b-instruct-v0.3", + "developer": "NAPS-ai", + "scores": { + "IFEval": 0.5391, + "BBH": 0.4901, + "MATH Level 5": 0.1903, + "GPQA": 0.2995, + "MUSR": 0.3787, + "MMLU-PRO": 0.3398 + } + }, + { + "model_id": "NAPS-ai/naps-llama-3_1-8b-instruct-v0.4", + "name": "naps-llama-3_1-8b-instruct-v0.4", + "developer": "NAPS-ai", + "scores": { + "IFEval": 0.7344, + "BBH": 0.4862, + "MATH Level 5": 0.1964, + "GPQA": 0.2794, + "MUSR": 0.4421, + "MMLU-PRO": 0.3475 + } + }, + { + "model_id": "NAPS-ai/naps-llama-3_1-instruct-v0.5.0", + "name": "naps-llama-3_1-instruct-v0.5.0", + "developer": "NAPS-ai", + "scores": { + "IFEval": 0.502, + "BBH": 0.4148, + "MATH Level 5": 0.0363, + "GPQA": 0.2685, + "MUSR": 0.3713, + "MMLU-PRO": 0.2614 + } + }, + { + "model_id": "NAPS-ai/naps-llama-3_1_instruct-v0.6.0", + "name": "naps-llama-3_1_instruct-v0.6.0", + "developer": "NAPS-ai", + "scores": { + "IFEval": 0.328, + "BBH": 0.4528, + "MATH Level 5": 0.0642, + "GPQA": 0.2819, + "MUSR": 0.3739, + "MMLU-PRO": 0.3241 + } + }, + { + "model_id": "NAPS-ai/naps-llama3.1-70B-v0.2-fp16", + "name": "naps-llama3.1-70B-v0.2-fp16", + "developer": "NAPS-ai", + "scores": { + "IFEval": 0.1845, + "BBH": 0.3041, + "MATH Level 5": 0.0, + "GPQA": 0.2391, + "MUSR": 0.3486, + "MMLU-PRO": 0.1099 + } + }, + { + "model_id": "NCSOFT/Llama-VARCO-8B-Instruct", + "name": "Llama-VARCO-8B-Instruct", + "developer": "NCSOFT", + "scores": { + "IFEval": 0.447, + "BBH": 0.5023, + "MATH Level 5": 0.1065, + "GPQA": 0.297, + "MUSR": 0.3841, + "MMLU-PRO": 0.319 + } + }, + { + "model_id": "NJS26/NJS_777", + "name": "NJS_777", + "developer": "NJS26", + "scores": { + "IFEval": 0.1881, + "BBH": 0.2178, + "MATH Level 5": 0.0, + "GPQA": 0.2064, + "MUSR": 0.3538, + "MMLU-PRO": 0.1163 + } + }, + { + "model_id": "NLPark/AnFeng_v3.1-Avocet", + "name": "AnFeng_v3.1-Avocet", + "developer": "NLPark", + "scores": { + "IFEval": 0.5096, + "BBH": 0.5829, + "MATH Level 5": 0.1594, + "GPQA": 0.3247, + "MUSR": 0.4476, + "MMLU-PRO": 0.4438 + } + }, + { + "model_id": "NLPark/B-and-W_Flycatcher-3AD1E", + "name": "B-and-W_Flycatcher-3AD1E", + "developer": "NLPark", + "scores": { + "IFEval": 0.4908, + "BBH": 0.6065, + "MATH Level 5": 0.2379, + "GPQA": 0.3305, + "MUSR": 0.4423, + "MMLU-PRO": 0.4741 + } + }, + { + "model_id": "NLPark/Shi-Ci-Robin-Test_3AD80", + "name": "Shi-Ci-Robin-Test_3AD80", + "developer": "NLPark", + "scores": { + "IFEval": 0.7227, + "BBH": 0.6705, + "MATH Level 5": 0.3157, + "GPQA": 0.3599, + "MUSR": 0.4696, + "MMLU-PRO": 0.5121 + } + }, + { + "model_id": "NTQAI/NxMobileLM-1.5B-SFT", + "name": "NxMobileLM-1.5B-SFT", + "developer": "NTQAI", + "scores": { + "IFEval": 0.6392, + "BBH": 0.3957, + "MATH Level 5": 0.0846, + "GPQA": 0.2592, + "MUSR": 0.3555, + "MMLU-PRO": 0.2817 + } + }, + { + "model_id": "NTQAI/Nxcode-CQ-7B-orpo", + "name": "Nxcode-CQ-7B-orpo", + "developer": "NTQAI", + "scores": { + "IFEval": 0.4007, + "BBH": 0.4143, + "MATH Level 5": 0.0219, + "GPQA": 0.2542, + "MUSR": 0.394, + "MMLU-PRO": 0.1612 + } + }, + { + "model_id": "NYTK/PULI-GPTrio", + "name": "PULI-GPTrio", + "developer": "NYTK", + "scores": { + "IFEval": 0.218, + "BBH": 0.306, + "MATH Level 5": 0.0121, + "GPQA": 0.2659, + "MUSR": 0.3819, + "MMLU-PRO": 0.1137 + } + }, + { + "model_id": "NYTK/PULI-LlumiX-32K", + "name": "PULI-LlumiX-32K", + "developer": "NYTK", + "scores": { + "IFEval": 0.17, + "BBH": 0.3189, + "MATH Level 5": 0.0128, + "GPQA": 0.2534, + "MUSR": 0.3964, + "MMLU-PRO": 0.1681 + } + }, + { + "model_id": "Naveenpoliasetty/llama3-8B-V2", + "name": "llama3-8B-V2", + "developer": "Naveenpoliasetty", + "scores": { + "IFEval": 0.4123, + "BBH": 0.5189, + "MATH Level 5": 0.0785, + "GPQA": 0.2903, + "MUSR": 0.4081, + "MMLU-PRO": 0.3738 + } + }, + { + "model_id": "NbAiLab/nb-llama-3.1-8B-Instruct", + "name": "nb-llama-3.1-8B-Instruct", + "developer": "NbAiLab", + "scores": { + "IFEval": 0.3625, + "BBH": 0.3247, + "MATH Level 5": 0.0227, + "GPQA": 0.2735, + "MUSR": 0.3208, + "MMLU-PRO": 0.1197 + } + }, + { + "model_id": "NbAiLab/nb-llama-3.1-8B-sft", + "name": "nb-llama-3.1-8B-sft", + "developer": "NbAiLab", + "scores": { + "IFEval": 0.3616, + "BBH": 0.3282, + "MATH Level 5": 0.0219, + "GPQA": 0.2542, + "MUSR": 0.3287, + "MMLU-PRO": 0.1222 + } + }, + { + "model_id": "Nekochu/Llama-3.1-8B-German-ORPO", + "name": "Llama-3.1-8B-German-ORPO", + "developer": "Nekochu", + "scores": { + "IFEval": 0.4611, + "BBH": 0.4983, + "MATH Level 5": 0.1171, + "GPQA": 0.3163, + "MUSR": 0.4647, + "MMLU-PRO": 0.3393 + } + }, + { + "model_id": "Nekochu/Llama-3.1-8B-french-DPO", + "name": "Llama-3.1-8B-french-DPO", + "developer": "Nekochu", + "scores": { + "IFEval": 0.4656, + "BBH": 0.5111, + "MATH Level 5": 0.0974, + "GPQA": 0.2911, + "MUSR": 0.4216, + "MMLU-PRO": 0.3414 + } + }, + { + "model_id": "Nekochu/Luminia-13B-v3", + "name": "Luminia-13B-v3", + "developer": "Nekochu", + "scores": { + "IFEval": 0.2523, + "BBH": 0.4112, + "MATH Level 5": 0.0181, + "GPQA": 0.2701, + "MUSR": 0.3983, + "MMLU-PRO": 0.2215 + } + }, + { + "model_id": "Nekochu/Luminia-8B-RP", + "name": "Luminia-8B-RP", + "developer": "Nekochu", + "scores": { + "IFEval": 0.5574, + "BBH": 0.5218, + "MATH Level 5": 0.136, + "GPQA": 0.297, + "MUSR": 0.3998, + "MMLU-PRO": 0.3631 + } + }, + { + "model_id": "NeverSleep/Lumimaid-v0.2-12B", + "name": "Lumimaid-v0.2-12B", + "developer": "NeverSleep", + "scores": { + "IFEval": 0.1099, + "BBH": 0.5396, + "MATH Level 5": 0.0566, + "GPQA": 0.3146, + "MUSR": 0.4821, + "MMLU-PRO": 0.3511 + } + }, + { + "model_id": "NeverSleep/Lumimaid-v0.2-8B", + "name": "Lumimaid-v0.2-8B", + "developer": "NeverSleep", + "scores": { + "IFEval": 0.5038, + "BBH": 0.5238, + "MATH Level 5": 0.1435, + "GPQA": 0.3112, + "MUSR": 0.4303, + "MMLU-PRO": 0.3636 + } + }, + { + "model_id": "Nexesenex/Dolphin3.0-Llama3.1-1B-abliterated", + "name": "Dolphin3.0-Llama3.1-1B-abliterated", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.5312, + "BBH": 0.3241, + "MATH Level 5": 0.0385, + "GPQA": 0.2408, + "MUSR": 0.3237, + "MMLU-PRO": 0.1373 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_DeepDive_3_Prev_v1.0", + "name": "Llama_3.1_8b_DeepDive_3_Prev_v1.0", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.6809, + "BBH": 0.5155, + "MATH Level 5": 0.1866, + "GPQA": 0.2911, + "MUSR": 0.3666, + "MMLU-PRO": 0.3438 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0", + "name": "Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.7101, + "BBH": 0.512, + "MATH Level 5": 0.1926, + "GPQA": 0.3003, + "MUSR": 0.3758, + "MMLU-PRO": 0.3441 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_DobHerWild_R1_v1.1R", + "name": "Llama_3.1_8b_DobHerWild_R1_v1.1R", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.76, + "BBH": 0.5257, + "MATH Level 5": 0.2319, + "GPQA": 0.2995, + "MUSR": 0.3852, + "MMLU-PRO": 0.3688 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_DoberWild_v2.01", + "name": "Llama_3.1_8b_DoberWild_v2.01", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.7996, + "BBH": 0.5251, + "MATH Level 5": 0.2002, + "GPQA": 0.3029, + "MUSR": 0.4012, + "MMLU-PRO": 0.3791 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_DoberWild_v2.02", + "name": "Llama_3.1_8b_DoberWild_v2.02", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.7746, + "BBH": 0.5313, + "MATH Level 5": 0.1994, + "GPQA": 0.2945, + "MUSR": 0.3946, + "MMLU-PRO": 0.3764 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_DoberWild_v2.03", + "name": "Llama_3.1_8b_DoberWild_v2.03", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.7764, + "BBH": 0.5294, + "MATH Level 5": 0.2077, + "GPQA": 0.3045, + "MUSR": 0.3906, + "MMLU-PRO": 0.3722 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.01", + "name": "Llama_3.1_8b_DodoWild_v2.01", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.7978, + "BBH": 0.5253, + "MATH Level 5": 0.1986, + "GPQA": 0.3037, + "MUSR": 0.409, + "MMLU-PRO": 0.3738 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.02", + "name": "Llama_3.1_8b_DodoWild_v2.02", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.8017, + "BBH": 0.5262, + "MATH Level 5": 0.2273, + "GPQA": 0.3045, + "MUSR": 0.3971, + "MMLU-PRO": 0.3761 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.03", + "name": "Llama_3.1_8b_DodoWild_v2.03", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.7941, + "BBH": 0.5308, + "MATH Level 5": 0.2221, + "GPQA": 0.3079, + "MUSR": 0.3959, + "MMLU-PRO": 0.3786 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.10", + "name": "Llama_3.1_8b_DodoWild_v2.10", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.8054, + "BBH": 0.5278, + "MATH Level 5": 0.1971, + "GPQA": 0.2961, + "MUSR": 0.4157, + "MMLU-PRO": 0.3855 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.01", + "name": "Llama_3.1_8b_Dolermed_R1_V1.01", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.7534, + "BBH": 0.5312, + "MATH Level 5": 0.2017, + "GPQA": 0.3054, + "MUSR": 0.3747, + "MMLU-PRO": 0.3733 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.03", + "name": "Llama_3.1_8b_Dolermed_R1_V1.03", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.7564, + "BBH": 0.5316, + "MATH Level 5": 0.2092, + "GPQA": 0.318, + "MUSR": 0.38, + "MMLU-PRO": 0.372 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_Dolermed_V1.01", + "name": "Llama_3.1_8b_Dolermed_V1.01", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.5087, + "BBH": 0.5194, + "MATH Level 5": 0.1344, + "GPQA": 0.2945, + "MUSR": 0.3945, + "MMLU-PRO": 0.357 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_Dolerstormed_V1.04", + "name": "Llama_3.1_8b_Dolerstormed_V1.04", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.7889, + "BBH": 0.5195, + "MATH Level 5": 0.1926, + "GPQA": 0.3221, + "MUSR": 0.403, + "MMLU-PRO": 0.3889 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_Hermedash_R1_V1.04", + "name": "Llama_3.1_8b_Hermedash_R1_V1.04", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.7872, + "BBH": 0.5192, + "MATH Level 5": 0.1866, + "GPQA": 0.323, + "MUSR": 0.4111, + "MMLU-PRO": 0.3882 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.01", + "name": "Llama_3.1_8b_Hermedive_R1_V1.01", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.5001, + "BBH": 0.5171, + "MATH Level 5": 0.1775, + "GPQA": 0.2827, + "MUSR": 0.4008, + "MMLU-PRO": 0.3427 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.03", + "name": "Llama_3.1_8b_Hermedive_R1_V1.03", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.6648, + "BBH": 0.5141, + "MATH Level 5": 0.1858, + "GPQA": 0.2978, + "MUSR": 0.3613, + "MMLU-PRO": 0.3488 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_Hermedive_V1.01", + "name": "Llama_3.1_8b_Hermedive_V1.01", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.5062, + "BBH": 0.4918, + "MATH Level 5": 0.1647, + "GPQA": 0.2894, + "MUSR": 0.3697, + "MMLU-PRO": 0.3551 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_Mediver_V1.01", + "name": "Llama_3.1_8b_Mediver_V1.01", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.1885, + "BBH": 0.4415, + "MATH Level 5": 0.0015, + "GPQA": 0.2777, + "MUSR": 0.3898, + "MMLU-PRO": 0.2994 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_Medusa_v1.01", + "name": "Llama_3.1_8b_Medusa_v1.01", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.7685, + "BBH": 0.5018, + "MATH Level 5": 0.1465, + "GPQA": 0.2919, + "MUSR": 0.4067, + "MMLU-PRO": 0.3531 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_Smarteaz_0.2_R1", + "name": "Llama_3.1_8b_Smarteaz_0.2_R1", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.6346, + "BBH": 0.5113, + "MATH Level 5": 0.2606, + "GPQA": 0.3003, + "MUSR": 0.4188, + "MMLU-PRO": 0.3645 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_Smarteaz_V1.01", + "name": "Llama_3.1_8b_Smarteaz_V1.01", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.8151, + "BBH": 0.5241, + "MATH Level 5": 0.2341, + "GPQA": 0.3096, + "MUSR": 0.3789, + "MMLU-PRO": 0.3736 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_Stormeder_v1.04", + "name": "Llama_3.1_8b_Stormeder_v1.04", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.7853, + "BBH": 0.5207, + "MATH Level 5": 0.185, + "GPQA": 0.3205, + "MUSR": 0.3949, + "MMLU-PRO": 0.3852 + } + }, + { + "model_id": "Nexesenex/Llama_3.1_8b_Typhoon_v1.03", + "name": "Llama_3.1_8b_Typhoon_v1.03", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.8078, + "BBH": 0.5314, + "MATH Level 5": 0.2273, + "GPQA": 0.307, + "MUSR": 0.3815, + "MMLU-PRO": 0.3842 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_1b_AquaSyn_0.1", + "name": "Llama_3.2_1b_AquaSyn_0.1", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.2741, + "BBH": 0.3284, + "MATH Level 5": 0.0219, + "GPQA": 0.2483, + "MUSR": 0.346, + "MMLU-PRO": 0.1378 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_1b_AquaSyn_0.11", + "name": "Llama_3.2_1b_AquaSyn_0.11", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.2431, + "BBH": 0.3112, + "MATH Level 5": 0.0234, + "GPQA": 0.2651, + "MUSR": 0.3368, + "MMLU-PRO": 0.1116 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_1b_Dolto_0.1", + "name": "Llama_3.2_1b_Dolto_0.1", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.5434, + "BBH": 0.335, + "MATH Level 5": 0.037, + "GPQA": 0.2374, + "MUSR": 0.3421, + "MMLU-PRO": 0.1364 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_1b_Odyssea_V1", + "name": "Llama_3.2_1b_Odyssea_V1", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.2553, + "BBH": 0.301, + "MATH Level 5": 0.0144, + "GPQA": 0.2584, + "MUSR": 0.3394, + "MMLU-PRO": 0.1153 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_1b_Odyssea_V1.01", + "name": "Llama_3.2_1b_Odyssea_V1.01", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.2495, + "BBH": 0.3045, + "MATH Level 5": 0.0174, + "GPQA": 0.2559, + "MUSR": 0.342, + "MMLU-PRO": 0.1152 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_1b_OpenTree_R1_0.1", + "name": "Llama_3.2_1b_OpenTree_R1_0.1", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.5366, + "BBH": 0.328, + "MATH Level 5": 0.0476, + "GPQA": 0.2525, + "MUSR": 0.3131, + "MMLU-PRO": 0.1675 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_1b_OrcaSun_V1", + "name": "Llama_3.2_1b_OrcaSun_V1", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.5949, + "BBH": 0.355, + "MATH Level 5": 0.0597, + "GPQA": 0.2366, + "MUSR": 0.338, + "MMLU-PRO": 0.1904 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_1b_RandomLego_RP_R1_0.1", + "name": "Llama_3.2_1b_RandomLego_RP_R1_0.1", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.5543, + "BBH": 0.3428, + "MATH Level 5": 0.0566, + "GPQA": 0.25, + "MUSR": 0.3249, + "MMLU-PRO": 0.1563 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_1b_SunOrca_V1", + "name": "Llama_3.2_1b_SunOrca_V1", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.543, + "BBH": 0.3431, + "MATH Level 5": 0.0672, + "GPQA": 0.2743, + "MUSR": 0.3262, + "MMLU-PRO": 0.1884 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_1b_Sydonia_0.1", + "name": "Llama_3.2_1b_Sydonia_0.1", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.2197, + "BBH": 0.3121, + "MATH Level 5": 0.0204, + "GPQA": 0.2282, + "MUSR": 0.3382, + "MMLU-PRO": 0.1224 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_1b_Syneridol_0.2", + "name": "Llama_3.2_1b_Syneridol_0.2", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.2157, + "BBH": 0.3139, + "MATH Level 5": 0.0219, + "GPQA": 0.2349, + "MUSR": 0.3343, + "MMLU-PRO": 0.1227 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_1b_Synopsys_0.1", + "name": "Llama_3.2_1b_Synopsys_0.1", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.1764, + "BBH": 0.3162, + "MATH Level 5": 0.0166, + "GPQA": 0.2391, + "MUSR": 0.3461, + "MMLU-PRO": 0.1231 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_1b_Synopsys_0.11", + "name": "Llama_3.2_1b_Synopsys_0.11", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.2842, + "BBH": 0.3102, + "MATH Level 5": 0.0128, + "GPQA": 0.2626, + "MUSR": 0.3513, + "MMLU-PRO": 0.1123 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_3b_Kermes_v1", + "name": "Llama_3.2_3b_Kermes_v1", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.4852, + "BBH": 0.441, + "MATH Level 5": 0.031, + "GPQA": 0.2735, + "MUSR": 0.407, + "MMLU-PRO": 0.2547 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_3b_Kermes_v2", + "name": "Llama_3.2_3b_Kermes_v2", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.5754, + "BBH": 0.4455, + "MATH Level 5": 0.0544, + "GPQA": 0.2651, + "MUSR": 0.3778, + "MMLU-PRO": 0.2734 + } + }, + { + "model_id": "Nexesenex/Llama_3.2_3b_Kermes_v2.1", + "name": "Llama_3.2_3b_Kermes_v2.1", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.5584, + "BBH": 0.4464, + "MATH Level 5": 0.0521, + "GPQA": 0.2794, + "MUSR": 0.3964, + "MMLU-PRO": 0.2692 + } + }, + { + "model_id": "Nexesenex/Nemotron_W_4b_Halo_0.1", + "name": "Nemotron_W_4b_Halo_0.1", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.3627, + "BBH": 0.4135, + "MATH Level 5": 0.0423, + "GPQA": 0.2802, + "MUSR": 0.4165, + "MMLU-PRO": 0.2505 + } + }, + { + "model_id": "Nexesenex/Nemotron_W_4b_MagLight_0.1", + "name": "Nemotron_W_4b_MagLight_0.1", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.423, + "BBH": 0.4231, + "MATH Level 5": 0.04, + "GPQA": 0.2836, + "MUSR": 0.4112, + "MMLU-PRO": 0.2545 + } + }, + { + "model_id": "Nexesenex/Qwen_2.5_3b_Smarteaz_0.01a", + "name": "Qwen_2.5_3b_Smarteaz_0.01a", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.4012, + "BBH": 0.4637, + "MATH Level 5": 0.1805, + "GPQA": 0.2777, + "MUSR": 0.432, + "MMLU-PRO": 0.286 + } + }, + { + "model_id": "Nexesenex/pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL", + "name": "pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL", + "developer": "Nexesenex", + "scores": { + "IFEval": 0.589, + "BBH": 0.3562, + "MATH Level 5": 0.0748, + "GPQA": 0.2668, + "MUSR": 0.3396, + "MMLU-PRO": 0.1803 + } + }, + { + "model_id": "Nexusflow/NexusRaven-V2-13B", + "name": "NexusRaven-V2-13B", + "developer": "Nexusflow", + "scores": { + "IFEval": 0.1791, + "BBH": 0.3949, + "MATH Level 5": 0.0295, + "GPQA": 0.2601, + "MUSR": 0.3737, + "MMLU-PRO": 0.1872 + } + }, + { + "model_id": "NikolaSigmoid/AceMath-1.5B-Instruct-1epoch", + "name": "AceMath-1.5B-Instruct-1epoch", + "developer": "NikolaSigmoid", + "scores": { + "IFEval": 0.2849, + "BBH": 0.4263, + "MATH Level 5": 0.3051, + "GPQA": 0.2777, + "MUSR": 0.3925, + "MMLU-PRO": 0.2376 + } + }, + { + "model_id": "NikolaSigmoid/AceMath-1.5B-Instruct-dolphin-r1-200", + "name": "AceMath-1.5B-Instruct-dolphin-r1-200", + "developer": "NikolaSigmoid", + "scores": { + "IFEval": 0.1808, + "BBH": 0.2815, + "MATH Level 5": 0.0, + "GPQA": 0.2559, + "MUSR": 0.375, + "MMLU-PRO": 0.1143 + } + }, + { + "model_id": "NikolaSigmoid/DeepSeek-R1-Distill-Qwen-1.5B-500", + "name": "DeepSeek-R1-Distill-Qwen-1.5B-500", + "developer": "NikolaSigmoid", + "scores": { + "IFEval": 0.1749, + "BBH": 0.2602, + "MATH Level 5": 0.0, + "GPQA": 0.2458, + "MUSR": 0.338, + "MMLU-PRO": 0.1125 + } + }, + { + "model_id": "NikolaSigmoid/acemath-200", + "name": "acemath-200", + "developer": "NikolaSigmoid", + "scores": { + "IFEval": 0.2849, + "BBH": 0.4263, + "MATH Level 5": 0.3051, + "GPQA": 0.2777, + "MUSR": 0.3925, + "MMLU-PRO": 0.2376 + } + }, + { + "model_id": "NikolaSigmoid/phi-4-14b", + "name": "phi-4-14b", + "developer": "NikolaSigmoid", + "scores": { + "IFEval": 0.0561, + "BBH": 0.6695, + "MATH Level 5": 0.2938, + "GPQA": 0.4035, + "MUSR": 0.5047, + "MMLU-PRO": 0.5278 + } + }, + { + "model_id": "NikolaSigmoid/phi-4-1steps", + "name": "phi-4-1steps", + "developer": "NikolaSigmoid", + "scores": { + "IFEval": 0.0528, + "BBH": 0.6707, + "MATH Level 5": 0.2983, + "GPQA": 0.4018, + "MUSR": 0.5021, + "MMLU-PRO": 0.5273 + } + }, + { + "model_id": "NikolaSigmoid/phi-4-300steps", + "name": "phi-4-300steps", + "developer": "NikolaSigmoid", + "scores": { + "IFEval": 0.0561, + "BBH": 0.6701, + "MATH Level 5": 0.2946, + "GPQA": 0.4052, + "MUSR": 0.5034, + "MMLU-PRO": 0.5288 + } + }, + { + "model_id": "Nitral-AI/Captain-Eris-BMO_Violent-GRPO-v0.420", + "name": "Captain-Eris-BMO_Violent-GRPO-v0.420", + "developer": "Nitral-AI", + "scores": { + "IFEval": 0.6313, + "BBH": 0.5079, + "MATH Level 5": 0.1314, + "GPQA": 0.3096, + "MUSR": 0.4228, + "MMLU-PRO": 0.3596 + } + }, + { + "model_id": "Nitral-AI/Captain-Eris_BMO-Violent-12B", + "name": "Captain-Eris_BMO-Violent-12B", + "developer": "Nitral-AI", + "scores": { + "IFEval": 0.6152, + "BBH": 0.5104, + "MATH Level 5": 0.1367, + "GPQA": 0.3096, + "MUSR": 0.4255, + "MMLU-PRO": 0.3571 + } + }, + { + "model_id": "Nitral-AI/Captain-Eris_Violet-GRPO-v0.420", + "name": "Captain-Eris_Violet-GRPO-v0.420", + "developer": "Nitral-AI", + "scores": { + "IFEval": 0.6262, + "BBH": 0.5159, + "MATH Level 5": 0.108, + "GPQA": 0.2987, + "MUSR": 0.4279, + "MMLU-PRO": 0.3535 + } + }, + { + "model_id": "Nitral-AI/Captain-Eris_Violet-V0.420-12B", + "name": "Captain-Eris_Violet-V0.420-12B", + "developer": "Nitral-AI", + "scores": { + "IFEval": 0.4339, + "BBH": 0.5478, + "MATH Level 5": 0.1073, + "GPQA": 0.3112, + "MUSR": 0.4331, + "MMLU-PRO": 0.3723 + } + }, + { + "model_id": "Nitral-AI/Captain_BMO-12B", + "name": "Captain_BMO-12B", + "developer": "Nitral-AI", + "scores": { + "IFEval": 0.4751, + "BBH": 0.5286, + "MATH Level 5": 0.1397, + "GPQA": 0.3196, + "MUSR": 0.3748, + "MMLU-PRO": 0.3569 + } + }, + { + "model_id": "Nitral-AI/Hathor_Stable-v0.2-L3-8B", + "name": "Hathor_Stable-v0.2-L3-8B", + "developer": "Nitral-AI", + "scores": { + "IFEval": 0.7175, + "BBH": 0.5286, + "MATH Level 5": 0.105, + "GPQA": 0.2869, + "MUSR": 0.3781, + "MMLU-PRO": 0.3696 + } + }, + { + "model_id": "Nitral-AI/Hathor_Tahsin-L3-8B-v0.85", + "name": "Hathor_Tahsin-L3-8B-v0.85", + "developer": "Nitral-AI", + "scores": { + "IFEval": 0.711, + "BBH": 0.5279, + "MATH Level 5": 0.1005, + "GPQA": 0.2852, + "MUSR": 0.3647, + "MMLU-PRO": 0.372 + } + }, + { + "model_id": "Nitral-AI/Nera_Noctis-12B", + "name": "Nera_Noctis-12B", + "developer": "Nitral-AI", + "scores": { + "IFEval": 0.4562, + "BBH": 0.5194, + "MATH Level 5": 0.0876, + "GPQA": 0.2634, + "MUSR": 0.3979, + "MMLU-PRO": 0.3468 + } + }, + { + "model_id": "Nohobby/MS-Schisandra-22B-v0.1", + "name": "MS-Schisandra-22B-v0.1", + "developer": "Nohobby", + "scores": { + "IFEval": 0.6331, + "BBH": 0.579, + "MATH Level 5": 0.2228, + "GPQA": 0.3322, + "MUSR": 0.3928, + "MMLU-PRO": 0.4096 + } + }, + { + "model_id": "Nohobby/MS-Schisandra-22B-v0.2", + "name": "MS-Schisandra-22B-v0.2", + "developer": "Nohobby", + "scores": { + "IFEval": 0.6383, + "BBH": 0.5841, + "MATH Level 5": 0.2032, + "GPQA": 0.3356, + "MUSR": 0.4075, + "MMLU-PRO": 0.4136 + } + }, + { + "model_id": "Norquinal/Alpha", + "name": "Alpha", + "developer": "Norquinal", + "scores": { + "IFEval": 0.2803, + "BBH": 0.3374, + "MATH Level 5": 0.0574, + "GPQA": 0.2651, + "MUSR": 0.3631, + "MMLU-PRO": 0.3003 + } + }, + { + "model_id": "Norquinal/Bravo", + "name": "Bravo", + "developer": "Norquinal", + "scores": { + "IFEval": 0.3025, + "BBH": 0.3558, + "MATH Level 5": 0.0574, + "GPQA": 0.2819, + "MUSR": 0.3869, + "MMLU-PRO": 0.3127 + } + }, + { + "model_id": "Norquinal/Charlie", + "name": "Charlie", + "developer": "Norquinal", + "scores": { + "IFEval": 0.3061, + "BBH": 0.3515, + "MATH Level 5": 0.0582, + "GPQA": 0.271, + "MUSR": 0.3737, + "MMLU-PRO": 0.3093 + } + }, + { + "model_id": "Norquinal/Delta", + "name": "Delta", + "developer": "Norquinal", + "scores": { + "IFEval": 0.2538, + "BBH": 0.3435, + "MATH Level 5": 0.0612, + "GPQA": 0.2609, + "MUSR": 0.3777, + "MMLU-PRO": 0.2959 + } + }, + { + "model_id": "Norquinal/Echo", + "name": "Echo", + "developer": "Norquinal", + "scores": { + "IFEval": 0.3158, + "BBH": 0.353, + "MATH Level 5": 0.0574, + "GPQA": 0.2794, + "MUSR": 0.3804, + "MMLU-PRO": 0.3095 + } + }, + { + "model_id": "Norquinal/Foxtrot", + "name": "Foxtrot", + "developer": "Norquinal", + "scores": { + "IFEval": 0.3012, + "BBH": 0.3558, + "MATH Level 5": 0.0582, + "GPQA": 0.2869, + "MUSR": 0.3804, + "MMLU-PRO": 0.305 + } + }, + { + "model_id": "Norquinal/Golf", + "name": "Golf", + "developer": "Norquinal", + "scores": { + "IFEval": 0.3534, + "BBH": 0.3533, + "MATH Level 5": 0.0536, + "GPQA": 0.2903, + "MUSR": 0.338, + "MMLU-PRO": 0.3056 + } + }, + { + "model_id": "Norquinal/Hotel", + "name": "Hotel", + "developer": "Norquinal", + "scores": { + "IFEval": 0.3215, + "BBH": 0.3679, + "MATH Level 5": 0.0529, + "GPQA": 0.2794, + "MUSR": 0.3288, + "MMLU-PRO": 0.3157 + } + }, + { + "model_id": "NotASI/FineTome-Llama3.2-1B-0929", + "name": "FineTome-Llama3.2-1B-0929", + "developer": "NotASI", + "scores": { + "IFEval": 0.3991, + "BBH": 0.3246, + "MATH Level 5": 0.0363, + "GPQA": 0.2727, + "MUSR": 0.3488, + "MMLU-PRO": 0.1429 + } + }, + { + "model_id": "NotASI/FineTome-Llama3.2-3B-1002", + "name": "FineTome-Llama3.2-3B-1002", + "developer": "NotASI", + "scores": { + "IFEval": 0.5474, + "BBH": 0.4319, + "MATH Level 5": 0.0627, + "GPQA": 0.2508, + "MUSR": 0.3685, + "MMLU-PRO": 0.2437 + } + }, + { + "model_id": "NotASI/FineTome-v1.5-Llama3.2-1B-1007", + "name": "FineTome-v1.5-Llama3.2-1B-1007", + "developer": "NotASI", + "scores": { + "IFEval": 0.3924, + "BBH": 0.3241, + "MATH Level 5": 0.0317, + "GPQA": 0.25, + "MUSR": 0.3475, + "MMLU-PRO": 0.1427 + } + }, + { + "model_id": "NotASI/FineTome-v1.5-Llama3.2-3B-1007", + "name": "FineTome-v1.5-Llama3.2-3B-1007", + "developer": "NotASI", + "scores": { + "IFEval": 0.5508, + "BBH": 0.4312, + "MATH Level 5": 0.0642, + "GPQA": 0.2617, + "MUSR": 0.3645, + "MMLU-PRO": 0.2448 + } + }, + { + "model_id": "NousResearch/DeepHermes-3-Mistral-24B-Preview", + "name": "DeepHermes-3-Mistral-24B-Preview", + "developer": "NousResearch", + "scores": { + "IFEval": 0.4536, + "BBH": 0.6488, + "MATH Level 5": 0.2576, + "GPQA": 0.37, + "MUSR": 0.4503, + "MMLU-PRO": 0.459 + } + }, + { + "model_id": "NousResearch/Hermes-2-Pro-Llama-3-8B", + "name": "Hermes-2-Pro-Llama-3-8B", + "developer": "NousResearch", + "scores": { + "IFEval": 0.5362, + "BBH": 0.5071, + "MATH Level 5": 0.0838, + "GPQA": 0.2928, + "MUSR": 0.4262, + "MMLU-PRO": 0.3052 + } + }, + { + "model_id": "NousResearch/Hermes-2-Pro-Mistral-7B", + "name": "Hermes-2-Pro-Mistral-7B", + "developer": "NousResearch", + "scores": { + "IFEval": 0.5668, + "BBH": 0.4995, + "MATH Level 5": 0.0604, + "GPQA": 0.2735, + "MUSR": 0.4376, + "MMLU-PRO": 0.2946 + } + }, + { + "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B", + "name": "Hermes-2-Theta-Llama-3-8B", + "developer": "NousResearch", + "scores": { + "IFEval": 0.6518, + "BBH": 0.5207, + "MATH Level 5": 0.0967, + "GPQA": 0.3037, + "MUSR": 0.3949, + "MMLU-PRO": 0.3369 + } + }, + { + "model_id": "NousResearch/Hermes-3-Llama-3.1-70B", + "name": "Hermes-3-Llama-3.1-70B", + "developer": "NousResearch", + "scores": { + "IFEval": 0.7661, + "BBH": 0.6756, + "MATH Level 5": 0.21, + "GPQA": 0.3616, + "MUSR": 0.4949, + "MMLU-PRO": 0.4727 + } + }, + { + "model_id": "NousResearch/Hermes-3-Llama-3.1-8B", + "name": "Hermes-3-Llama-3.1-8B", + "developer": "NousResearch", + "scores": { + "IFEval": 0.617, + "BBH": 0.5177, + "MATH Level 5": 0.0476, + "GPQA": 0.2978, + "MUSR": 0.4369, + "MMLU-PRO": 0.3139 + } + }, + { + "model_id": "NousResearch/Hermes-3-Llama-3.2-3B", + "name": "Hermes-3-Llama-3.2-3B", + "developer": "NousResearch", + "scores": { + "IFEval": 0.3825, + "BBH": 0.4352, + "MATH Level 5": 0.0393, + "GPQA": 0.2752, + "MUSR": 0.403, + "MMLU-PRO": 0.2544 + } + }, + { + "model_id": "NousResearch/Nous-Hermes-2-Mistral-7B-DPO", + "name": "Nous-Hermes-2-Mistral-7B-DPO", + "developer": "NousResearch", + "scores": { + "IFEval": 0.5763, + "BBH": 0.4853, + "MATH Level 5": 0.0476, + "GPQA": 0.2928, + "MUSR": 0.4, + "MMLU-PRO": 0.3015 + } + }, + { + "model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO", + "name": "Nous-Hermes-2-Mixtral-8x7B-DPO", + "developer": "NousResearch", + "scores": { + "IFEval": 0.5897, + "BBH": 0.5539, + "MATH Level 5": 0.1224, + "GPQA": 0.3213, + "MUSR": 0.4595, + "MMLU-PRO": 0.3666 + } + }, + { + "model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT", + "name": "Nous-Hermes-2-Mixtral-8x7B-SFT", + "developer": "NousResearch", + "scores": { + "IFEval": 0.5731, + "BBH": 0.5058, + "MATH Level 5": 0.0211, + "GPQA": 0.302, + "MUSR": 0.4214, + "MMLU-PRO": 0.3066 + } + }, + { + "model_id": "NousResearch/Nous-Hermes-2-SOLAR-10.7B", + "name": "Nous-Hermes-2-SOLAR-10.7B", + "developer": "NousResearch", + "scores": { + "IFEval": 0.5279, + "BBH": 0.5414, + "MATH Level 5": 0.0574, + "GPQA": 0.2936, + "MUSR": 0.4373, + "MMLU-PRO": 0.3458 + } + }, + { + "model_id": "NousResearch/Nous-Hermes-llama-2-7b", + "name": "Nous-Hermes-llama-2-7b", + "developer": "NousResearch", + "scores": { + "IFEval": 0.1729, + "BBH": 0.3824, + "MATH Level 5": 0.0091, + "GPQA": 0.2634, + "MUSR": 0.4257, + "MMLU-PRO": 0.194 + } + }, + { + "model_id": "NousResearch/Yarn-Llama-2-13b-128k", + "name": "Yarn-Llama-2-13b-128k", + "developer": "NousResearch", + "scores": { + "IFEval": 0.1655, + "BBH": 0.3827, + "MATH Level 5": 0.0174, + "GPQA": 0.2584, + "MUSR": 0.3458, + "MMLU-PRO": 0.232 + } + }, + { + "model_id": "NousResearch/Yarn-Llama-2-7b-128k", + "name": "Yarn-Llama-2-7b-128k", + "developer": "NousResearch", + "scores": { + "IFEval": 0.1485, + "BBH": 0.3248, + "MATH Level 5": 0.0151, + "GPQA": 0.2601, + "MUSR": 0.3967, + "MMLU-PRO": 0.1791 + } + }, + { + "model_id": "NousResearch/Yarn-Llama-2-7b-64k", + "name": "Yarn-Llama-2-7b-64k", + "developer": "NousResearch", + "scores": { + "IFEval": 0.17, + "BBH": 0.3326, + "MATH Level 5": 0.0159, + "GPQA": 0.2643, + "MUSR": 0.3939, + "MMLU-PRO": 0.1799 + } + }, + { + "model_id": "NousResearch/Yarn-Mistral-7b-128k", + "name": "Yarn-Mistral-7b-128k", + "developer": "NousResearch", + "scores": { + "IFEval": 0.1934, + "BBH": 0.4314, + "MATH Level 5": 0.0317, + "GPQA": 0.2987, + "MUSR": 0.4071, + "MMLU-PRO": 0.2893 + } + }, + { + "model_id": "NousResearch/Yarn-Mistral-7b-64k", + "name": "Yarn-Mistral-7b-64k", + "developer": "NousResearch", + "scores": { + "IFEval": 0.208, + "BBH": 0.4293, + "MATH Level 5": 0.037, + "GPQA": 0.2903, + "MUSR": 0.4124, + "MMLU-PRO": 0.2914 + } + }, + { + "model_id": "NousResearch/Yarn-Solar-10b-32k", + "name": "Yarn-Solar-10b-32k", + "developer": "NousResearch", + "scores": { + "IFEval": 0.1942, + "BBH": 0.4987, + "MATH Level 5": 0.0302, + "GPQA": 0.3029, + "MUSR": 0.4146, + "MMLU-PRO": 0.3272 + } + }, + { + "model_id": "NousResearch/Yarn-Solar-10b-64k", + "name": "Yarn-Solar-10b-64k", + "developer": "NousResearch", + "scores": { + "IFEval": 0.1989, + "BBH": 0.4922, + "MATH Level 5": 0.0287, + "GPQA": 0.302, + "MUSR": 0.4014, + "MMLU-PRO": 0.3148 + } + }, + { + "model_id": "Novaciano/ASTAROTH-3.2-1B", + "name": "ASTAROTH-3.2-1B", + "developer": "Novaciano", + "scores": { + "IFEval": 0.5613, + "BBH": 0.3543, + "MATH Level 5": 0.0733, + "GPQA": 0.2559, + "MUSR": 0.3142, + "MMLU-PRO": 0.1909 + } + }, + { + "model_id": "Novaciano/BLAST_PROCESSING-3.2-1B", + "name": "BLAST_PROCESSING-3.2-1B", + "developer": "Novaciano", + "scores": { + "IFEval": 0.3922, + "BBH": 0.346, + "MATH Level 5": 0.0748, + "GPQA": 0.2659, + "MUSR": 0.3351, + "MMLU-PRO": 0.1941 + } + }, + { + "model_id": "Novaciano/Cerberus-3.2-1B", + "name": "Cerberus-3.2-1B", + "developer": "Novaciano", + "scores": { + "IFEval": 0.5017, + "BBH": 0.4165, + "MATH Level 5": 0.0582, + "GPQA": 0.2584, + "MUSR": 0.3289, + "MMLU-PRO": 0.1663 + } + }, + { + "model_id": "Novaciano/Cultist-3.2-1B", + "name": "Cultist-3.2-1B", + "developer": "Novaciano", + "scores": { + "IFEval": 0.5295, + "BBH": 0.3399, + "MATH Level 5": 0.0589, + "GPQA": 0.2609, + "MUSR": 0.333, + "MMLU-PRO": 0.1714 + } + }, + { + "model_id": "Novaciano/FuseChat-3.2-1B-GRPO_Creative_RP", + "name": "FuseChat-3.2-1B-GRPO_Creative_RP", + "developer": "Novaciano", + "scores": { + "IFEval": 0.5598, + "BBH": 0.3488, + "MATH Level 5": 0.0801, + "GPQA": 0.2559, + "MUSR": 0.3329, + "MMLU-PRO": 0.1735 + } + }, + { + "model_id": "Novaciano/Fusetrix-3.2-1B-GRPO_RP_Creative", + "name": "Fusetrix-3.2-1B-GRPO_RP_Creative", + "developer": "Novaciano", + "scores": { + "IFEval": 0.5366, + "BBH": 0.3435, + "MATH Level 5": 0.1148, + "GPQA": 0.25, + "MUSR": 0.3209, + "MMLU-PRO": 0.1758 + } + }, + { + "model_id": "Novaciano/Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP", + "name": "Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP", + "developer": "Novaciano", + "scores": { + "IFEval": 0.5343, + "BBH": 0.3502, + "MATH Level 5": 0.105, + "GPQA": 0.2685, + "MUSR": 0.3183, + "MMLU-PRO": 0.1823 + } + }, + { + "model_id": "Novaciano/HarmfulProject-3.2-1B", + "name": "HarmfulProject-3.2-1B", + "developer": "Novaciano", + "scores": { + "IFEval": 0.3874, + "BBH": 0.3274, + "MATH Level 5": 0.0476, + "GPQA": 0.2668, + "MUSR": 0.3419, + "MMLU-PRO": 0.1823 + } + }, + { + "model_id": "Novaciano/LEWD-Mental-Cultist-3.2-1B", + "name": "LEWD-Mental-Cultist-3.2-1B", + "developer": "Novaciano", + "scores": { + "IFEval": 0.5309, + "BBH": 0.3513, + "MATH Level 5": 0.0529, + "GPQA": 0.2567, + "MUSR": 0.3223, + "MMLU-PRO": 0.1769 + } + }, + { + "model_id": "Novaciano/La_Mejor_Mezcla-3.2-1B", + "name": "La_Mejor_Mezcla-3.2-1B", + "developer": "Novaciano", + "scores": { + "IFEval": 0.551, + "BBH": 0.3488, + "MATH Level 5": 0.0899, + "GPQA": 0.2576, + "MUSR": 0.3196, + "MMLU-PRO": 0.1829 + } + }, + { + "model_id": "Novaciano/Sigil-Of-Satan-3.2-1B", + "name": "Sigil-Of-Satan-3.2-1B", + "developer": "Novaciano", + "scores": { + "IFEval": 0.5494, + "BBH": 0.3546, + "MATH Level 5": 0.0544, + "GPQA": 0.2609, + "MUSR": 0.3276, + "MMLU-PRO": 0.1855 + } + }, + { + "model_id": "NucleusAI/nucleus-22B-token-500B", + "name": "nucleus-22B-token-500B", + "developer": "NucleusAI", + "scores": { + "IFEval": 0.0257, + "BBH": 0.292, + "MATH Level 5": 0.0, + "GPQA": 0.25, + "MUSR": 0.3511, + "MMLU-PRO": 0.1162 + } + }, + { + "model_id": "NyxKrage/Microsoft_Phi-4", + "name": "Microsoft_Phi-4", + "developer": "NyxKrage", + "scores": { + "IFEval": 0.0585, + "BBH": 0.6691, + "MATH Level 5": 0.2991, + "GPQA": 0.406, + "MUSR": 0.5034, + "MMLU-PRO": 0.5287 + } + }, + { + "model_id": "OEvortex/Emotional-llama-8B", + "name": "Emotional-llama-8B", + "developer": "OEvortex", + "scores": { + "IFEval": 0.3516, + "BBH": 0.4839, + "MATH Level 5": 0.0816, + "GPQA": 0.2945, + "MUSR": 0.3659, + "MMLU-PRO": 0.3535 + } + }, + { + "model_id": "OEvortex/HelpingAI-15B", + "name": "HelpingAI-15B", + "developer": "OEvortex", + "scores": { + "IFEval": 0.203, + "BBH": 0.2936, + "MATH Level 5": 0.0, + "GPQA": 0.2576, + "MUSR": 0.3619, + "MMLU-PRO": 0.1111 + } + }, + { + "model_id": "OEvortex/HelpingAI-3B-reloaded", + "name": "HelpingAI-3B-reloaded", + "developer": "OEvortex", + "scores": { + "IFEval": 0.4647, + "BBH": 0.4129, + "MATH Level 5": 0.0136, + "GPQA": 0.2634, + "MUSR": 0.3524, + "MMLU-PRO": 0.2595 + } + }, + { + "model_id": "OEvortex/HelpingAI2-9B", + "name": "HelpingAI2-9B", + "developer": "OEvortex", + "scores": { + "IFEval": 0.4413, + "BBH": 0.4845, + "MATH Level 5": 0.0589, + "GPQA": 0.2584, + "MUSR": 0.3711, + "MMLU-PRO": 0.29 + } + }, + { + "model_id": "OEvortex/HelpingAI2.5-10B", + "name": "HelpingAI2.5-10B", + "developer": "OEvortex", + "scores": { + "IFEval": 0.3277, + "BBH": 0.4496, + "MATH Level 5": 0.0204, + "GPQA": 0.2693, + "MUSR": 0.3738, + "MMLU-PRO": 0.2575 + } + }, + { + "model_id": "OliveiraJLT/Sagui-7B-Instruct-v0.1", + "name": "Sagui-7B-Instruct-v0.1", + "developer": "OliveiraJLT", + "scores": { + "IFEval": 0.2892, + "BBH": 0.3111, + "MATH Level 5": 0.0151, + "GPQA": 0.2424, + "MUSR": 0.4191, + "MMLU-PRO": 0.1485 + } + }, + { + "model_id": "Omkar1102/code-yi", + "name": "code-yi", + "developer": "Omkar1102", + "scores": { + "IFEval": 0.2148, + "BBH": 0.276, + "MATH Level 5": 0.0, + "GPQA": 0.2508, + "MUSR": 0.3802, + "MMLU-PRO": 0.1126 + } + }, + { + "model_id": "OmnicromsBrain/NeuralStar_FusionWriter_4x7b", + "name": "NeuralStar_FusionWriter_4x7b", + "developer": "OmnicromsBrain", + "scores": { + "IFEval": 0.5964, + "BBH": 0.4776, + "MATH Level 5": 0.0491, + "GPQA": 0.2785, + "MUSR": 0.4019, + "MMLU-PRO": 0.2606 + } + }, + { + "model_id": "OnlyCheeini/greesychat-turbo", + "name": "greesychat-turbo", + "developer": "OnlyCheeini", + "scores": { + "IFEval": 0.0233, + "BBH": 0.3092, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.3314, + "MMLU-PRO": 0.1138 + } + }, + { + "model_id": "Open-Orca/Mistral-7B-OpenOrca", + "name": "Mistral-7B-OpenOrca", + "developer": "Open-Orca", + "scores": { + "IFEval": 0.4978, + "BBH": 0.4768, + "MATH Level 5": 0.0355, + "GPQA": 0.2718, + "MUSR": 0.3858, + "MMLU-PRO": 0.2653 + } + }, + { + "model_id": "OpenAssistant/oasst-sft-1-pythia-12b", + "name": "oasst-sft-1-pythia-12b", + "developer": "OpenAssistant", + "scores": { + "IFEval": 0.1055, + "BBH": 0.3147, + "MATH Level 5": 0.0151, + "GPQA": 0.2576, + "MUSR": 0.3327, + "MMLU-PRO": 0.1113 + } + }, + { + "model_id": "OpenBuddy/openbuddy-falcon3-10b-v24.2-131k", + "name": "openbuddy-falcon3-10b-v24.2-131k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.5086, + "BBH": 0.6004, + "MATH Level 5": 0.213, + "GPQA": 0.2995, + "MUSR": 0.4186, + "MMLU-PRO": 0.3834 + } + }, + { + "model_id": "OpenBuddy/openbuddy-llama3-70b-v21.2-32k", + "name": "openbuddy-llama3-70b-v21.2-32k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.701, + "BBH": 0.6507, + "MATH Level 5": 0.2032, + "GPQA": 0.3423, + "MUSR": 0.458, + "MMLU-PRO": 0.4832 + } + }, + { + "model_id": "OpenBuddy/openbuddy-llama3-8b-v21.1-8k", + "name": "openbuddy-llama3-8b-v21.1-8k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.557, + "BBH": 0.4788, + "MATH Level 5": 0.0431, + "GPQA": 0.271, + "MUSR": 0.3988, + "MMLU-PRO": 0.2955 + } + }, + { + "model_id": "OpenBuddy/openbuddy-llama3-8b-v21.2-32k", + "name": "openbuddy-llama3-8b-v21.2-32k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.6192, + "BBH": 0.4856, + "MATH Level 5": 0.0785, + "GPQA": 0.2794, + "MUSR": 0.3779, + "MMLU-PRO": 0.3299 + } + }, + { + "model_id": "OpenBuddy/openbuddy-llama3.1-70b-v22.1-131k", + "name": "openbuddy-llama3.1-70b-v22.1-131k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.7333, + "BBH": 0.6698, + "MATH Level 5": 0.395, + "GPQA": 0.375, + "MUSR": 0.463, + "MMLU-PRO": 0.5304 + } + }, + { + "model_id": "OpenBuddy/openbuddy-llama3.1-8b-v22.2-131k", + "name": "openbuddy-llama3.1-8b-v22.2-131k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.6657, + "BBH": 0.5007, + "MATH Level 5": 0.1148, + "GPQA": 0.2794, + "MUSR": 0.4081, + "MMLU-PRO": 0.331 + } + }, + { + "model_id": "OpenBuddy/openbuddy-llama3.1-8b-v22.3-131k", + "name": "openbuddy-llama3.1-8b-v22.3-131k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.5997, + "BBH": 0.5066, + "MATH Level 5": 0.1208, + "GPQA": 0.2794, + "MUSR": 0.4015, + "MMLU-PRO": 0.3277 + } + }, + { + "model_id": "OpenBuddy/openbuddy-llama3.2-1b-v23.1-131k", + "name": "openbuddy-llama3.2-1b-v23.1-131k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.359, + "BBH": 0.3267, + "MATH Level 5": 0.0249, + "GPQA": 0.2584, + "MUSR": 0.3342, + "MMLU-PRO": 0.184 + } + }, + { + "model_id": "OpenBuddy/openbuddy-llama3.2-3b-v23.2-131k", + "name": "openbuddy-llama3.2-3b-v23.2-131k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.4319, + "BBH": 0.4073, + "MATH Level 5": 0.0264, + "GPQA": 0.276, + "MUSR": 0.3263, + "MMLU-PRO": 0.2479 + } + }, + { + "model_id": "OpenBuddy/openbuddy-llama3.3-70b-v24.1-131k", + "name": "openbuddy-llama3.3-70b-v24.1-131k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.8121, + "BBH": 0.6858, + "MATH Level 5": 0.4411, + "GPQA": 0.4346, + "MUSR": 0.4869, + "MMLU-PRO": 0.5327 + } + }, + { + "model_id": "OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k", + "name": "openbuddy-mixtral-7bx8-v18.1-32k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.5493, + "BBH": 0.4656, + "MATH Level 5": 0.108, + "GPQA": 0.3045, + "MUSR": 0.3831, + "MMLU-PRO": 0.3804 + } + }, + { + "model_id": "OpenBuddy/openbuddy-nemotron-70b-v23.1-131k", + "name": "openbuddy-nemotron-70b-v23.1-131k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.7555, + "BBH": 0.6749, + "MATH Level 5": 0.321, + "GPQA": 0.3633, + "MUSR": 0.4538, + "MMLU-PRO": 0.5175 + } + }, + { + "model_id": "OpenBuddy/openbuddy-nemotron-70b-v23.2-131k", + "name": "openbuddy-nemotron-70b-v23.2-131k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.7227, + "BBH": 0.6705, + "MATH Level 5": 0.3157, + "GPQA": 0.3599, + "MUSR": 0.4696, + "MMLU-PRO": 0.5121 + } + }, + { + "model_id": "OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.1-200k", + "name": "openbuddy-qwen2.5llamaify-14b-v23.1-200k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.6309, + "BBH": 0.6013, + "MATH Level 5": 0.2538, + "GPQA": 0.3331, + "MUSR": 0.424, + "MMLU-PRO": 0.4673 + } + }, + { + "model_id": "OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.3-200k", + "name": "openbuddy-qwen2.5llamaify-14b-v23.3-200k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.6131, + "BBH": 0.6081, + "MATH Level 5": 0.2311, + "GPQA": 0.3272, + "MUSR": 0.4346, + "MMLU-PRO": 0.4795 + } + }, + { + "model_id": "OpenBuddy/openbuddy-qwen2.5llamaify-7b-v23.1-200k", + "name": "openbuddy-qwen2.5llamaify-7b-v23.1-200k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.5673, + "BBH": 0.5509, + "MATH Level 5": 0.1888, + "GPQA": 0.3146, + "MUSR": 0.4363, + "MMLU-PRO": 0.3948 + } + }, + { + "model_id": "OpenBuddy/openbuddy-qwq-32b-v24.1-200k", + "name": "openbuddy-qwq-32b-v24.1-200k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.5937, + "BBH": 0.6798, + "MATH Level 5": 0.3739, + "GPQA": 0.3809, + "MUSR": 0.4849, + "MMLU-PRO": 0.549 + } + }, + { + "model_id": "OpenBuddy/openbuddy-qwq-32b-v24.2-200k", + "name": "openbuddy-qwq-32b-v24.2-200k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.597, + "BBH": 0.6772, + "MATH Level 5": 0.3776, + "GPQA": 0.3767, + "MUSR": 0.4718, + "MMLU-PRO": 0.5446 + } + }, + { + "model_id": "OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k", + "name": "openbuddy-yi1.5-34b-v21.3-32k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.542, + "BBH": 0.6163, + "MATH Level 5": 0.1782, + "GPQA": 0.349, + "MUSR": 0.4439, + "MMLU-PRO": 0.4599 + } + }, + { + "model_id": "OpenBuddy/openbuddy-zero-14b-v22.3-32k", + "name": "openbuddy-zero-14b-v22.3-32k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.3753, + "BBH": 0.486, + "MATH Level 5": 0.0937, + "GPQA": 0.307, + "MUSR": 0.4166, + "MMLU-PRO": 0.3187 + } + }, + { + "model_id": "OpenBuddy/openbuddy-zero-3b-v21.2-32k", + "name": "openbuddy-zero-3b-v21.2-32k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.3802, + "BBH": 0.3935, + "MATH Level 5": 0.0189, + "GPQA": 0.2601, + "MUSR": 0.3566, + "MMLU-PRO": 0.2034 + } + }, + { + "model_id": "OpenBuddy/openbuddy-zero-56b-v21.2-32k", + "name": "openbuddy-zero-56b-v21.2-32k", + "developer": "OpenBuddy", + "scores": { + "IFEval": 0.5057, + "BBH": 0.6128, + "MATH Level 5": 0.1624, + "GPQA": 0.318, + "MUSR": 0.4305, + "MMLU-PRO": 0.4399 + } + }, + { + "model_id": "OpenGenerativeAI/Bifrost", + "name": "Bifrost", + "developer": "OpenGenerativeAI", + "scores": { + "IFEval": 0.6348, + "BBH": 0.6849, + "MATH Level 5": 0.2545, + "GPQA": 0.3683, + "MUSR": 0.4598, + "MMLU-PRO": 0.516 + } + }, + { + "model_id": "OpenGenerativeAI/Bifrost-14B", + "name": "Bifrost-14B", + "developer": "OpenGenerativeAI", + "scores": { + "IFEval": 0.6615, + "BBH": 0.6845, + "MATH Level 5": 0.2356, + "GPQA": 0.3792, + "MUSR": 0.4624, + "MMLU-PRO": 0.5074 + } + }, + { + "model_id": "OpenLLM-France/Lucie-7B", + "name": "Lucie-7B", + "developer": "OpenLLM-France", + "scores": { + "IFEval": 0.2496, + "BBH": 0.3492, + "MATH Level 5": 0.0144, + "GPQA": 0.2727, + "MUSR": 0.3923, + "MMLU-PRO": 0.1498 + } + }, + { + "model_id": "OpenLLM-France/Lucie-7B-Instruct", + "name": "Lucie-7B-Instruct", + "developer": "OpenLLM-France", + "scores": { + "IFEval": 0.2796, + "BBH": 0.3254, + "MATH Level 5": 0.0166, + "GPQA": 0.2794, + "MUSR": 0.3662, + "MMLU-PRO": 0.1556 + } + }, + { + "model_id": "OpenLLM-France/Lucie-7B-Instruct-human-data", + "name": "Lucie-7B-Instruct-human-data", + "developer": "OpenLLM-France", + "scores": { + "IFEval": 0.2946, + "BBH": 0.3284, + "MATH Level 5": 0.0219, + "GPQA": 0.2752, + "MUSR": 0.3729, + "MMLU-PRO": 0.143 + } + }, + { + "model_id": "OpenLLM-France/Lucie-7B-Instruct-v1.1", + "name": "Lucie-7B-Instruct-v1.1", + "developer": "OpenLLM-France", + "scores": { + "IFEval": 0.3039, + "BBH": 0.3816, + "MATH Level 5": 0.0317, + "GPQA": 0.2819, + "MUSR": 0.375, + "MMLU-PRO": 0.1864 + } + }, + { + "model_id": "OpenLeecher/llama3-8b-lima", + "name": "llama3-8b-lima", + "developer": "OpenLeecher", + "scores": { + "IFEval": 0.4371, + "BBH": 0.4296, + "MATH Level 5": 0.0506, + "GPQA": 0.2383, + "MUSR": 0.3713, + "MMLU-PRO": 0.2626 + } + }, + { + "model_id": "OpenScholar/Llama-3.1_OpenScholar-8B", + "name": "Llama-3.1_OpenScholar-8B", + "developer": "OpenScholar", + "scores": { + "IFEval": 0.6064, + "BBH": 0.5208, + "MATH Level 5": 0.1654, + "GPQA": 0.2819, + "MUSR": 0.4275, + "MMLU-PRO": 0.3708 + } + }, + { + "model_id": "Orenguteng/Llama-3.1-8B-Lexi-Uncensored", + "name": "Llama-3.1-8B-Lexi-Uncensored", + "developer": "Orenguteng", + "scores": { + "IFEval": 0.7777, + "BBH": 0.5057, + "MATH Level 5": 0.1571, + "GPQA": 0.2718, + "MUSR": 0.3871, + "MMLU-PRO": 0.379 + } + }, + { + "model_id": "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2", + "name": "Llama-3.1-8B-Lexi-Uncensored-V2", + "developer": "Orenguteng", + "scores": { + "IFEval": 0.7792, + "BBH": 0.5084, + "MATH Level 5": 0.1971, + "GPQA": 0.2827, + "MUSR": 0.3843, + "MMLU-PRO": 0.3781 + } + }, + { + "model_id": "Orion-zhen/Qwen2.5-7B-Instruct-Uncensored", + "name": "Qwen2.5-7B-Instruct-Uncensored", + "developer": "Orion-zhen", + "scores": { + "IFEval": 0.7204, + "BBH": 0.5474, + "MATH Level 5": 0.4773, + "GPQA": 0.3029, + "MUSR": 0.4361, + "MMLU-PRO": 0.4427 + } + }, + { + "model_id": "Orion-zhen/phi-4-abliterated", + "name": "phi-4-abliterated", + "developer": "Orion-zhen", + "scores": { + "IFEval": 0.0576, + "BBH": 0.6698, + "MATH Level 5": 0.3021, + "GPQA": 0.4044, + "MUSR": 0.5006, + "MMLU-PRO": 0.5292 + } + }, + { + "model_id": "P0x0/Astra-v1-12B", + "name": "Astra-v1-12B", + "developer": "P0x0", + "scores": { + "IFEval": 0.2806, + "BBH": 0.5215, + "MATH Level 5": 0.1133, + "GPQA": 0.3138, + "MUSR": 0.4052, + "MMLU-PRO": 0.3461 + } + }, + { + "model_id": "PJMixers-Dev/L3.2-Instruct-Thinking-v0.1-1B", + "name": "L3.2-Instruct-Thinking-v0.1-1B", + "developer": "PJMixers-Dev", + "scores": { + "IFEval": 0.4628, + "BBH": 0.3302, + "MATH Level 5": 0.0544, + "GPQA": 0.2576, + "MUSR": 0.3262, + "MMLU-PRO": 0.1483 + } + }, + { + "model_id": "PJMixers-Dev/LLaMa-3.1-Instruct-Interleaved-Zeroed-13B", + "name": "LLaMa-3.1-Instruct-Interleaved-Zeroed-13B", + "developer": "PJMixers-Dev", + "scores": { + "IFEval": 0.7871, + "BBH": 0.5073, + "MATH Level 5": 0.2002, + "GPQA": 0.2919, + "MUSR": 0.387, + "MMLU-PRO": 0.3767 + } + }, + { + "model_id": "PJMixers-Dev/LLaMa-3.1-RomboTiesTest-8B", + "name": "LLaMa-3.1-RomboTiesTest-8B", + "developer": "PJMixers-Dev", + "scores": { + "IFEval": 0.7825, + "BBH": 0.5073, + "MATH Level 5": 0.2002, + "GPQA": 0.2919, + "MUSR": 0.387, + "MMLU-PRO": 0.3767 + } + }, + { + "model_id": "PJMixers-Dev/LLaMa-3.1-RomboTiesTest2-8B", + "name": "LLaMa-3.1-RomboTiesTest2-8B", + "developer": "PJMixers-Dev", + "scores": { + "IFEval": 0.7825, + "BBH": 0.5073, + "MATH Level 5": 0.2002, + "GPQA": 0.2919, + "MUSR": 0.387, + "MMLU-PRO": 0.3767 + } + }, + { + "model_id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B", + "name": "LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B", + "developer": "PJMixers-Dev", + "scores": { + "IFEval": 0.6931, + "BBH": 0.4556, + "MATH Level 5": 0.1216, + "GPQA": 0.2743, + "MUSR": 0.37, + "MMLU-PRO": 0.3127 + } + }, + { + "model_id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B", + "name": "LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B", + "developer": "PJMixers-Dev", + "scores": { + "IFEval": 0.6292, + "BBH": 0.4581, + "MATH Level 5": 0.1299, + "GPQA": 0.2727, + "MUSR": 0.3659, + "MMLU-PRO": 0.3115 + } + }, + { + "model_id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B", + "name": "LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B", + "developer": "PJMixers-Dev", + "scores": { + "IFEval": 0.6504, + "BBH": 0.4511, + "MATH Level 5": 0.1261, + "GPQA": 0.2718, + "MUSR": 0.3687, + "MMLU-PRO": 0.3108 + } + }, + { + "model_id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMixBread-v0.1-3B", + "name": "LLaMa-3.2-Instruct-JankMixBread-v0.1-3B", + "developer": "PJMixers-Dev", + "scores": { + "IFEval": 0.5041, + "BBH": 0.4483, + "MATH Level 5": 0.1307, + "GPQA": 0.2827, + "MUSR": 0.3516, + "MMLU-PRO": 0.3083 + } + }, + { + "model_id": "PJMixers-Dev/Qwen2.5-RomboTiesTest-7B", + "name": "Qwen2.5-RomboTiesTest-7B", + "developer": "PJMixers-Dev", + "scores": { + "IFEval": 0.7558, + "BBH": 0.5399, + "MATH Level 5": 0.4962, + "GPQA": 0.2978, + "MUSR": 0.4034, + "MMLU-PRO": 0.4285 + } + }, + { + "model_id": "PJMixers/LLaMa-3-CursedStock-v2.0-8B", + "name": "LLaMa-3-CursedStock-v2.0-8B", + "developer": "PJMixers", + "scores": { + "IFEval": 0.6331, + "BBH": 0.5271, + "MATH Level 5": 0.0944, + "GPQA": 0.2743, + "MUSR": 0.3856, + "MMLU-PRO": 0.3556 + } + }, + { + "model_id": "Parissa3/test-model", + "name": "test-model", + "developer": "Parissa3", + "scores": { + "IFEval": 0.3883, + "BBH": 0.5194, + "MATH Level 5": 0.065, + "GPQA": 0.2945, + "MUSR": 0.4685, + "MMLU-PRO": 0.3057 + } + }, + { + "model_id": "Pinkstack/PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B", + "name": "PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B", + "developer": "Pinkstack", + "scores": { + "IFEval": 0.5085, + "BBH": 0.4711, + "MATH Level 5": 0.1692, + "GPQA": 0.297, + "MUSR": 0.4479, + "MMLU-PRO": 0.3511 + } + }, + { + "model_id": "Pinkstack/SuperThoughts-CoT-14B-16k-o1-QwQ", + "name": "SuperThoughts-CoT-14B-16k-o1-QwQ", + "developer": "Pinkstack", + "scores": { + "IFEval": 0.0515, + "BBH": 0.672, + "MATH Level 5": 0.4199, + "GPQA": 0.3926, + "MUSR": 0.4914, + "MMLU-PRO": 0.5268 + } + }, + { + "model_id": "Pinkstack/Superthoughts-lite-1.8B-experimental-o1", + "name": "Superthoughts-lite-1.8B-experimental-o1", + "developer": "Pinkstack", + "scores": { + "IFEval": 0.0375, + "BBH": 0.3435, + "MATH Level 5": 0.0317, + "GPQA": 0.2752, + "MUSR": 0.3354, + "MMLU-PRO": 0.1851 + } + }, + { + "model_id": "Pinkstack/Superthoughts-lite-v1", + "name": "Superthoughts-lite-v1", + "developer": "Pinkstack", + "scores": { + "IFEval": 0.1659, + "BBH": 0.3466, + "MATH Level 5": 0.0295, + "GPQA": 0.281, + "MUSR": 0.3672, + "MMLU-PRO": 0.1755 + } + }, + { + "model_id": "PocketDoc/Dans-Instruct-CoreCurriculum-12b", + "name": "Dans-Instruct-CoreCurriculum-12b", + "developer": "PocketDoc", + "scores": { + "IFEval": 0.2191, + "BBH": 0.3789, + "MATH Level 5": 0.0544, + "GPQA": 0.2827, + "MUSR": 0.4096, + "MMLU-PRO": 0.1219 + } + }, + { + "model_id": "PocketDoc/Dans-PersonalityEngine-V1.1.0-12b", + "name": "Dans-PersonalityEngine-V1.1.0-12b", + "developer": "PocketDoc", + "scores": { + "IFEval": 0.7075, + "BBH": 0.5361, + "MATH Level 5": 0.105, + "GPQA": 0.2869, + "MUSR": 0.4587, + "MMLU-PRO": 0.3262 + } + }, + { + "model_id": "PocketDoc/Dans-PersonalityEngine-V1.2.0-24b", + "name": "Dans-PersonalityEngine-V1.2.0-24b", + "developer": "PocketDoc", + "scores": { + "IFEval": 0.7886, + "BBH": 0.6421, + "MATH Level 5": 0.2455, + "GPQA": 0.3188, + "MUSR": 0.43, + "MMLU-PRO": 0.5026 + } + }, + { + "model_id": "PocketDoc/Dans-PersonalityEngine-v1.0.0-8b", + "name": "Dans-PersonalityEngine-v1.0.0-8b", + "developer": "PocketDoc", + "scores": { + "IFEval": 0.4982, + "BBH": 0.4733, + "MATH Level 5": 0.0816, + "GPQA": 0.2852, + "MUSR": 0.3542, + "MMLU-PRO": 0.3065 + } + }, + { + "model_id": "PocketDoc/Dans-SakuraKaze-V1.0.0-12b", + "name": "Dans-SakuraKaze-V1.0.0-12b", + "developer": "PocketDoc", + "scores": { + "IFEval": 0.652, + "BBH": 0.5405, + "MATH Level 5": 0.0929, + "GPQA": 0.2936, + "MUSR": 0.4745, + "MMLU-PRO": 0.356 + } + }, + { + "model_id": "PowerInfer/SmallThinker-3B-Preview", + "name": "SmallThinker-3B-Preview", + "developer": "PowerInfer", + "scores": { + "IFEval": 0.62, + "BBH": 0.4495, + "MATH Level 5": 0.2779, + "GPQA": 0.2609, + "MUSR": 0.3525, + "MMLU-PRO": 0.3018 + } + }, + { + "model_id": "PranavHarshan/LaMistral-V4", + "name": "LaMistral-V4", + "developer": "PranavHarshan", + "scores": { + "IFEval": 0.6239, + "BBH": 0.5184, + "MATH Level 5": 0.0687, + "GPQA": 0.328, + "MUSR": 0.3643, + "MMLU-PRO": 0.3599 + } + }, + { + "model_id": "PranavHarshan/MedNarra-X1", + "name": "MedNarra-X1", + "developer": "PranavHarshan", + "scores": { + "IFEval": 0.4338, + "BBH": 0.4637, + "MATH Level 5": 0.0438, + "GPQA": 0.3079, + "MUSR": 0.354, + "MMLU-PRO": 0.3431 + } + }, + { + "model_id": "Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Appended", + "name": "OpenChat-3.5-0106_10.7B_48Layers-Appended", + "developer": "Pretergeek", + "scores": { + "IFEval": 0.5961, + "BBH": 0.462, + "MATH Level 5": 0.0793, + "GPQA": 0.307, + "MUSR": 0.4254, + "MMLU-PRO": 0.329 + } + }, + { + "model_id": "Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Interleaved", + "name": "OpenChat-3.5-0106_10.7B_48Layers-Interleaved", + "developer": "Pretergeek", + "scores": { + "IFEval": 0.5961, + "BBH": 0.462, + "MATH Level 5": 0.0778, + "GPQA": 0.3045, + "MUSR": 0.4254, + "MMLU-PRO": 0.3299 + } + }, + { + "model_id": "Pretergeek/OpenChat-3.5-0106_32K-PoSE", + "name": "OpenChat-3.5-0106_32K-PoSE", + "developer": "Pretergeek", + "scores": { + "IFEval": 0.3969, + "BBH": 0.3471, + "MATH Level 5": 0.0264, + "GPQA": 0.276, + "MUSR": 0.4205, + "MMLU-PRO": 0.2031 + } + }, + { + "model_id": "Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Appended", + "name": "OpenChat-3.5-0106_8.11B_36Layers-Appended", + "developer": "Pretergeek", + "scores": { + "IFEval": 0.5976, + "BBH": 0.462, + "MATH Level 5": 0.0793, + "GPQA": 0.307, + "MUSR": 0.4254, + "MMLU-PRO": 0.329 + } + }, + { + "model_id": "Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Interleaved", + "name": "OpenChat-3.5-0106_8.11B_36Layers-Interleaved", + "developer": "Pretergeek", + "scores": { + "IFEval": 0.5961, + "BBH": 0.4621, + "MATH Level 5": 0.0778, + "GPQA": 0.3045, + "MUSR": 0.4241, + "MMLU-PRO": 0.3299 + } + }, + { + "model_id": "Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Appended", + "name": "OpenChat-3.5-0106_8.99B_40Layers-Appended", + "developer": "Pretergeek", + "scores": { + "IFEval": 0.5961, + "BBH": 0.462, + "MATH Level 5": 0.0793, + "GPQA": 0.307, + "MUSR": 0.4254, + "MMLU-PRO": 0.329 + } + }, + { + "model_id": "Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Interleaved", + "name": "OpenChat-3.5-0106_8.99B_40Layers-Interleaved", + "developer": "Pretergeek", + "scores": { + "IFEval": 0.5976, + "BBH": 0.4621, + "MATH Level 5": 0.0778, + "GPQA": 0.3045, + "MUSR": 0.4241, + "MMLU-PRO": 0.3299 + } + }, + { + "model_id": "Pretergeek/OpenChat-3.5-0106_9.86B_44Layers-Appended", + "name": "OpenChat-3.5-0106_9.86B_44Layers-Appended", + "developer": "Pretergeek", + "scores": { + "IFEval": 0.5961, + "BBH": 0.462, + "MATH Level 5": 0.0793, + "GPQA": 0.307, + "MUSR": 0.4254, + "MMLU-PRO": 0.329 + } + }, + { + "model_id": "Pretergeek/openchat-3.5-0106_Rebased_Mistral-7B-v0.2", + "name": "openchat-3.5-0106_Rebased_Mistral-7B-v0.2", + "developer": "Pretergeek", + "scores": { + "IFEval": 0.3706, + "BBH": 0.3627, + "MATH Level 5": 0.0453, + "GPQA": 0.2718, + "MUSR": 0.484, + "MMLU-PRO": 0.283 + } + }, + { + "model_id": "PrimeIntellect/INTELLECT-1", + "name": "INTELLECT-1", + "developer": "PrimeIntellect", + "scores": { + "IFEval": 0.1757, + "BBH": 0.274, + "MATH Level 5": 0.0, + "GPQA": 0.25, + "MUSR": 0.3753, + "MMLU-PRO": 0.112 + } + }, + { + "model_id": "PrimeIntellect/INTELLECT-1-Instruct", + "name": "INTELLECT-1-Instruct", + "developer": "PrimeIntellect", + "scores": { + "IFEval": 0.0, + "BBH": 0.287, + "MATH Level 5": 0.0227, + "GPQA": 0.2483, + "MUSR": 0.3577, + "MMLU-PRO": 0.1064 + } + }, + { + "model_id": "PuxAI/LUA_model", + "name": "LUA_model", + "developer": "PuxAI", + "scores": { + "IFEval": 0.2282, + "BBH": 0.2877, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.3484, + "MMLU-PRO": 0.1123 + } + }, + { + "model_id": "PygmalionAI/pygmalion-6b", + "name": "pygmalion-6b", + "developer": "PygmalionAI", + "scores": { + "IFEval": 0.2091, + "BBH": 0.3199, + "MATH Level 5": 0.0083, + "GPQA": 0.2492, + "MUSR": 0.3684, + "MMLU-PRO": 0.1184 + } + }, + { + "model_id": "Q-bert/MetaMath-1B", + "name": "MetaMath-1B", + "developer": "Q-bert", + "scores": { + "IFEval": 0.53, + "BBH": 0.3451, + "MATH Level 5": 0.0627, + "GPQA": 0.2517, + "MUSR": 0.3289, + "MMLU-PRO": 0.1495 + } + }, + { + "model_id": "Quazim0t0/1up-14b", + "name": "1up-14b", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6888, + "BBH": 0.6921, + "MATH Level 5": 0.4162, + "GPQA": 0.3624, + "MUSR": 0.4583, + "MMLU-PRO": 0.5406 + } + }, + { + "model_id": "Quazim0t0/Adamant-14B-sce", + "name": "Adamant-14B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6858, + "BBH": 0.6859, + "MATH Level 5": 0.3988, + "GPQA": 0.3507, + "MUSR": 0.4558, + "MMLU-PRO": 0.5372 + } + }, + { + "model_id": "Quazim0t0/Alice-14B", + "name": "Alice-14B", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6836, + "BBH": 0.6938, + "MATH Level 5": 0.4569, + "GPQA": 0.3515, + "MUSR": 0.4479, + "MMLU-PRO": 0.5419 + } + }, + { + "model_id": "Quazim0t0/Alien-CoT-14B-sce", + "name": "Alien-CoT-14B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.0749, + "BBH": 0.6395, + "MATH Level 5": 0.5204, + "GPQA": 0.3918, + "MUSR": 0.4785, + "MMLU-PRO": 0.517 + } + }, + { + "model_id": "Quazim0t0/Aura-8B-Linear", + "name": "Aura-8B-Linear", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.7948, + "BBH": 0.5074, + "MATH Level 5": 0.1805, + "GPQA": 0.2693, + "MUSR": 0.3687, + "MMLU-PRO": 0.3801 + } + }, + { + "model_id": "Quazim0t0/Casa-14b-sce", + "name": "Casa-14b-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6654, + "BBH": 0.6901, + "MATH Level 5": 0.4698, + "GPQA": 0.3331, + "MUSR": 0.431, + "MMLU-PRO": 0.5426 + } + }, + { + "model_id": "Quazim0t0/Charlie-8B-Linear", + "name": "Charlie-8B-Linear", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.7381, + "BBH": 0.5141, + "MATH Level 5": 0.2651, + "GPQA": 0.271, + "MUSR": 0.3485, + "MMLU-PRO": 0.3573 + } + }, + { + "model_id": "Quazim0t0/Chromatic-8b-sce", + "name": "Chromatic-8b-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.5085, + "BBH": 0.5063, + "MATH Level 5": 0.1556, + "GPQA": 0.3196, + "MUSR": 0.4051, + "MMLU-PRO": 0.3755 + } + }, + { + "model_id": "Quazim0t0/CoT_Phi", + "name": "CoT_Phi", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6159, + "BBH": 0.6751, + "MATH Level 5": 0.3308, + "GPQA": 0.3582, + "MUSR": 0.4244, + "MMLU-PRO": 0.4901 + } + }, + { + "model_id": "Quazim0t0/Dyson-14b", + "name": "Dyson-14b", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.5857, + "BBH": 0.6863, + "MATH Level 5": 0.5393, + "GPQA": 0.3138, + "MUSR": 0.4259, + "MMLU-PRO": 0.5399 + } + }, + { + "model_id": "Quazim0t0/Edu-14B-Linear", + "name": "Edu-14B-Linear", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6158, + "BBH": 0.6758, + "MATH Level 5": 0.2447, + "GPQA": 0.3171, + "MUSR": 0.4378, + "MMLU-PRO": 0.5086 + } + }, + { + "model_id": "Quazim0t0/Fugazi14b", + "name": "Fugazi14b", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6998, + "BBH": 0.6941, + "MATH Level 5": 0.4653, + "GPQA": 0.3515, + "MUSR": 0.4546, + "MMLU-PRO": 0.5417 + } + }, + { + "model_id": "Quazim0t0/GZA-14B-sce", + "name": "GZA-14B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6274, + "BBH": 0.6687, + "MATH Level 5": 0.4721, + "GPQA": 0.302, + "MUSR": 0.4285, + "MMLU-PRO": 0.5232 + } + }, + { + "model_id": "Quazim0t0/Geedorah-14B", + "name": "Geedorah-14B", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6873, + "BBH": 0.6964, + "MATH Level 5": 0.4449, + "GPQA": 0.3473, + "MUSR": 0.4547, + "MMLU-PRO": 0.5421 + } + }, + { + "model_id": "Quazim0t0/GivingTree-8b-sce", + "name": "GivingTree-8b-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.5006, + "BBH": 0.504, + "MATH Level 5": 0.1526, + "GPQA": 0.3221, + "MUSR": 0.4051, + "MMLU-PRO": 0.3761 + } + }, + { + "model_id": "Quazim0t0/GuiltySpark-14B-ties", + "name": "GuiltySpark-14B-ties", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6854, + "BBH": 0.6914, + "MATH Level 5": 0.3837, + "GPQA": 0.3649, + "MUSR": 0.4557, + "MMLU-PRO": 0.54 + } + }, + { + "model_id": "Quazim0t0/Halo-14B-sce", + "name": "Halo-14B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6754, + "BBH": 0.6876, + "MATH Level 5": 0.429, + "GPQA": 0.3473, + "MUSR": 0.4401, + "MMLU-PRO": 0.5376 + } + }, + { + "model_id": "Quazim0t0/Heretic1.5b", + "name": "Heretic1.5b", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.2062, + "BBH": 0.3529, + "MATH Level 5": 0.244, + "GPQA": 0.2685, + "MUSR": 0.3511, + "MMLU-PRO": 0.1728 + } + }, + { + "model_id": "Quazim0t0/Hyde-14b-sce", + "name": "Hyde-14b-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6715, + "BBH": 0.6885, + "MATH Level 5": 0.2734, + "GPQA": 0.3414, + "MUSR": 0.4141, + "MMLU-PRO": 0.53 + } + }, + { + "model_id": "Quazim0t0/Imagine-v0.5-16bit", + "name": "Imagine-v0.5-16bit", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.2759, + "BBH": 0.6769, + "MATH Level 5": 0.1397, + "GPQA": 0.3649, + "MUSR": 0.4349, + "MMLU-PRO": 0.5354 + } + }, + { + "model_id": "Quazim0t0/Imbue-14b", + "name": "Imbue-14b", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.52, + "BBH": 0.6845, + "MATH Level 5": 0.5317, + "GPQA": 0.3129, + "MUSR": 0.4167, + "MMLU-PRO": 0.5402 + } + }, + { + "model_id": "Quazim0t0/Insom", + "name": "Insom", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6818, + "BBH": 0.6881, + "MATH Level 5": 0.3852, + "GPQA": 0.3498, + "MUSR": 0.4311, + "MMLU-PRO": 0.5352 + } + }, + { + "model_id": "Quazim0t0/InspectorDeck-14B-sce", + "name": "InspectorDeck-14B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.3241, + "BBH": 0.6668, + "MATH Level 5": 0.3165, + "GPQA": 0.297, + "MUSR": 0.3982, + "MMLU-PRO": 0.5261 + } + }, + { + "model_id": "Quazim0t0/Jekyl-8b-sce", + "name": "Jekyl-8b-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.4697, + "BBH": 0.4994, + "MATH Level 5": 0.1616, + "GPQA": 0.3381, + "MUSR": 0.4197, + "MMLU-PRO": 0.3686 + } + }, + { + "model_id": "Quazim0t0/Jigsaw-14B-Linear", + "name": "Jigsaw-14B-Linear", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.648, + "BBH": 0.6865, + "MATH Level 5": 0.2651, + "GPQA": 0.3406, + "MUSR": 0.4483, + "MMLU-PRO": 0.5234 + } + }, + { + "model_id": "Quazim0t0/Katana-8b-sce", + "name": "Katana-8b-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.5107, + "BBH": 0.5075, + "MATH Level 5": 0.1511, + "GPQA": 0.3247, + "MUSR": 0.4038, + "MMLU-PRO": 0.3771 + } + }, + { + "model_id": "Quazim0t0/Knot-CoT-14B-sce", + "name": "Knot-CoT-14B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.4832, + "BBH": 0.6616, + "MATH Level 5": 0.3995, + "GPQA": 0.2936, + "MUSR": 0.414, + "MMLU-PRO": 0.5154 + } + }, + { + "model_id": "Quazim0t0/Lineage-14B", + "name": "Lineage-14B", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.707, + "BBH": 0.6934, + "MATH Level 5": 0.4245, + "GPQA": 0.3599, + "MUSR": 0.4597, + "MMLU-PRO": 0.5411 + } + }, + { + "model_id": "Quazim0t0/Lo-Phi-14b", + "name": "Lo-Phi-14b", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.4941, + "BBH": 0.6852, + "MATH Level 5": 0.5196, + "GPQA": 0.328, + "MUSR": 0.4232, + "MMLU-PRO": 0.5369 + } + }, + { + "model_id": "Quazim0t0/Loke-14B-sce", + "name": "Loke-14B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6848, + "BBH": 0.6924, + "MATH Level 5": 0.3905, + "GPQA": 0.3649, + "MUSR": 0.4637, + "MMLU-PRO": 0.5401 + } + }, + { + "model_id": "Quazim0t0/MFDOOM-14B", + "name": "MFDOOM-14B", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6736, + "BBH": 0.6916, + "MATH Level 5": 0.5264, + "GPQA": 0.323, + "MUSR": 0.4377, + "MMLU-PRO": 0.5426 + } + }, + { + "model_id": "Quazim0t0/MFGRIMM-14B", + "name": "MFGRIMM-14B", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6894, + "BBH": 0.6909, + "MATH Level 5": 0.506, + "GPQA": 0.3339, + "MUSR": 0.4361, + "MMLU-PRO": 0.5416 + } + }, + { + "model_id": "Quazim0t0/Math_Phi4_Reason", + "name": "Math_Phi4_Reason", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.322, + "BBH": 0.624, + "MATH Level 5": 0.3278, + "GPQA": 0.2903, + "MUSR": 0.4034, + "MMLU-PRO": 0.503 + } + }, + { + "model_id": "Quazim0t0/Mithril-14B-sce", + "name": "Mithril-14B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6958, + "BBH": 0.6926, + "MATH Level 5": 0.3822, + "GPQA": 0.3691, + "MUSR": 0.4611, + "MMLU-PRO": 0.5403 + } + }, + { + "model_id": "Quazim0t0/Mononoke-14B-sce", + "name": "Mononoke-14B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.3502, + "BBH": 0.6744, + "MATH Level 5": 0.4698, + "GPQA": 0.323, + "MUSR": 0.4155, + "MMLU-PRO": 0.5298 + } + }, + { + "model_id": "Quazim0t0/Motion-8B-Linear", + "name": "Motion-8B-Linear", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.7686, + "BBH": 0.5084, + "MATH Level 5": 0.1888, + "GPQA": 0.271, + "MUSR": 0.3606, + "MMLU-PRO": 0.3785 + } + }, + { + "model_id": "Quazim0t0/Mouse-9B", + "name": "Mouse-9B", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.1325, + "BBH": 0.2979, + "MATH Level 5": 0.0053, + "GPQA": 0.2542, + "MUSR": 0.347, + "MMLU-PRO": 0.1139 + } + }, + { + "model_id": "Quazim0t0/Nova-14b-sce", + "name": "Nova-14b-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.7022, + "BBH": 0.6935, + "MATH Level 5": 0.4162, + "GPQA": 0.3633, + "MUSR": 0.4571, + "MMLU-PRO": 0.5413 + } + }, + { + "model_id": "Quazim0t0/NovaScotia-14b-stock", + "name": "NovaScotia-14b-stock", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6787, + "BBH": 0.6935, + "MATH Level 5": 0.463, + "GPQA": 0.349, + "MUSR": 0.4493, + "MMLU-PRO": 0.5409 + } + }, + { + "model_id": "Quazim0t0/ODB-14B-sce", + "name": "ODB-14B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.2922, + "BBH": 0.6559, + "MATH Level 5": 0.2545, + "GPQA": 0.2659, + "MUSR": 0.3929, + "MMLU-PRO": 0.5207 + } + }, + { + "model_id": "Quazim0t0/ODB-14b-sce", + "name": "ODB-14b-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.7016, + "BBH": 0.6942, + "MATH Level 5": 0.4116, + "GPQA": 0.3624, + "MUSR": 0.4571, + "MMLU-PRO": 0.5411 + } + }, + { + "model_id": "Quazim0t0/Oasis-14B-ties", + "name": "Oasis-14B-ties", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6937, + "BBH": 0.6915, + "MATH Level 5": 0.3754, + "GPQA": 0.3649, + "MUSR": 0.4571, + "MMLU-PRO": 0.5405 + } + }, + { + "model_id": "Quazim0t0/Origami-14B-sce", + "name": "Origami-14B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.3259, + "BBH": 0.662, + "MATH Level 5": 0.2915, + "GPQA": 0.2836, + "MUSR": 0.4035, + "MMLU-PRO": 0.5244 + } + }, + { + "model_id": "Quazim0t0/Phi4.Turn.R1Distill.16bit", + "name": "Phi4.Turn.R1Distill.16bit", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.3126, + "BBH": 0.6563, + "MATH Level 5": 0.2311, + "GPQA": 0.2945, + "MUSR": 0.3902, + "MMLU-PRO": 0.5257 + } + }, + { + "model_id": "Quazim0t0/Phi4.Turn.R1Distill_v1.5.1-Tensors", + "name": "Phi4.Turn.R1Distill_v1.5.1-Tensors", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.2995, + "BBH": 0.6456, + "MATH Level 5": 0.219, + "GPQA": 0.2685, + "MUSR": 0.3929, + "MMLU-PRO": 0.5117 + } + }, + { + "model_id": "Quazim0t0/Phi4Basis-14B-sce", + "name": "Phi4Basis-14B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6502, + "BBH": 0.6909, + "MATH Level 5": 0.4789, + "GPQA": 0.3289, + "MUSR": 0.4338, + "MMLU-PRO": 0.539 + } + }, + { + "model_id": "Quazim0t0/Ponder-14B-linear", + "name": "Ponder-14B-linear", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6906, + "BBH": 0.6943, + "MATH Level 5": 0.4282, + "GPQA": 0.3582, + "MUSR": 0.4558, + "MMLU-PRO": 0.5408 + } + }, + { + "model_id": "Quazim0t0/RZA-14B-sce", + "name": "RZA-14B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.4774, + "BBH": 0.6686, + "MATH Level 5": 0.5189, + "GPQA": 0.2903, + "MUSR": 0.4113, + "MMLU-PRO": 0.5383 + } + }, + { + "model_id": "Quazim0t0/Rosemary-14b", + "name": "Rosemary-14b", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6915, + "BBH": 0.6955, + "MATH Level 5": 0.4388, + "GPQA": 0.3565, + "MUSR": 0.4492, + "MMLU-PRO": 0.5396 + } + }, + { + "model_id": "Quazim0t0/Rune-14b", + "name": "Rune-14b", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.7016, + "BBH": 0.6937, + "MATH Level 5": 0.4585, + "GPQA": 0.3515, + "MUSR": 0.4533, + "MMLU-PRO": 0.5411 + } + }, + { + "model_id": "Quazim0t0/SZA-14B-sce", + "name": "SZA-14B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.5659, + "BBH": 0.6889, + "MATH Level 5": 0.5242, + "GPQA": 0.3305, + "MUSR": 0.4339, + "MMLU-PRO": 0.5353 + } + }, + { + "model_id": "Quazim0t0/Sake-20b", + "name": "Sake-20b", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6693, + "BBH": 0.677, + "MATH Level 5": 0.4653, + "GPQA": 0.3188, + "MUSR": 0.4494, + "MMLU-PRO": 0.5391 + } + }, + { + "model_id": "Quazim0t0/Spok-14b-sce", + "name": "Spok-14b-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6682, + "BBH": 0.6899, + "MATH Level 5": 0.2719, + "GPQA": 0.3456, + "MUSR": 0.4141, + "MMLU-PRO": 0.5298 + } + }, + { + "model_id": "Quazim0t0/Sumatra-20b", + "name": "Sumatra-20b", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6738, + "BBH": 0.6855, + "MATH Level 5": 0.3671, + "GPQA": 0.3263, + "MUSR": 0.456, + "MMLU-PRO": 0.5415 + } + }, + { + "model_id": "Quazim0t0/SuperNova14b", + "name": "SuperNova14b", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.7076, + "BBH": 0.6937, + "MATH Level 5": 0.4396, + "GPQA": 0.3523, + "MUSR": 0.4545, + "MMLU-PRO": 0.5435 + } + }, + { + "model_id": "Quazim0t0/TB0-8B-sce", + "name": "TB0-8B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.5107, + "BBH": 0.5075, + "MATH Level 5": 0.1511, + "GPQA": 0.3247, + "MUSR": 0.4038, + "MMLU-PRO": 0.3771 + } + }, + { + "model_id": "Quazim0t0/TBL-8B-sce", + "name": "TBL-8B-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.4581, + "BBH": 0.5008, + "MATH Level 5": 0.1533, + "GPQA": 0.3339, + "MUSR": 0.4236, + "MMLU-PRO": 0.3689 + } + }, + { + "model_id": "Quazim0t0/ThinkPhi1.1-Tensors", + "name": "ThinkPhi1.1-Tensors", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.3908, + "BBH": 0.6449, + "MATH Level 5": 0.182, + "GPQA": 0.2987, + "MUSR": 0.418, + "MMLU-PRO": 0.4908 + } + }, + { + "model_id": "Quazim0t0/Venti-20b", + "name": "Venti-20b", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6641, + "BBH": 0.6901, + "MATH Level 5": 0.3391, + "GPQA": 0.3322, + "MUSR": 0.448, + "MMLU-PRO": 0.5386 + } + }, + { + "model_id": "Quazim0t0/Venti-Blend-sce", + "name": "Venti-Blend-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6879, + "BBH": 0.6843, + "MATH Level 5": 0.4056, + "GPQA": 0.3163, + "MUSR": 0.4389, + "MMLU-PRO": 0.5414 + } + }, + { + "model_id": "Quazim0t0/Vine-14b-sce", + "name": "Vine-14b-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6733, + "BBH": 0.6891, + "MATH Level 5": 0.5008, + "GPQA": 0.3339, + "MUSR": 0.4323, + "MMLU-PRO": 0.5408 + } + }, + { + "model_id": "Quazim0t0/Wendy-14B", + "name": "Wendy-14B", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6772, + "BBH": 0.6958, + "MATH Level 5": 0.4834, + "GPQA": 0.3322, + "MUSR": 0.4428, + "MMLU-PRO": 0.5435 + } + }, + { + "model_id": "Quazim0t0/Wu-14b-sce", + "name": "Wu-14b-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6718, + "BBH": 0.6885, + "MATH Level 5": 0.2613, + "GPQA": 0.3465, + "MUSR": 0.4114, + "MMLU-PRO": 0.5293 + } + }, + { + "model_id": "Quazim0t0/bloom-14b-stock", + "name": "bloom-14b-stock", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6575, + "BBH": 0.6878, + "MATH Level 5": 0.4811, + "GPQA": 0.3314, + "MUSR": 0.431, + "MMLU-PRO": 0.5373 + } + }, + { + "model_id": "Quazim0t0/caramel-14B", + "name": "caramel-14B", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6745, + "BBH": 0.6919, + "MATH Level 5": 0.4713, + "GPQA": 0.3448, + "MUSR": 0.4454, + "MMLU-PRO": 0.5436 + } + }, + { + "model_id": "Quazim0t0/graphite-14b-sce", + "name": "graphite-14b-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.3217, + "BBH": 0.6631, + "MATH Level 5": 0.3006, + "GPQA": 0.2894, + "MUSR": 0.3981, + "MMLU-PRO": 0.528 + } + }, + { + "model_id": "Quazim0t0/mocha-14B", + "name": "mocha-14B", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.5893, + "BBH": 0.6895, + "MATH Level 5": 0.5264, + "GPQA": 0.3305, + "MUSR": 0.4272, + "MMLU-PRO": 0.5384 + } + }, + { + "model_id": "Quazim0t0/mosaic-14b-sce", + "name": "mosaic-14b-sce", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6876, + "BBH": 0.6907, + "MATH Level 5": 0.4026, + "GPQA": 0.3624, + "MUSR": 0.4558, + "MMLU-PRO": 0.5396 + } + }, + { + "model_id": "Quazim0t0/tesseract-14b-stock", + "name": "tesseract-14b-stock", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.5848, + "BBH": 0.688, + "MATH Level 5": 0.5144, + "GPQA": 0.3272, + "MUSR": 0.4232, + "MMLU-PRO": 0.5389 + } + }, + { + "model_id": "Quazim0t0/time-14b-stock", + "name": "time-14b-stock", + "developer": "Quazim0t0", + "scores": { + "IFEval": 0.6699, + "BBH": 0.6897, + "MATH Level 5": 0.5083, + "GPQA": 0.3347, + "MUSR": 0.4323, + "MMLU-PRO": 0.5419 + } + }, + { + "model_id": "Qwen/QwQ-32B", + "name": "QwQ-32B", + "developer": "Qwen", + "scores": { + "IFEval": 0.3977, + "BBH": 0.2983, + "MATH Level 5": 0.1609, + "GPQA": 0.2601, + "MUSR": 0.4206, + "MMLU-PRO": 0.1196 + } + }, + { + "model_id": "Qwen/QwQ-32B-Preview", + "name": "QwQ-32B-Preview", + "developer": "Qwen", + "scores": { + "IFEval": 0.4035, + "BBH": 0.6691, + "MATH Level 5": 0.4494, + "GPQA": 0.2819, + "MUSR": 0.411, + "MMLU-PRO": 0.5678 + } + }, + { + "model_id": "Qwen/Qwen1.5-0.5B", + "name": "Qwen1.5-0.5B", + "developer": "Qwen", + "scores": { + "IFEval": 0.1706, + "BBH": 0.3154, + "MATH Level 5": 0.0174, + "GPQA": 0.2542, + "MUSR": 0.3616, + "MMLU-PRO": 0.1307 + } + }, + { + "model_id": "Qwen/Qwen1.5-0.5B-Chat", + "name": "Qwen1.5-0.5B-Chat", + "developer": "Qwen", + "scores": { + "IFEval": 0.1807, + "BBH": 0.3167, + "MATH Level 5": 0.0068, + "GPQA": 0.2693, + "MUSR": 0.3837, + "MMLU-PRO": 0.1213 + } + }, + { + "model_id": "Qwen/Qwen1.5-1.8B", + "name": "Qwen1.5-1.8B", + "developer": "Qwen", + "scores": { + "IFEval": 0.2154, + "BBH": 0.3476, + "MATH Level 5": 0.0317, + "GPQA": 0.3054, + "MUSR": 0.3605, + "MMLU-PRO": 0.1882 + } + }, + { + "model_id": "Qwen/Qwen1.5-1.8B-Chat", + "name": "Qwen1.5-1.8B-Chat", + "developer": "Qwen", + "scores": { + "IFEval": 0.2019, + "BBH": 0.3256, + "MATH Level 5": 0.0196, + "GPQA": 0.2978, + "MUSR": 0.426, + "MMLU-PRO": 0.1804 + } + }, + { + "model_id": "Qwen/Qwen1.5-110B", + "name": "Qwen1.5-110B", + "developer": "Qwen", + "scores": { + "IFEval": 0.3422, + "BBH": 0.61, + "MATH Level 5": 0.247, + "GPQA": 0.3523, + "MUSR": 0.4408, + "MMLU-PRO": 0.5361 + } + }, + { + "model_id": "Qwen/Qwen1.5-110B-Chat", + "name": "Qwen1.5-110B-Chat", + "developer": "Qwen", + "scores": { + "IFEval": 0.5939, + "BBH": 0.6184, + "MATH Level 5": 0.2341, + "GPQA": 0.3414, + "MUSR": 0.4522, + "MMLU-PRO": 0.4825 + } + }, + { + "model_id": "Qwen/Qwen1.5-14B", + "name": "Qwen1.5-14B", + "developer": "Qwen", + "scores": { + "IFEval": 0.2905, + "BBH": 0.508, + "MATH Level 5": 0.2024, + "GPQA": 0.2945, + "MUSR": 0.4186, + "MMLU-PRO": 0.3644 + } + }, + { + "model_id": "Qwen/Qwen1.5-14B-Chat", + "name": "Qwen1.5-14B-Chat", + "developer": "Qwen", + "scores": { + "IFEval": 0.4768, + "BBH": 0.5229, + "MATH Level 5": 0.1526, + "GPQA": 0.2701, + "MUSR": 0.44, + "MMLU-PRO": 0.3618 + } + }, + { + "model_id": "Qwen/Qwen1.5-32B", + "name": "Qwen1.5-32B", + "developer": "Qwen", + "scores": { + "IFEval": 0.3297, + "BBH": 0.5715, + "MATH Level 5": 0.3029, + "GPQA": 0.3297, + "MUSR": 0.4278, + "MMLU-PRO": 0.45 + } + }, + { + "model_id": "Qwen/Qwen1.5-32B-Chat", + "name": "Qwen1.5-32B-Chat", + "developer": "Qwen", + "scores": { + "IFEval": 0.5532, + "BBH": 0.6067, + "MATH Level 5": 0.1956, + "GPQA": 0.3062, + "MUSR": 0.416, + "MMLU-PRO": 0.4457 + } + }, + { + "model_id": "Qwen/Qwen1.5-4B", + "name": "Qwen1.5-4B", + "developer": "Qwen", + "scores": { + "IFEval": 0.2445, + "BBH": 0.4054, + "MATH Level 5": 0.0529, + "GPQA": 0.2768, + "MUSR": 0.3604, + "MMLU-PRO": 0.246 + } + }, + { + "model_id": "Qwen/Qwen1.5-4B-Chat", + "name": "Qwen1.5-4B-Chat", + "developer": "Qwen", + "scores": { + "IFEval": 0.3157, + "BBH": 0.4006, + "MATH Level 5": 0.0279, + "GPQA": 0.2668, + "MUSR": 0.3978, + "MMLU-PRO": 0.2396 + } + }, + { + "model_id": "Qwen/Qwen1.5-7B", + "name": "Qwen1.5-7B", + "developer": "Qwen", + "scores": { + "IFEval": 0.2684, + "BBH": 0.456, + "MATH Level 5": 0.0929, + "GPQA": 0.2987, + "MUSR": 0.4103, + "MMLU-PRO": 0.2916 + } + }, + { + "model_id": "Qwen/Qwen1.5-7B-Chat", + "name": "Qwen1.5-7B-Chat", + "developer": "Qwen", + "scores": { + "IFEval": 0.4371, + "BBH": 0.451, + "MATH Level 5": 0.0627, + "GPQA": 0.3029, + "MUSR": 0.3779, + "MMLU-PRO": 0.2951 + } + }, + { + "model_id": "Qwen/Qwen1.5-MoE-A2.7B", + "name": "Qwen1.5-MoE-A2.7B", + "developer": "Qwen", + "scores": { + "IFEval": 0.266, + "BBH": 0.4114, + "MATH Level 5": 0.0929, + "GPQA": 0.2592, + "MUSR": 0.4013, + "MMLU-PRO": 0.2778 + } + }, + { + "model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat", + "name": "Qwen1.5-MoE-A2.7B-Chat", + "developer": "Qwen", + "scores": { + "IFEval": 0.3795, + "BBH": 0.4272, + "MATH Level 5": 0.0634, + "GPQA": 0.2743, + "MUSR": 0.3899, + "MMLU-PRO": 0.2923 + } + }, + { + "model_id": "Qwen/Qwen2-0.5B", + "name": "Qwen2-0.5B", + "developer": "Qwen", + "scores": { + "IFEval": 0.1873, + "BBH": 0.3239, + "MATH Level 5": 0.0264, + "GPQA": 0.2609, + "MUSR": 0.3752, + "MMLU-PRO": 0.172 + } + }, + { + "model_id": "Qwen/Qwen2-0.5B-Instruct", + "name": "Qwen2-0.5B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.2247, + "BBH": 0.3173, + "MATH Level 5": 0.0287, + "GPQA": 0.2466, + "MUSR": 0.3353, + "MMLU-PRO": 0.1531 + } + }, + { + "model_id": "Qwen/Qwen2-1.5B", + "name": "Qwen2-1.5B", + "developer": "Qwen", + "scores": { + "IFEval": 0.2113, + "BBH": 0.3575, + "MATH Level 5": 0.0702, + "GPQA": 0.2643, + "MUSR": 0.3658, + "MMLU-PRO": 0.2552 + } + }, + { + "model_id": "Qwen/Qwen2-1.5B-Instruct", + "name": "Qwen2-1.5B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.3371, + "BBH": 0.3852, + "MATH Level 5": 0.0718, + "GPQA": 0.2617, + "MUSR": 0.4293, + "MMLU-PRO": 0.2501 + } + }, + { + "model_id": "Qwen/Qwen2-57B-A14B", + "name": "Qwen2-57B-A14B", + "developer": "Qwen", + "scores": { + "IFEval": 0.3113, + "BBH": 0.5618, + "MATH Level 5": 0.1866, + "GPQA": 0.3062, + "MUSR": 0.4174, + "MMLU-PRO": 0.4916 + } + }, + { + "model_id": "Qwen/Qwen2-57B-A14B-Instruct", + "name": "Qwen2-57B-A14B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.6338, + "BBH": 0.5888, + "MATH Level 5": 0.2817, + "GPQA": 0.3314, + "MUSR": 0.4361, + "MMLU-PRO": 0.4575 + } + }, + { + "model_id": "Qwen/Qwen2-72B", + "name": "Qwen2-72B", + "developer": "Qwen", + "scores": { + "IFEval": 0.3824, + "BBH": 0.6617, + "MATH Level 5": 0.3112, + "GPQA": 0.3943, + "MUSR": 0.4704, + "MMLU-PRO": 0.5731 + } + }, + { + "model_id": "Qwen/Qwen2-72B-Instruct", + "name": "Qwen2-72B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.7989, + "BBH": 0.6977, + "MATH Level 5": 0.4177, + "GPQA": 0.3725, + "MUSR": 0.456, + "MMLU-PRO": 0.5403 + } + }, + { + "model_id": "Qwen/Qwen2-7B", + "name": "Qwen2-7B", + "developer": "Qwen", + "scores": { + "IFEval": 0.3149, + "BBH": 0.5315, + "MATH Level 5": 0.2039, + "GPQA": 0.3045, + "MUSR": 0.4439, + "MMLU-PRO": 0.4183 + } + }, + { + "model_id": "Qwen/Qwen2-7B-Instruct", + "name": "Qwen2-7B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.5679, + "BBH": 0.5545, + "MATH Level 5": 0.2764, + "GPQA": 0.2978, + "MUSR": 0.3928, + "MMLU-PRO": 0.3847 + } + }, + { + "model_id": "Qwen/Qwen2-Math-72B-Instruct", + "name": "Qwen2-Math-72B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.5694, + "BBH": 0.6343, + "MATH Level 5": 0.5536, + "GPQA": 0.3683, + "MUSR": 0.4517, + "MMLU-PRO": 0.4273 + } + }, + { + "model_id": "Qwen/Qwen2-Math-7B", + "name": "Qwen2-Math-7B", + "developer": "Qwen", + "scores": { + "IFEval": 0.2687, + "BBH": 0.387, + "MATH Level 5": 0.2477, + "GPQA": 0.2634, + "MUSR": 0.3593, + "MMLU-PRO": 0.1197 + } + }, + { + "model_id": "Qwen/Qwen2-VL-72B-Instruct", + "name": "Qwen2-VL-72B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.5982, + "BBH": 0.6946, + "MATH Level 5": 0.3444, + "GPQA": 0.3876, + "MUSR": 0.4492, + "MMLU-PRO": 0.5717 + } + }, + { + "model_id": "Qwen/Qwen2-VL-7B-Instruct", + "name": "Qwen2-VL-7B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.4599, + "BBH": 0.5465, + "MATH Level 5": 0.1986, + "GPQA": 0.3196, + "MUSR": 0.4375, + "MMLU-PRO": 0.4095 + } + }, + { + "model_id": "Qwen/Qwen2.5-0.5B", + "name": "Qwen2.5-0.5B", + "developer": "Qwen", + "scores": { + "IFEval": 0.1627, + "BBH": 0.3275, + "MATH Level 5": 0.0393, + "GPQA": 0.2466, + "MUSR": 0.3433, + "MMLU-PRO": 0.1906 + } + }, + { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "name": "Qwen2.5-0.5B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.3153, + "BBH": 0.3322, + "MATH Level 5": 0.1035, + "GPQA": 0.2592, + "MUSR": 0.3342, + "MMLU-PRO": 0.172 + } + }, + { + "model_id": "Qwen/Qwen2.5-1.5B", + "name": "Qwen2.5-1.5B", + "developer": "Qwen", + "scores": { + "IFEval": 0.2674, + "BBH": 0.4078, + "MATH Level 5": 0.0914, + "GPQA": 0.2852, + "MUSR": 0.3576, + "MMLU-PRO": 0.2855 + } + }, + { + "model_id": "Qwen/Qwen2.5-1.5B-Instruct", + "name": "Qwen2.5-1.5B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.4476, + "BBH": 0.4289, + "MATH Level 5": 0.2205, + "GPQA": 0.2559, + "MUSR": 0.3663, + "MMLU-PRO": 0.2799 + } + }, + { + "model_id": "Qwen/Qwen2.5-14B", + "name": "Qwen2.5-14B", + "developer": "Qwen", + "scores": { + "IFEval": 0.3694, + "BBH": 0.6161, + "MATH Level 5": 0.29, + "GPQA": 0.3817, + "MUSR": 0.4502, + "MMLU-PRO": 0.5249 + } + }, + { + "model_id": "Qwen/Qwen2.5-14B-Instruct", + "name": "Qwen2.5-14B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.8158, + "BBH": 0.639, + "MATH Level 5": 0.5476, + "GPQA": 0.3221, + "MUSR": 0.4101, + "MMLU-PRO": 0.4904 + } + }, + { + "model_id": "Qwen/Qwen2.5-14B-Instruct-1M", + "name": "Qwen2.5-14B-Instruct-1M", + "developer": "Qwen", + "scores": { + "IFEval": 0.8414, + "BBH": 0.6198, + "MATH Level 5": 0.5302, + "GPQA": 0.3431, + "MUSR": 0.418, + "MMLU-PRO": 0.485 + } + }, + { + "model_id": "Qwen/Qwen2.5-32B", + "name": "Qwen2.5-32B", + "developer": "Qwen", + "scores": { + "IFEval": 0.4077, + "BBH": 0.6771, + "MATH Level 5": 0.3565, + "GPQA": 0.4119, + "MUSR": 0.4978, + "MMLU-PRO": 0.5805 + } + }, + { + "model_id": "Qwen/Qwen2.5-32B-Instruct", + "name": "Qwen2.5-32B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.8346, + "BBH": 0.6913, + "MATH Level 5": 0.6254, + "GPQA": 0.3381, + "MUSR": 0.4261, + "MMLU-PRO": 0.5667 + } + }, + { + "model_id": "Qwen/Qwen2.5-3B", + "name": "Qwen2.5-3B", + "developer": "Qwen", + "scores": { + "IFEval": 0.269, + "BBH": 0.4612, + "MATH Level 5": 0.148, + "GPQA": 0.2978, + "MUSR": 0.4303, + "MMLU-PRO": 0.3203 + } + }, + { + "model_id": "Qwen/Qwen2.5-3B-Instruct", + "name": "Qwen2.5-3B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.6475, + "BBH": 0.4693, + "MATH Level 5": 0.3678, + "GPQA": 0.2727, + "MUSR": 0.3968, + "MMLU-PRO": 0.3255 + } + }, + { + "model_id": "Qwen/Qwen2.5-72B", + "name": "Qwen2.5-72B", + "developer": "Qwen", + "scores": { + "IFEval": 0.4137, + "BBH": 0.6797, + "MATH Level 5": 0.3912, + "GPQA": 0.4052, + "MUSR": 0.4771, + "MMLU-PRO": 0.5968 + } + }, + { + "model_id": "Qwen/Qwen2.5-72B-Instruct", + "name": "Qwen2.5-72B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.8638, + "BBH": 0.7273, + "MATH Level 5": 0.5982, + "GPQA": 0.375, + "MUSR": 0.4206, + "MMLU-PRO": 0.5626 + } + }, + { + "model_id": "Qwen/Qwen2.5-7B", + "name": "Qwen2.5-7B", + "developer": "Qwen", + "scores": { + "IFEval": 0.3374, + "BBH": 0.5416, + "MATH Level 5": 0.2508, + "GPQA": 0.3247, + "MUSR": 0.4424, + "MMLU-PRO": 0.4365 + } + }, + { + "model_id": "Qwen/Qwen2.5-7B-Instruct", + "name": "Qwen2.5-7B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.7585, + "BBH": 0.5394, + "MATH Level 5": 0.5, + "GPQA": 0.2911, + "MUSR": 0.402, + "MMLU-PRO": 0.4287 + } + }, + { + "model_id": "Qwen/Qwen2.5-7B-Instruct-1M", + "name": "Qwen2.5-7B-Instruct-1M", + "developer": "Qwen", + "scores": { + "IFEval": 0.7448, + "BBH": 0.5404, + "MATH Level 5": 0.4335, + "GPQA": 0.2978, + "MUSR": 0.4087, + "MMLU-PRO": 0.3505 + } + }, + { + "model_id": "Qwen/Qwen2.5-Coder-14B", + "name": "Qwen2.5-Coder-14B", + "developer": "Qwen", + "scores": { + "IFEval": 0.3473, + "BBH": 0.5865, + "MATH Level 5": 0.2251, + "GPQA": 0.2928, + "MUSR": 0.3874, + "MMLU-PRO": 0.4521 + } + }, + { + "model_id": "Qwen/Qwen2.5-Coder-14B-Instruct", + "name": "Qwen2.5-Coder-14B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.6908, + "BBH": 0.614, + "MATH Level 5": 0.3248, + "GPQA": 0.3045, + "MUSR": 0.3915, + "MMLU-PRO": 0.3939 + } + }, + { + "model_id": "Qwen/Qwen2.5-Coder-32B", + "name": "Qwen2.5-Coder-32B", + "developer": "Qwen", + "scores": { + "IFEval": 0.4363, + "BBH": 0.6404, + "MATH Level 5": 0.3089, + "GPQA": 0.3465, + "MUSR": 0.4528, + "MMLU-PRO": 0.5303 + } + }, + { + "model_id": "Qwen/Qwen2.5-Coder-32B-Instruct", + "name": "Qwen2.5-Coder-32B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.7265, + "BBH": 0.6625, + "MATH Level 5": 0.4955, + "GPQA": 0.349, + "MUSR": 0.4386, + "MMLU-PRO": 0.4413 + } + }, + { + "model_id": "Qwen/Qwen2.5-Coder-7B", + "name": "Qwen2.5-Coder-7B", + "developer": "Qwen", + "scores": { + "IFEval": 0.3446, + "BBH": 0.4856, + "MATH Level 5": 0.1918, + "GPQA": 0.2592, + "MUSR": 0.3449, + "MMLU-PRO": 0.3679 + } + }, + { + "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct", + "name": "Qwen2.5-Coder-7B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.6147, + "BBH": 0.4999, + "MATH Level 5": 0.031, + "GPQA": 0.2936, + "MUSR": 0.4099, + "MMLU-PRO": 0.3354 + } + }, + { + "model_id": "Qwen/Qwen2.5-Math-1.5B-Instruct", + "name": "Qwen2.5-Math-1.5B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.1856, + "BBH": 0.3752, + "MATH Level 5": 0.2628, + "GPQA": 0.2651, + "MUSR": 0.3685, + "MMLU-PRO": 0.1801 + } + }, + { + "model_id": "Qwen/Qwen2.5-Math-72B-Instruct", + "name": "Qwen2.5-Math-72B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.4003, + "BBH": 0.6452, + "MATH Level 5": 0.6239, + "GPQA": 0.3314, + "MUSR": 0.4473, + "MMLU-PRO": 0.4812 + } + }, + { + "model_id": "Qwen/Qwen2.5-Math-7B", + "name": "Qwen2.5-Math-7B", + "developer": "Qwen", + "scores": { + "IFEval": 0.246, + "BBH": 0.4455, + "MATH Level 5": 0.3051, + "GPQA": 0.2936, + "MUSR": 0.3781, + "MMLU-PRO": 0.2718 + } + }, + { + "model_id": "Qwen/Qwen2.5-Math-7B-Instruct", + "name": "Qwen2.5-Math-7B-Instruct", + "developer": "Qwen", + "scores": { + "IFEval": 0.2636, + "BBH": 0.4388, + "MATH Level 5": 0.5808, + "GPQA": 0.2617, + "MUSR": 0.3647, + "MMLU-PRO": 0.282 + } + }, + { + "model_id": "RDson/WomboCombo-R1-Coder-14B-Preview", + "name": "WomboCombo-R1-Coder-14B-Preview", + "developer": "RDson", + "scores": { + "IFEval": 0.6286, + "BBH": 0.6392, + "MATH Level 5": 0.5989, + "GPQA": 0.3213, + "MUSR": 0.4844, + "MMLU-PRO": 0.5168 + } + }, + { + "model_id": "RESMPDEV/EVA-Qwen2.5-1.5B-FRFR", + "name": "EVA-Qwen2.5-1.5B-FRFR", + "developer": "RESMPDEV", + "scores": { + "IFEval": 0.3082, + "BBH": 0.3932, + "MATH Level 5": 0.1027, + "GPQA": 0.2794, + "MUSR": 0.3539, + "MMLU-PRO": 0.277 + } + }, + { + "model_id": "RESMPDEV/Qwen2-Wukong-0.5B", + "name": "Qwen2-Wukong-0.5B", + "developer": "RESMPDEV", + "scores": { + "IFEval": 0.1854, + "BBH": 0.3085, + "MATH Level 5": 0.0015, + "GPQA": 0.2366, + "MUSR": 0.3525, + "MMLU-PRO": 0.1327 + } + }, + { + "model_id": "RLHFlow/ArmoRM-Llama3-8B-v0.1", + "name": "ArmoRM-Llama3-8B-v0.1", + "developer": "RLHFlow", + "scores": { + "IFEval": 0.1897, + "BBH": 0.2876, + "MATH Level 5": 0.0, + "GPQA": 0.2492, + "MUSR": 0.3948, + "MMLU-PRO": 0.1078 + } + }, + { + "model_id": "RLHFlow/LLaMA3-iterative-DPO-final", + "name": "LLaMA3-iterative-DPO-final", + "developer": "RLHFlow", + "scores": { + "IFEval": 0.534, + "BBH": 0.5058, + "MATH Level 5": 0.0884, + "GPQA": 0.2836, + "MUSR": 0.3673, + "MMLU-PRO": 0.3257 + } + }, + { + "model_id": "RWKV/rwkv-raven-14b", + "name": "rwkv-raven-14b", + "developer": "RWKV", + "scores": { + "IFEval": 0.0768, + "BBH": 0.3307, + "MATH Level 5": 0.0045, + "GPQA": 0.229, + "MUSR": 0.3951, + "MMLU-PRO": 0.115 + } + }, + { + "model_id": "Rakuten/RakutenAI-2.0-mini-instruct", + "name": "RakutenAI-2.0-mini-instruct", + "developer": "Rakuten", + "scores": { + "IFEval": 0.6794, + "BBH": 0.2867, + "MATH Level 5": 0.0521, + "GPQA": 0.2668, + "MUSR": 0.3249, + "MMLU-PRO": 0.1118 + } + }, + { + "model_id": "Rakuten/RakutenAI-7B", + "name": "RakutenAI-7B", + "developer": "Rakuten", + "scores": { + "IFEval": 0.1556, + "BBH": 0.4315, + "MATH Level 5": 0.0196, + "GPQA": 0.2894, + "MUSR": 0.3738, + "MMLU-PRO": 0.2877 + } + }, + { + "model_id": "Rakuten/RakutenAI-7B-chat", + "name": "RakutenAI-7B-chat", + "developer": "Rakuten", + "scores": { + "IFEval": 0.2686, + "BBH": 0.4316, + "MATH Level 5": 0.0295, + "GPQA": 0.2567, + "MUSR": 0.379, + "MMLU-PRO": 0.2798 + } + }, + { + "model_id": "Replete-AI/L3-Pneuma-8B", + "name": "L3-Pneuma-8B", + "developer": "Replete-AI", + "scores": { + "IFEval": 0.2413, + "BBH": 0.4909, + "MATH Level 5": 0.0544, + "GPQA": 0.318, + "MUSR": 0.4105, + "MMLU-PRO": 0.3176 + } + }, + { + "model_id": "Replete-AI/L3.1-Pneuma-8B", + "name": "L3.1-Pneuma-8B", + "developer": "Replete-AI", + "scores": { + "IFEval": 0.7076, + "BBH": 0.505, + "MATH Level 5": 0.2198, + "GPQA": 0.3029, + "MUSR": 0.3871, + "MMLU-PRO": 0.3691 + } + }, + { + "model_id": "Replete-AI/Llama3-8B-Instruct-Replete-Adapted", + "name": "Llama3-8B-Instruct-Replete-Adapted", + "developer": "Replete-AI", + "scores": { + "IFEval": 0.6915, + "BBH": 0.487, + "MATH Level 5": 0.071, + "GPQA": 0.281, + "MUSR": 0.3634, + "MMLU-PRO": 0.3391 + } + }, + { + "model_id": "Replete-AI/Replete-Coder-Instruct-8b-Merged", + "name": "Replete-Coder-Instruct-8b-Merged", + "developer": "Replete-AI", + "scores": { + "IFEval": 0.5388, + "BBH": 0.4462, + "MATH Level 5": 0.0778, + "GPQA": 0.2693, + "MUSR": 0.366, + "MMLU-PRO": 0.1805 + } + }, + { + "model_id": "Replete-AI/Replete-Coder-Llama3-8B", + "name": "Replete-Coder-Llama3-8B", + "developer": "Replete-AI", + "scores": { + "IFEval": 0.4729, + "BBH": 0.3271, + "MATH Level 5": 0.0476, + "GPQA": 0.2609, + "MUSR": 0.3953, + "MMLU-PRO": 0.1331 + } + }, + { + "model_id": "Replete-AI/Replete-Coder-Qwen2-1.5b", + "name": "Replete-Coder-Qwen2-1.5b", + "developer": "Replete-AI", + "scores": { + "IFEval": 0.3014, + "BBH": 0.3475, + "MATH Level 5": 0.0385, + "GPQA": 0.2685, + "MUSR": 0.4073, + "MMLU-PRO": 0.2147 + } + }, + { + "model_id": "Replete-AI/Replete-LLM-Qwen2-7b", + "name": "Replete-LLM-Qwen2-7b", + "developer": "Replete-AI", + "scores": { + "IFEval": 0.0932, + "BBH": 0.2977, + "MATH Level 5": 0.0, + "GPQA": 0.2475, + "MUSR": 0.3941, + "MMLU-PRO": 0.1157 + } + }, + { + "model_id": "Replete-AI/Replete-LLM-Qwen2-7b_Beta-Preview", + "name": "Replete-LLM-Qwen2-7b_Beta-Preview", + "developer": "Replete-AI", + "scores": { + "IFEval": 0.0858, + "BBH": 0.2929, + "MATH Level 5": 0.0, + "GPQA": 0.2483, + "MUSR": 0.3981, + "MMLU-PRO": 0.1285 + } + }, + { + "model_id": "Replete-AI/Replete-LLM-V2-Llama-3.1-8b", + "name": "Replete-LLM-V2-Llama-3.1-8b", + "developer": "Replete-AI", + "scores": { + "IFEval": 0.5515, + "BBH": 0.5339, + "MATH Level 5": 0.1405, + "GPQA": 0.3138, + "MUSR": 0.4001, + "MMLU-PRO": 0.3753 + } + }, + { + "model_id": "RezVortex/JAJUKA-WEWILLNEVERFORGETYOU-3B", + "name": "JAJUKA-WEWILLNEVERFORGETYOU-3B", + "developer": "RezVortex", + "scores": { + "IFEval": 0.6858, + "BBH": 0.4619, + "MATH Level 5": 0.1548, + "GPQA": 0.2576, + "MUSR": 0.363, + "MMLU-PRO": 0.3143 + } + }, + { + "model_id": "RezVortex/Jajuka-3b", + "name": "Jajuka-3b", + "developer": "RezVortex", + "scores": { + "IFEval": 0.6925, + "BBH": 0.4594, + "MATH Level 5": 0.1594, + "GPQA": 0.2659, + "MUSR": 0.3671, + "MMLU-PRO": 0.3137 + } + }, + { + "model_id": "Ro-xe/FMixIA-7B-DARE-0", + "name": "FMixIA-7B-DARE-0", + "developer": "Ro-xe", + "scores": { + "IFEval": 0.3341, + "BBH": 0.5035, + "MATH Level 5": 0.0529, + "GPQA": 0.2894, + "MUSR": 0.4545, + "MMLU-PRO": 0.3016 + } + }, + { + "model_id": "Ro-xe/FMixIA-7B-SLERP-27", + "name": "FMixIA-7B-SLERP-27", + "developer": "Ro-xe", + "scores": { + "IFEval": 0.3765, + "BBH": 0.5151, + "MATH Level 5": 0.0634, + "GPQA": 0.2953, + "MUSR": 0.4412, + "MMLU-PRO": 0.3008 + } + }, + { + "model_id": "Ro-xe/FMixIA-7B-TIES-1", + "name": "FMixIA-7B-TIES-1", + "developer": "Ro-xe", + "scores": { + "IFEval": 0.3453, + "BBH": 0.5092, + "MATH Level 5": 0.0566, + "GPQA": 0.2886, + "MUSR": 0.4689, + "MMLU-PRO": 0.2992 + } + }, + { + "model_id": "Ro-xe/FMixIA-FrankenMerge-9.5B-PT-9", + "name": "FMixIA-FrankenMerge-9.5B-PT-9", + "developer": "Ro-xe", + "scores": { + "IFEval": 0.194, + "BBH": 0.5088, + "MATH Level 5": 0.003, + "GPQA": 0.3079, + "MUSR": 0.417, + "MMLU-PRO": 0.3657 + } + }, + { + "model_id": "Rombo-Org/Rombo-LLM-V2.5-Qwen-7b", + "name": "Rombo-LLM-V2.5-Qwen-7b", + "developer": "Rombo-Org", + "scores": { + "IFEval": 0.7482, + "BBH": 0.54, + "MATH Level 5": 0.5068, + "GPQA": 0.3012, + "MUSR": 0.398, + "MMLU-PRO": 0.4283 + } + }, + { + "model_id": "RubielLabarta/LogoS-7Bx2-MoE-13B-v0.2", + "name": "LogoS-7Bx2-MoE-13B-v0.2", + "developer": "RubielLabarta", + "scores": { + "IFEval": 0.4379, + "BBH": 0.5207, + "MATH Level 5": 0.0574, + "GPQA": 0.2777, + "MUSR": 0.4226, + "MMLU-PRO": 0.3088 + } + }, + { + "model_id": "SaisExperiments/Evil-Alpaca-3B-L3.2", + "name": "Evil-Alpaca-3B-L3.2", + "developer": "SaisExperiments", + "scores": { + "IFEval": 0.3251, + "BBH": 0.4341, + "MATH Level 5": 0.0702, + "GPQA": 0.2634, + "MUSR": 0.4198, + "MMLU-PRO": 0.2621 + } + }, + { + "model_id": "SaisExperiments/Gemma-2-2B-Opus-Instruct", + "name": "Gemma-2-2B-Opus-Instruct", + "developer": "SaisExperiments", + "scores": { + "IFEval": 0.475, + "BBH": 0.4293, + "MATH Level 5": 0.0506, + "GPQA": 0.2836, + "MUSR": 0.4057, + "MMLU-PRO": 0.265 + } + }, + { + "model_id": "SaisExperiments/Gemma-2-2B-Stheno-Filtered", + "name": "Gemma-2-2B-Stheno-Filtered", + "developer": "SaisExperiments", + "scores": { + "IFEval": 0.4197, + "BBH": 0.4149, + "MATH Level 5": 0.0461, + "GPQA": 0.2701, + "MUSR": 0.4003, + "MMLU-PRO": 0.263 + } + }, + { + "model_id": "SaisExperiments/Not-So-Small-Alpaca-24B", + "name": "Not-So-Small-Alpaca-24B", + "developer": "SaisExperiments", + "scores": { + "IFEval": 0.6244, + "BBH": 0.5339, + "MATH Level 5": 0.1828, + "GPQA": 0.3591, + "MUSR": 0.4282, + "MMLU-PRO": 0.3694 + } + }, + { + "model_id": "SaisExperiments/QwOwO-7B-V1", + "name": "QwOwO-7B-V1", + "developer": "SaisExperiments", + "scores": { + "IFEval": 0.4556, + "BBH": 0.5431, + "MATH Level 5": 0.386, + "GPQA": 0.2601, + "MUSR": 0.3835, + "MMLU-PRO": 0.4224 + } + }, + { + "model_id": "SaisExperiments/RightSheep-Llama3.2-3B", + "name": "RightSheep-Llama3.2-3B", + "developer": "SaisExperiments", + "scores": { + "IFEval": 0.4156, + "BBH": 0.4241, + "MATH Level 5": 0.0808, + "GPQA": 0.2869, + "MUSR": 0.3767, + "MMLU-PRO": 0.254 + } + }, + { + "model_id": "Sakalti/Anemoi-3B", + "name": "Anemoi-3B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.3804, + "BBH": 0.4922, + "MATH Level 5": 0.1775, + "GPQA": 0.3054, + "MUSR": 0.4371, + "MMLU-PRO": 0.3766 + } + }, + { + "model_id": "Sakalti/Euphrates-14B", + "name": "Euphrates-14B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.2647, + "BBH": 0.6138, + "MATH Level 5": 0.3051, + "GPQA": 0.3935, + "MUSR": 0.4516, + "MMLU-PRO": 0.5255 + } + }, + { + "model_id": "Sakalti/Llama3.2-3B-Uranus-1", + "name": "Llama3.2-3B-Uranus-1", + "developer": "Sakalti", + "scores": { + "IFEval": 0.5335, + "BBH": 0.4437, + "MATH Level 5": 0.1495, + "GPQA": 0.297, + "MUSR": 0.3669, + "MMLU-PRO": 0.3094 + } + }, + { + "model_id": "Sakalti/Magro-7B-v1.1", + "name": "Magro-7B-v1.1", + "developer": "Sakalti", + "scores": { + "IFEval": 0.1204, + "BBH": 0.4179, + "MATH Level 5": 0.0249, + "GPQA": 0.2961, + "MUSR": 0.4433, + "MMLU-PRO": 0.2764 + } + }, + { + "model_id": "Sakalti/Neptuno-3B", + "name": "Neptuno-3B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.4296, + "BBH": 0.4834, + "MATH Level 5": 0.2553, + "GPQA": 0.2961, + "MUSR": 0.4002, + "MMLU-PRO": 0.3773 + } + }, + { + "model_id": "Sakalti/Neptuno-Alpha", + "name": "Neptuno-Alpha", + "developer": "Sakalti", + "scores": { + "IFEval": 0.378, + "BBH": 0.4925, + "MATH Level 5": 0.1835, + "GPQA": 0.307, + "MUSR": 0.4371, + "MMLU-PRO": 0.3767 + } + }, + { + "model_id": "Sakalti/Oxyge1-33B", + "name": "Oxyge1-33B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.4548, + "BBH": 0.7033, + "MATH Level 5": 0.4962, + "GPQA": 0.3826, + "MUSR": 0.5008, + "MMLU-PRO": 0.5909 + } + }, + { + "model_id": "Sakalti/Phi3.5-Comets-3.8B", + "name": "Phi3.5-Comets-3.8B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.2094, + "BBH": 0.3335, + "MATH Level 5": 0.0008, + "GPQA": 0.2492, + "MUSR": 0.3764, + "MMLU-PRO": 0.1153 + } + }, + { + "model_id": "Sakalti/Qwen2.5-1B-Instruct", + "name": "Qwen2.5-1B-Instruct", + "developer": "Sakalti", + "scores": { + "IFEval": 0.1751, + "BBH": 0.3027, + "MATH Level 5": 0.006, + "GPQA": 0.2559, + "MUSR": 0.3369, + "MMLU-PRO": 0.1213 + } + }, + { + "model_id": "Sakalti/QwenTest-7", + "name": "QwenTest-7", + "developer": "Sakalti", + "scores": { + "IFEval": 0.1672, + "BBH": 0.3063, + "MATH Level 5": 0.0038, + "GPQA": 0.2601, + "MUSR": 0.3422, + "MMLU-PRO": 0.1212 + } + }, + { + "model_id": "Sakalti/SJT-0.5B", + "name": "SJT-0.5B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.2425, + "BBH": 0.3306, + "MATH Level 5": 0.0521, + "GPQA": 0.2718, + "MUSR": 0.3196, + "MMLU-PRO": 0.1891 + } + }, + { + "model_id": "Sakalti/SJT-1.5B-Alpha", + "name": "SJT-1.5B-Alpha", + "developer": "Sakalti", + "scores": { + "IFEval": 0.3449, + "BBH": 0.4241, + "MATH Level 5": 0.0997, + "GPQA": 0.2919, + "MUSR": 0.4226, + "MMLU-PRO": 0.2961 + } + }, + { + "model_id": "Sakalti/SJT-1.5B-Alpha-1.1", + "name": "SJT-1.5B-Alpha-1.1", + "developer": "Sakalti", + "scores": { + "IFEval": 0.3439, + "BBH": 0.4243, + "MATH Level 5": 0.0959, + "GPQA": 0.2894, + "MUSR": 0.4239, + "MMLU-PRO": 0.2966 + } + }, + { + "model_id": "Sakalti/SJT-1.7B", + "name": "SJT-1.7B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.1776, + "BBH": 0.2934, + "MATH Level 5": 0.0015, + "GPQA": 0.2416, + "MUSR": 0.3964, + "MMLU-PRO": 0.1133 + } + }, + { + "model_id": "Sakalti/SJT-14B", + "name": "SJT-14B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.5494, + "BBH": 0.6536, + "MATH Level 5": 0.3844, + "GPQA": 0.3867, + "MUSR": 0.4766, + "MMLU-PRO": 0.5381 + } + }, + { + "model_id": "Sakalti/SJT-2.4B", + "name": "SJT-2.4B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.2804, + "BBH": 0.349, + "MATH Level 5": 0.0219, + "GPQA": 0.2559, + "MUSR": 0.3699, + "MMLU-PRO": 0.1858 + } + }, + { + "model_id": "Sakalti/SJT-24B-Alpha", + "name": "SJT-24B-Alpha", + "developer": "Sakalti", + "scores": { + "IFEval": 0.3206, + "BBH": 0.6081, + "MATH Level 5": 0.253, + "GPQA": 0.3809, + "MUSR": 0.4595, + "MMLU-PRO": 0.4857 + } + }, + { + "model_id": "Sakalti/SJT-2B", + "name": "SJT-2B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.2151, + "BBH": 0.2936, + "MATH Level 5": 0.0008, + "GPQA": 0.2416, + "MUSR": 0.3564, + "MMLU-PRO": 0.1187 + } + }, + { + "model_id": "Sakalti/SJT-2B-V1.1", + "name": "SJT-2B-V1.1", + "developer": "Sakalti", + "scores": { + "IFEval": 0.3977, + "BBH": 0.3984, + "MATH Level 5": 0.0483, + "GPQA": 0.2676, + "MUSR": 0.4299, + "MMLU-PRO": 0.2124 + } + }, + { + "model_id": "Sakalti/SJT-3.7B", + "name": "SJT-3.7B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.1078, + "BBH": 0.3393, + "MATH Level 5": 0.0121, + "GPQA": 0.2559, + "MUSR": 0.3617, + "MMLU-PRO": 0.1505 + } + }, + { + "model_id": "Sakalti/SJT-4B", + "name": "SJT-4B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.4077, + "BBH": 0.4886, + "MATH Level 5": 0.1156, + "GPQA": 0.2945, + "MUSR": 0.478, + "MMLU-PRO": 0.3281 + } + }, + { + "model_id": "Sakalti/SJT-7.5B", + "name": "SJT-7.5B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.4223, + "BBH": 0.5367, + "MATH Level 5": 0.2168, + "GPQA": 0.3263, + "MUSR": 0.4399, + "MMLU-PRO": 0.3951 + } + }, + { + "model_id": "Sakalti/SJT-7B-V1.1", + "name": "SJT-7B-V1.1", + "developer": "Sakalti", + "scores": { + "IFEval": 0.4703, + "BBH": 0.5419, + "MATH Level 5": 0.2432, + "GPQA": 0.3339, + "MUSR": 0.4411, + "MMLU-PRO": 0.4412 + } + }, + { + "model_id": "Sakalti/SJT-7B-V1.1-Multilingal", + "name": "SJT-7B-V1.1-Multilingal", + "developer": "Sakalti", + "scores": { + "IFEval": 0.1949, + "BBH": 0.292, + "MATH Level 5": 0.0045, + "GPQA": 0.2601, + "MUSR": 0.3621, + "MMLU-PRO": 0.1137 + } + }, + { + "model_id": "Sakalti/SJT-8B", + "name": "SJT-8B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.6535, + "BBH": 0.5282, + "MATH Level 5": 0.2538, + "GPQA": 0.3297, + "MUSR": 0.408, + "MMLU-PRO": 0.4266 + } + }, + { + "model_id": "Sakalti/SJT-8B-V1.1", + "name": "SJT-8B-V1.1", + "developer": "Sakalti", + "scores": { + "IFEval": 0.4621, + "BBH": 0.5121, + "MATH Level 5": 0.2069, + "GPQA": 0.3364, + "MUSR": 0.4266, + "MMLU-PRO": 0.4231 + } + }, + { + "model_id": "Sakalti/SJT-900M", + "name": "SJT-900M", + "developer": "Sakalti", + "scores": { + "IFEval": 0.241, + "BBH": 0.3169, + "MATH Level 5": 0.0136, + "GPQA": 0.2534, + "MUSR": 0.3595, + "MMLU-PRO": 0.1142 + } + }, + { + "model_id": "Sakalti/SJT-Moe2x7.5B", + "name": "SJT-Moe2x7.5B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.4117, + "BBH": 0.5371, + "MATH Level 5": 0.2145, + "GPQA": 0.3263, + "MUSR": 0.4399, + "MMLU-PRO": 0.3954 + } + }, + { + "model_id": "Sakalti/SJTPass-2", + "name": "SJTPass-2", + "developer": "Sakalti", + "scores": { + "IFEval": 0.24, + "BBH": 0.3302, + "MATH Level 5": 0.0529, + "GPQA": 0.2727, + "MUSR": 0.3222, + "MMLU-PRO": 0.1902 + } + }, + { + "model_id": "Sakalti/SJTPass-4", + "name": "SJTPass-4", + "developer": "Sakalti", + "scores": { + "IFEval": 0.1913, + "BBH": 0.2964, + "MATH Level 5": 0.0023, + "GPQA": 0.2601, + "MUSR": 0.3898, + "MMLU-PRO": 0.1083 + } + }, + { + "model_id": "Sakalti/SJTPass-5", + "name": "SJTPass-5", + "developer": "Sakalti", + "scores": { + "IFEval": 0.2425, + "BBH": 0.3103, + "MATH Level 5": 0.0159, + "GPQA": 0.2668, + "MUSR": 0.3794, + "MMLU-PRO": 0.1327 + } + }, + { + "model_id": "Sakalti/Saba-Passthrough-2", + "name": "Saba-Passthrough-2", + "developer": "Sakalti", + "scores": { + "IFEval": 0.1691, + "BBH": 0.3672, + "MATH Level 5": 0.0008, + "GPQA": 0.2634, + "MUSR": 0.3844, + "MMLU-PRO": 0.2077 + } + }, + { + "model_id": "Sakalti/Saba1-1.8B", + "name": "Saba1-1.8B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.3333, + "BBH": 0.4147, + "MATH Level 5": 0.1541, + "GPQA": 0.2827, + "MUSR": 0.4239, + "MMLU-PRO": 0.2926 + } + }, + { + "model_id": "Sakalti/Saba1-7B", + "name": "Saba1-7B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.4585, + "BBH": 0.5489, + "MATH Level 5": 0.3663, + "GPQA": 0.3163, + "MUSR": 0.4793, + "MMLU-PRO": 0.4376 + } + }, + { + "model_id": "Sakalti/Saba1.5-1.5B", + "name": "Saba1.5-1.5B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.3333, + "BBH": 0.4147, + "MATH Level 5": 0.1541, + "GPQA": 0.2827, + "MUSR": 0.4239, + "MMLU-PRO": 0.2926 + } + }, + { + "model_id": "Sakalti/Saba1.5-Pro-3B", + "name": "Saba1.5-Pro-3B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.2386, + "BBH": 0.3623, + "MATH Level 5": 0.0272, + "GPQA": 0.2685, + "MUSR": 0.4405, + "MMLU-PRO": 0.1958 + } + }, + { + "model_id": "Sakalti/Saba2-14B-Preview", + "name": "Saba2-14B-Preview", + "developer": "Sakalti", + "scores": { + "IFEval": 0.4722, + "BBH": 0.6496, + "MATH Level 5": 0.3127, + "GPQA": 0.3826, + "MUSR": 0.4781, + "MMLU-PRO": 0.5384 + } + }, + { + "model_id": "Sakalti/Saba2-3B", + "name": "Saba2-3B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.2865, + "BBH": 0.2801, + "MATH Level 5": 0.006, + "GPQA": 0.2617, + "MUSR": 0.3927, + "MMLU-PRO": 0.121 + } + }, + { + "model_id": "Sakalti/Sailor-japanese", + "name": "Sailor-japanese", + "developer": "Sakalti", + "scores": { + "IFEval": 0.1605, + "BBH": 0.2913, + "MATH Level 5": 0.003, + "GPQA": 0.2534, + "MUSR": 0.3912, + "MMLU-PRO": 0.1164 + } + }, + { + "model_id": "Sakalti/Saka-1.5B", + "name": "Saka-1.5B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.2726, + "BBH": 0.3988, + "MATH Level 5": 0.0801, + "GPQA": 0.2903, + "MUSR": 0.3739, + "MMLU-PRO": 0.2415 + } + }, + { + "model_id": "Sakalti/Saka-14B", + "name": "Saka-14B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.7174, + "BBH": 0.6497, + "MATH Level 5": 0.4094, + "GPQA": 0.396, + "MUSR": 0.4886, + "MMLU-PRO": 0.5396 + } + }, + { + "model_id": "Sakalti/Saka-24B", + "name": "Saka-24B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.3819, + "BBH": 0.6072, + "MATH Level 5": 0.1805, + "GPQA": 0.3423, + "MUSR": 0.4541, + "MMLU-PRO": 0.4766 + } + }, + { + "model_id": "Sakalti/Saka-7.2B", + "name": "Saka-7.2B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.1545, + "BBH": 0.2945, + "MATH Level 5": 0.0, + "GPQA": 0.2391, + "MUSR": 0.3711, + "MMLU-PRO": 0.116 + } + }, + { + "model_id": "Sakalti/Saka-7.6B", + "name": "Saka-7.6B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.4524, + "BBH": 0.5655, + "MATH Level 5": 0.3255, + "GPQA": 0.3163, + "MUSR": 0.4489, + "MMLU-PRO": 0.454 + } + }, + { + "model_id": "Sakalti/SakaMoe-3x1.6B-Instruct", + "name": "SakaMoe-3x1.6B-Instruct", + "developer": "Sakalti", + "scores": { + "IFEval": 0.2371, + "BBH": 0.3282, + "MATH Level 5": 0.0544, + "GPQA": 0.2668, + "MUSR": 0.3342, + "MMLU-PRO": 0.1882 + } + }, + { + "model_id": "Sakalti/SakalFusion-7B-Alpha", + "name": "SakalFusion-7B-Alpha", + "developer": "Sakalti", + "scores": { + "IFEval": 0.529, + "BBH": 0.5591, + "MATH Level 5": 0.3844, + "GPQA": 0.3255, + "MUSR": 0.4581, + "MMLU-PRO": 0.4474 + } + }, + { + "model_id": "Sakalti/SakalFusion-7B-Beta", + "name": "SakalFusion-7B-Beta", + "developer": "Sakalti", + "scores": { + "IFEval": 0.1809, + "BBH": 0.2881, + "MATH Level 5": 0.0, + "GPQA": 0.2433, + "MUSR": 0.3872, + "MMLU-PRO": 0.109 + } + }, + { + "model_id": "Sakalti/Tara-3.8B-v1.1", + "name": "Tara-3.8B-v1.1", + "developer": "Sakalti", + "scores": { + "IFEval": 0.4062, + "BBH": 0.4886, + "MATH Level 5": 0.1156, + "GPQA": 0.2945, + "MUSR": 0.478, + "MMLU-PRO": 0.3281 + } + }, + { + "model_id": "Sakalti/light-1.1-3B", + "name": "light-1.1-3B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.2735, + "BBH": 0.2803, + "MATH Level 5": 0.0113, + "GPQA": 0.2617, + "MUSR": 0.3901, + "MMLU-PRO": 0.1209 + } + }, + { + "model_id": "Sakalti/light-3B", + "name": "light-3B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.5337, + "BBH": 0.4831, + "MATH Level 5": 0.2591, + "GPQA": 0.2953, + "MUSR": 0.4015, + "MMLU-PRO": 0.3775 + } + }, + { + "model_id": "Sakalti/light-3b-beta", + "name": "light-3b-beta", + "developer": "Sakalti", + "scores": { + "IFEval": 0.5485, + "BBH": 0.4815, + "MATH Level 5": 0.2772, + "GPQA": 0.2978, + "MUSR": 0.4015, + "MMLU-PRO": 0.3758 + } + }, + { + "model_id": "Sakalti/light-7b-beta", + "name": "light-7b-beta", + "developer": "Sakalti", + "scores": { + "IFEval": 0.6234, + "BBH": 0.5548, + "MATH Level 5": 0.3769, + "GPQA": 0.3213, + "MUSR": 0.4291, + "MMLU-PRO": 0.4456 + } + }, + { + "model_id": "Sakalti/llama-3-yanyuedao-8b-instruct", + "name": "llama-3-yanyuedao-8b-instruct", + "developer": "Sakalti", + "scores": { + "IFEval": 0.2186, + "BBH": 0.435, + "MATH Level 5": 0.0385, + "GPQA": 0.2903, + "MUSR": 0.4199, + "MMLU-PRO": 0.2911 + } + }, + { + "model_id": "Sakalti/magro-7B", + "name": "magro-7B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.1344, + "BBH": 0.4186, + "MATH Level 5": 0.0204, + "GPQA": 0.2953, + "MUSR": 0.446, + "MMLU-PRO": 0.2765 + } + }, + { + "model_id": "Sakalti/mergekit-01", + "name": "mergekit-01", + "developer": "Sakalti", + "scores": { + "IFEval": 0.6234, + "BBH": 0.5548, + "MATH Level 5": 0.3769, + "GPQA": 0.3213, + "MUSR": 0.4291, + "MMLU-PRO": 0.4456 + } + }, + { + "model_id": "Sakalti/mergekit-della_linear-vmeykci", + "name": "mergekit-della_linear-vmeykci", + "developer": "Sakalti", + "scores": { + "IFEval": 0.1126, + "BBH": 0.2816, + "MATH Level 5": 0.0106, + "GPQA": 0.2634, + "MUSR": 0.3897, + "MMLU-PRO": 0.1089 + } + }, + { + "model_id": "Sakalti/model-3", + "name": "model-3", + "developer": "Sakalti", + "scores": { + "IFEval": 0.6264, + "BBH": 0.5542, + "MATH Level 5": 0.3708, + "GPQA": 0.3213, + "MUSR": 0.4264, + "MMLU-PRO": 0.4455 + } + }, + { + "model_id": "Sakalti/qwen2.5-2.3B", + "name": "qwen2.5-2.3B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.1288, + "BBH": 0.2849, + "MATH Level 5": 0.0053, + "GPQA": 0.2517, + "MUSR": 0.3857, + "MMLU-PRO": 0.1173 + } + }, + { + "model_id": "Sakalti/tara-3.8B", + "name": "tara-3.8B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.4077, + "BBH": 0.4886, + "MATH Level 5": 0.1156, + "GPQA": 0.2945, + "MUSR": 0.478, + "MMLU-PRO": 0.3281 + } + }, + { + "model_id": "Sakalti/ultiima-14B", + "name": "ultiima-14B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.5701, + "BBH": 0.6491, + "MATH Level 5": 0.4698, + "GPQA": 0.3742, + "MUSR": 0.4718, + "MMLU-PRO": 0.5381 + } + }, + { + "model_id": "Sakalti/ultiima-14B-v0.2", + "name": "ultiima-14B-v0.2", + "developer": "Sakalti", + "scores": { + "IFEval": 0.707, + "BBH": 0.6472, + "MATH Level 5": 0.3995, + "GPQA": 0.3826, + "MUSR": 0.4794, + "MMLU-PRO": 0.5387 + } + }, + { + "model_id": "Sakalti/ultiima-14B-v0.3", + "name": "ultiima-14B-v0.3", + "developer": "Sakalti", + "scores": { + "IFEval": 0.704, + "BBH": 0.6398, + "MATH Level 5": 0.3965, + "GPQA": 0.3767, + "MUSR": 0.4754, + "MMLU-PRO": 0.5337 + } + }, + { + "model_id": "Sakalti/ultiima-14B-v0.4", + "name": "ultiima-14B-v0.4", + "developer": "Sakalti", + "scores": { + "IFEval": 0.3008, + "BBH": 0.642, + "MATH Level 5": 0.3535, + "GPQA": 0.396, + "MUSR": 0.4886, + "MMLU-PRO": 0.5278 + } + }, + { + "model_id": "Sakalti/ultiima-32B", + "name": "ultiima-32B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.6854, + "BBH": 0.7037, + "MATH Level 5": 0.4962, + "GPQA": 0.3809, + "MUSR": 0.4995, + "MMLU-PRO": 0.591 + } + }, + { + "model_id": "Sakalti/ultiima-72B", + "name": "ultiima-72B", + "developer": "Sakalti", + "scores": { + "IFEval": 0.714, + "BBH": 0.7218, + "MATH Level 5": 0.5355, + "GPQA": 0.4144, + "MUSR": 0.4652, + "MMLU-PRO": 0.5906 + } + }, + { + "model_id": "Sakalti/ultiima-72B-v1.5", + "name": "ultiima-72B-v1.5", + "developer": "Sakalti", + "scores": { + "IFEval": 0.655, + "BBH": 0.7392, + "MATH Level 5": 0.4396, + "GPQA": 0.4136, + "MUSR": 0.4691, + "MMLU-PRO": 0.6054 + } + }, + { + "model_id": "Salesforce/LLaMA-3-8B-SFR-Iterative-DPO-R", + "name": "LLaMA-3-8B-SFR-Iterative-DPO-R", + "developer": "Salesforce", + "scores": { + "IFEval": 0.3816, + "BBH": 0.5012, + "MATH Level 5": 0.0914, + "GPQA": 0.2878, + "MUSR": 0.3633, + "MMLU-PRO": 0.3172 + } + }, + { + "model_id": "SanjiWatsuki/Kunoichi-DPO-v2-7B", + "name": "Kunoichi-DPO-v2-7B", + "developer": "SanjiWatsuki", + "scores": { + "IFEval": 0.5431, + "BBH": 0.4416, + "MATH Level 5": 0.0763, + "GPQA": 0.2961, + "MUSR": 0.4188, + "MMLU-PRO": 0.3107 + } + }, + { + "model_id": "SanjiWatsuki/Silicon-Maid-7B", + "name": "Silicon-Maid-7B", + "developer": "SanjiWatsuki", + "scores": { + "IFEval": 0.5368, + "BBH": 0.4128, + "MATH Level 5": 0.065, + "GPQA": 0.2903, + "MUSR": 0.4188, + "MMLU-PRO": 0.3083 + } + }, + { + "model_id": "Sao10K/70B-L3.3-Cirrus-x1", + "name": "70B-L3.3-Cirrus-x1", + "developer": "Sao10K", + "scores": { + "IFEval": 0.6681, + "BBH": 0.7029, + "MATH Level 5": 0.3739, + "GPQA": 0.4497, + "MUSR": 0.4842, + "MMLU-PRO": 0.5378 + } + }, + { + "model_id": "Sao10K/Fimbulvetr-11B-v2", + "name": "Fimbulvetr-11B-v2", + "developer": "Sao10K", + "scores": { + "IFEval": 0.51, + "BBH": 0.4544, + "MATH Level 5": 0.068, + "GPQA": 0.2919, + "MUSR": 0.4354, + "MMLU-PRO": 0.3301 + } + }, + { + "model_id": "Sao10K/L3-70B-Euryale-v2.1", + "name": "L3-70B-Euryale-v2.1", + "developer": "Sao10K", + "scores": { + "IFEval": 0.7384, + "BBH": 0.6471, + "MATH Level 5": 0.2137, + "GPQA": 0.3314, + "MUSR": 0.4209, + "MMLU-PRO": 0.5104 + } + }, + { + "model_id": "Sao10K/L3-8B-Lunaris-v1", + "name": "L3-8B-Lunaris-v1", + "developer": "Sao10K", + "scores": { + "IFEval": 0.6895, + "BBH": 0.5235, + "MATH Level 5": 0.0906, + "GPQA": 0.3012, + "MUSR": 0.3727, + "MMLU-PRO": 0.3787 + } + }, + { + "model_id": "Sao10K/L3-8B-Niitama-v1", + "name": "L3-8B-Niitama-v1", + "developer": "Sao10K", + "scores": { + "IFEval": 0.6791, + "BBH": 0.5303, + "MATH Level 5": 0.0982, + "GPQA": 0.3079, + "MUSR": 0.3807, + "MMLU-PRO": 0.3701 + } + }, + { + "model_id": "Sao10K/L3-8B-Stheno-v3.2", + "name": "L3-8B-Stheno-v3.2", + "developer": "Sao10K", + "scores": { + "IFEval": 0.6873, + "BBH": 0.5228, + "MATH Level 5": 0.0929, + "GPQA": 0.3104, + "MUSR": 0.3794, + "MMLU-PRO": 0.3768 + } + }, + { + "model_id": "Sao10K/L3-8B-Stheno-v3.3-32K", + "name": "L3-8B-Stheno-v3.3-32K", + "developer": "Sao10K", + "scores": { + "IFEval": 0.4604, + "BBH": 0.3844, + "MATH Level 5": 0.0144, + "GPQA": 0.2567, + "MUSR": 0.3725, + "MMLU-PRO": 0.1896 + } + }, + { + "model_id": "Sao10K/MN-12B-Lyra-v3", + "name": "MN-12B-Lyra-v3", + "developer": "Sao10K", + "scores": { + "IFEval": 0.4486, + "BBH": 0.4804, + "MATH Level 5": 0.0937, + "GPQA": 0.2777, + "MUSR": 0.4019, + "MMLU-PRO": 0.3249 + } + }, + { + "model_id": "Saxo/Linkbricks-Horizon-AI-Avengers-V1-32B", + "name": "Linkbricks-Horizon-AI-Avengers-V1-32B", + "developer": "Saxo", + "scores": { + "IFEval": 0.7972, + "BBH": 0.7001, + "MATH Level 5": 0.6027, + "GPQA": 0.3624, + "MUSR": 0.4538, + "MMLU-PRO": 0.5793 + } + }, + { + "model_id": "Saxo/Linkbricks-Horizon-AI-Avengers-V2-32B", + "name": "Linkbricks-Horizon-AI-Avengers-V2-32B", + "developer": "Saxo", + "scores": { + "IFEval": 0.7956, + "BBH": 0.7023, + "MATH Level 5": 0.5665, + "GPQA": 0.2659, + "MUSR": 0.4166, + "MMLU-PRO": 0.572 + } + }, + { + "model_id": "Saxo/Linkbricks-Horizon-AI-Avengers-V3-32B", + "name": "Linkbricks-Horizon-AI-Avengers-V3-32B", + "developer": "Saxo", + "scores": { + "IFEval": 0.8249, + "BBH": 0.6913, + "MATH Level 5": 0.6178, + "GPQA": 0.3381, + "MUSR": 0.4275, + "MMLU-PRO": 0.5664 + } + }, + { + "model_id": "Saxo/Linkbricks-Horizon-AI-Avengers-V4-32B", + "name": "Linkbricks-Horizon-AI-Avengers-V4-32B", + "developer": "Saxo", + "scores": { + "IFEval": 0.7631, + "BBH": 0.692, + "MATH Level 5": 0.5363, + "GPQA": 0.3616, + "MUSR": 0.4643, + "MMLU-PRO": 0.5752 + } + }, + { + "model_id": "Saxo/Linkbricks-Horizon-AI-Avengers-V5-32B", + "name": "Linkbricks-Horizon-AI-Avengers-V5-32B", + "developer": "Saxo", + "scores": { + "IFEval": 0.7516, + "BBH": 0.6929, + "MATH Level 5": 0.5461, + "GPQA": 0.3557, + "MUSR": 0.4709, + "MMLU-PRO": 0.5762 + } + }, + { + "model_id": "Saxo/Linkbricks-Horizon-AI-Avengers-V6-32B", + "name": "Linkbricks-Horizon-AI-Avengers-V6-32B", + "developer": "Saxo", + "scores": { + "IFEval": 0.8209, + "BBH": 0.689, + "MATH Level 5": 0.6224, + "GPQA": 0.3347, + "MUSR": 0.4274, + "MMLU-PRO": 0.5672 + } + }, + { + "model_id": "Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V2-27B", + "name": "Linkbricks-Horizon-AI-Korean-Avengers-V2-27B", + "developer": "Saxo", + "scores": { + "IFEval": 0.8146, + "BBH": 0.6463, + "MATH Level 5": 0.2802, + "GPQA": 0.3473, + "MUSR": 0.4139, + "MMLU-PRO": 0.4599 + } + }, + { + "model_id": "Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V3-27B", + "name": "Linkbricks-Horizon-AI-Korean-Avengers-V3-27B", + "developer": "Saxo", + "scores": { + "IFEval": 0.8142, + "BBH": 0.6404, + "MATH Level 5": 0.2492, + "GPQA": 0.3591, + "MUSR": 0.4467, + "MMLU-PRO": 0.4524 + } + }, + { + "model_id": "Saxo/Linkbricks-Horizon-AI-Korean-Superb-22B", + "name": "Linkbricks-Horizon-AI-Korean-Superb-22B", + "developer": "Saxo", + "scores": { + "IFEval": 0.6767, + "BBH": 0.5626, + "MATH Level 5": 0.2372, + "GPQA": 0.3263, + "MUSR": 0.3908, + "MMLU-PRO": 0.3871 + } + }, + { + "model_id": "Saxo/Linkbricks-Horizon-AI-Korean-Superb-27B", + "name": "Linkbricks-Horizon-AI-Korean-Superb-27B", + "developer": "Saxo", + "scores": { + "IFEval": 0.7768, + "BBH": 0.6518, + "MATH Level 5": 0.2719, + "GPQA": 0.3599, + "MUSR": 0.4791, + "MMLU-PRO": 0.4647 + } + }, + { + "model_id": "Saxo/Linkbricks-Horizon-AI-Superb-27B", + "name": "Linkbricks-Horizon-AI-Superb-27B", + "developer": "Saxo", + "scores": { + "IFEval": 0.7302, + "BBH": 0.6186, + "MATH Level 5": 0.2221, + "GPQA": 0.3574, + "MUSR": 0.465, + "MMLU-PRO": 0.406 + } + }, + { + "model_id": "SeaLLMs/SeaLLM-7B-v2", + "name": "SeaLLM-7B-v2", + "developer": "SeaLLMs", + "scores": { + "IFEval": 0.3671, + "BBH": 0.4902, + "MATH Level 5": 0.0853, + "GPQA": 0.2785, + "MUSR": 0.407, + "MMLU-PRO": 0.3083 + } + }, + { + "model_id": "SeaLLMs/SeaLLM-7B-v2.5", + "name": "SeaLLM-7B-v2.5", + "developer": "SeaLLMs", + "scores": { + "IFEval": 0.4522, + "BBH": 0.498, + "MATH Level 5": 0.1088, + "GPQA": 0.276, + "MUSR": 0.4203, + "MMLU-PRO": 0.3203 + } + }, + { + "model_id": "SeaLLMs/SeaLLMs-v3-7B-Chat", + "name": "SeaLLMs-v3-7B-Chat", + "developer": "SeaLLMs", + "scores": { + "IFEval": 0.4377, + "BBH": 0.5266, + "MATH Level 5": 0.1858, + "GPQA": 0.2987, + "MUSR": 0.4174, + "MMLU-PRO": 0.3895 + } + }, + { + "model_id": "SenseLLM/ReflectionCoder-CL-34B", + "name": "ReflectionCoder-CL-34B", + "developer": "SenseLLM", + "scores": { + "IFEval": 0.4008, + "BBH": 0.3953, + "MATH Level 5": 0.0332, + "GPQA": 0.2508, + "MUSR": 0.4155, + "MMLU-PRO": 0.1424 + } + }, + { + "model_id": "SenseLLM/ReflectionCoder-DS-33B", + "name": "ReflectionCoder-DS-33B", + "developer": "SenseLLM", + "scores": { + "IFEval": 0.3787, + "BBH": 0.3449, + "MATH Level 5": 0.0302, + "GPQA": 0.2743, + "MUSR": 0.3343, + "MMLU-PRO": 0.1202 + } + }, + { + "model_id": "SentientAGI/Dobby-Mini-Leashed-Llama-3.1-8B", + "name": "Dobby-Mini-Leashed-Llama-3.1-8B", + "developer": "SentientAGI", + "scores": { + "IFEval": 0.7847, + "BBH": 0.5138, + "MATH Level 5": 0.1858, + "GPQA": 0.302, + "MUSR": 0.4254, + "MMLU-PRO": 0.3694 + } + }, + { + "model_id": "SentientAGI/Dobby-Mini-Unhinged-Llama-3.1-8B", + "name": "Dobby-Mini-Unhinged-Llama-3.1-8B", + "developer": "SentientAGI", + "scores": { + "IFEval": 0.7457, + "BBH": 0.5142, + "MATH Level 5": 0.1563, + "GPQA": 0.3062, + "MUSR": 0.4013, + "MMLU-PRO": 0.3585 + } + }, + { + "model_id": "SeppeV/SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo", + "name": "SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo", + "developer": "SeppeV", + "scores": { + "IFEval": 0.0955, + "BBH": 0.3073, + "MATH Level 5": 0.0121, + "GPQA": 0.2592, + "MUSR": 0.4032, + "MMLU-PRO": 0.1161 + } + }, + { + "model_id": "Sharathhebbar24/SSH_355M", + "name": "SSH_355M", + "developer": "Sharathhebbar24", + "scores": { + "IFEval": 0.1424, + "BBH": 0.3099, + "MATH Level 5": 0.0091, + "GPQA": 0.2584, + "MUSR": 0.4178, + "MMLU-PRO": 0.1176 + } + }, + { + "model_id": "Sharathhebbar24/chat_gpt2_dpo", + "name": "chat_gpt2_dpo", + "developer": "Sharathhebbar24", + "scores": { + "IFEval": 0.0986, + "BBH": 0.2902, + "MATH Level 5": 0.0053, + "GPQA": 0.2601, + "MUSR": 0.3818, + "MMLU-PRO": 0.1142 + } + }, + { + "model_id": "Shreyash2010/Uma-4x4B-Instruct-v0.1", + "name": "Uma-4x4B-Instruct-v0.1", + "developer": "Shreyash2010", + "scores": { + "IFEval": 0.5517, + "BBH": 0.5512, + "MATH Level 5": 0.1775, + "GPQA": 0.3347, + "MUSR": 0.4441, + "MMLU-PRO": 0.387 + } + }, + { + "model_id": "Sicarius-Prototyping/Brainy_LLAMA", + "name": "Brainy_LLAMA", + "developer": "Sicarius-Prototyping", + "scores": { + "IFEval": 0.5204, + "BBH": 0.5117, + "MATH Level 5": 0.1337, + "GPQA": 0.3138, + "MUSR": 0.4143, + "MMLU-PRO": 0.3849 + } + }, + { + "model_id": "Sicarius-Prototyping/Micropenis_1B", + "name": "Micropenis_1B", + "developer": "Sicarius-Prototyping", + "scores": { + "IFEval": 0.3461, + "BBH": 0.3372, + "MATH Level 5": 0.0461, + "GPQA": 0.2626, + "MUSR": 0.3325, + "MMLU-PRO": 0.186 + } + }, + { + "model_id": "Sicarius-Prototyping/bacon_and_food", + "name": "bacon_and_food", + "developer": "Sicarius-Prototyping", + "scores": { + "IFEval": 0.586, + "BBH": 0.4725, + "MATH Level 5": 0.0982, + "GPQA": 0.3096, + "MUSR": 0.3884, + "MMLU-PRO": 0.3263 + } + }, + { + "model_id": "SicariusSicariiStuff/2B-ad", + "name": "2B-ad", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.4379, + "BBH": 0.4092, + "MATH Level 5": 0.0506, + "GPQA": 0.281, + "MUSR": 0.4015, + "MMLU-PRO": 0.2662 + } + }, + { + "model_id": "SicariusSicariiStuff/2B_or_not_2B", + "name": "2B_or_not_2B", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.2062, + "BBH": 0.3416, + "MATH Level 5": 0.0196, + "GPQA": 0.2475, + "MUSR": 0.3791, + "MMLU-PRO": 0.1399 + } + }, + { + "model_id": "SicariusSicariiStuff/Dusk_Rainbow", + "name": "Dusk_Rainbow", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.3588, + "BBH": 0.4772, + "MATH Level 5": 0.0748, + "GPQA": 0.3087, + "MUSR": 0.4025, + "MMLU-PRO": 0.3443 + } + }, + { + "model_id": "SicariusSicariiStuff/Eximius_Persona_5B", + "name": "Eximius_Persona_5B", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.656, + "BBH": 0.4512, + "MATH Level 5": 0.102, + "GPQA": 0.2643, + "MUSR": 0.3818, + "MMLU-PRO": 0.314 + } + }, + { + "model_id": "SicariusSicariiStuff/Impish_LLAMA_3B", + "name": "Impish_LLAMA_3B", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.463, + "BBH": 0.4091, + "MATH Level 5": 0.1125, + "GPQA": 0.2878, + "MUSR": 0.3673, + "MMLU-PRO": 0.2941 + } + }, + { + "model_id": "SicariusSicariiStuff/Impish_Mind_8B", + "name": "Impish_Mind_8B", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.3179, + "BBH": 0.4674, + "MATH Level 5": 0.105, + "GPQA": 0.3045, + "MUSR": 0.407, + "MMLU-PRO": 0.3309 + } + }, + { + "model_id": "SicariusSicariiStuff/Impish_QWEN_14B-1M", + "name": "Impish_QWEN_14B-1M", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.7868, + "BBH": 0.6283, + "MATH Level 5": 0.3965, + "GPQA": 0.3507, + "MUSR": 0.4615, + "MMLU-PRO": 0.5044 + } + }, + { + "model_id": "SicariusSicariiStuff/Impish_QWEN_7B-1M", + "name": "Impish_QWEN_7B-1M", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.6382, + "BBH": 0.5372, + "MATH Level 5": 0.3089, + "GPQA": 0.2961, + "MUSR": 0.4074, + "MMLU-PRO": 0.4265 + } + }, + { + "model_id": "SicariusSicariiStuff/LLAMA-3_8B_Unaligned_BETA", + "name": "LLAMA-3_8B_Unaligned_BETA", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.3713, + "BBH": 0.4717, + "MATH Level 5": 0.0838, + "GPQA": 0.3054, + "MUSR": 0.4119, + "MMLU-PRO": 0.3465 + } + }, + { + "model_id": "SicariusSicariiStuff/Phi-Line_14B", + "name": "Phi-Line_14B", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.6496, + "BBH": 0.6154, + "MATH Level 5": 0.386, + "GPQA": 0.3532, + "MUSR": 0.4479, + "MMLU-PRO": 0.5454 + } + }, + { + "model_id": "SicariusSicariiStuff/Phi-lthy4", + "name": "Phi-lthy4", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.7679, + "BBH": 0.5879, + "MATH Level 5": 0.1367, + "GPQA": 0.2869, + "MUSR": 0.4083, + "MMLU-PRO": 0.4333 + } + }, + { + "model_id": "SicariusSicariiStuff/Qwen2.5-14B_Uncencored", + "name": "Qwen2.5-14B_Uncencored", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.3158, + "BBH": 0.6309, + "MATH Level 5": 0.318, + "GPQA": 0.3817, + "MUSR": 0.4517, + "MMLU-PRO": 0.5266 + } + }, + { + "model_id": "SicariusSicariiStuff/Qwen2.5-14B_Uncensored", + "name": "Qwen2.5-14B_Uncensored", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.3173, + "BBH": 0.6309, + "MATH Level 5": 0.318, + "GPQA": 0.3817, + "MUSR": 0.4517, + "MMLU-PRO": 0.5266 + } + }, + { + "model_id": "SicariusSicariiStuff/Qwen2.5-14B_Uncensored_Instruct", + "name": "Qwen2.5-14B_Uncensored_Instruct", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.3789, + "BBH": 0.5937, + "MATH Level 5": 0.3285, + "GPQA": 0.3297, + "MUSR": 0.3697, + "MMLU-PRO": 0.5127 + } + }, + { + "model_id": "SicariusSicariiStuff/Redemption_Wind_24B", + "name": "Redemption_Wind_24B", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.2501, + "BBH": 0.6428, + "MATH Level 5": 0.1858, + "GPQA": 0.3834, + "MUSR": 0.4262, + "MMLU-PRO": 0.5432 + } + }, + { + "model_id": "SicariusSicariiStuff/Winged_Imp_8B", + "name": "Winged_Imp_8B", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.743, + "BBH": 0.512, + "MATH Level 5": 0.1201, + "GPQA": 0.2827, + "MUSR": 0.4148, + "MMLU-PRO": 0.3639 + } + }, + { + "model_id": "SicariusSicariiStuff/Wingless_Imp_8B", + "name": "Wingless_Imp_8B", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.743, + "BBH": 0.512, + "MATH Level 5": 0.1201, + "GPQA": 0.2827, + "MUSR": 0.4148, + "MMLU-PRO": 0.3639 + } + }, + { + "model_id": "SicariusSicariiStuff/Zion_Alpha", + "name": "Zion_Alpha", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.3324, + "BBH": 0.4932, + "MATH Level 5": 0.0521, + "GPQA": 0.2903, + "MUSR": 0.4727, + "MMLU-PRO": 0.3132 + } + }, + { + "model_id": "SicariusSicariiStuff/dn_ep02", + "name": "dn_ep02", + "developer": "SicariusSicariiStuff", + "scores": { + "IFEval": 0.5064, + "BBH": 0.5266, + "MATH Level 5": 0.142, + "GPQA": 0.3154, + "MUSR": 0.4316, + "MMLU-PRO": 0.3998 + } + }, + { + "model_id": "SkyOrbis/SKY-Ko-Llama3.1-8B-lora", + "name": "SKY-Ko-Llama3.1-8B-lora", + "developer": "SkyOrbis", + "scores": { + "IFEval": 0.5058, + "BBH": 0.5088, + "MATH Level 5": 0.1548, + "GPQA": 0.3213, + "MUSR": 0.3998, + "MMLU-PRO": 0.3777 + } + }, + { + "model_id": "SkyOrbis/SKY-Ko-Llama3.1-8B-lora-epoch1", + "name": "SKY-Ko-Llama3.1-8B-lora-epoch1", + "developer": "SkyOrbis", + "scores": { + "IFEval": 0.5058, + "BBH": 0.5088, + "MATH Level 5": 0.1548, + "GPQA": 0.3213, + "MUSR": 0.3998, + "MMLU-PRO": 0.3777 + } + }, + { + "model_id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch3", + "name": "SKY-Ko-Llama3.2-1B-lora-epoch3", + "developer": "SkyOrbis", + "scores": { + "IFEval": 0.3247, + "BBH": 0.3167, + "MATH Level 5": 0.0272, + "GPQA": 0.2517, + "MUSR": 0.3382, + "MMLU-PRO": 0.1279 + } + }, + { + "model_id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch5", + "name": "SKY-Ko-Llama3.2-1B-lora-epoch5", + "developer": "SkyOrbis", + "scores": { + "IFEval": 0.436, + "BBH": 0.3406, + "MATH Level 5": 0.0521, + "GPQA": 0.2592, + "MUSR": 0.3471, + "MMLU-PRO": 0.1946 + } + }, + { + "model_id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch3", + "name": "SKY-Ko-Llama3.2-1B-lora-v2-epoch3", + "developer": "SkyOrbis", + "scores": { + "IFEval": 0.436, + "BBH": 0.3406, + "MATH Level 5": 0.0521, + "GPQA": 0.2592, + "MUSR": 0.3471, + "MMLU-PRO": 0.1946 + } + }, + { + "model_id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch5", + "name": "SKY-Ko-Llama3.2-1B-lora-v2-epoch5", + "developer": "SkyOrbis", + "scores": { + "IFEval": 0.4247, + "BBH": 0.3397, + "MATH Level 5": 0.0506, + "GPQA": 0.2542, + "MUSR": 0.3458, + "MMLU-PRO": 0.1946 + } + }, + { + "model_id": "SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch1", + "name": "SKY-Ko-Llama3.2-3B-lora-epoch1", + "developer": "SkyOrbis", + "scores": { + "IFEval": 0.5331, + "BBH": 0.44, + "MATH Level 5": 0.1458, + "GPQA": 0.2919, + "MUSR": 0.3522, + "MMLU-PRO": 0.3004 + } + }, + { + "model_id": "SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch2", + "name": "SKY-Ko-Llama3.2-3B-lora-epoch2", + "developer": "SkyOrbis", + "scores": { + "IFEval": 0.5331, + "BBH": 0.44, + "MATH Level 5": 0.1458, + "GPQA": 0.2919, + "MUSR": 0.3522, + "MMLU-PRO": 0.3004 + } + }, + { + "model_id": "SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch3", + "name": "SKY-Ko-Llama3.2-3B-lora-epoch3", + "developer": "SkyOrbis", + "scores": { + "IFEval": 0.5331, + "BBH": 0.44, + "MATH Level 5": 0.1458, + "GPQA": 0.2919, + "MUSR": 0.3522, + "MMLU-PRO": 0.3004 + } + }, + { + "model_id": "SkyOrbis/SKY-Ko-Qwen2.5-3B-Instruct", + "name": "SKY-Ko-Qwen2.5-3B-Instruct", + "developer": "SkyOrbis", + "scores": { + "IFEval": 0.3534, + "BBH": 0.4265, + "MATH Level 5": 0.0695, + "GPQA": 0.2794, + "MUSR": 0.4024, + "MMLU-PRO": 0.2812 + } + }, + { + "model_id": "SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000", + "name": "SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000", + "developer": "SkyOrbis", + "scores": { + "IFEval": 0.3819, + "BBH": 0.5078, + "MATH Level 5": 0.1866, + "GPQA": 0.3272, + "MUSR": 0.4436, + "MMLU-PRO": 0.3914 + } + }, + { + "model_id": "SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000", + "name": "SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000", + "developer": "SkyOrbis", + "scores": { + "IFEval": 0.3812, + "BBH": 0.539, + "MATH Level 5": 0.21, + "GPQA": 0.3029, + "MUSR": 0.4238, + "MMLU-PRO": 0.4238 + } + }, + { + "model_id": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2", + "name": "Skywork-Reward-Gemma-2-27B-v0.2", + "developer": "Skywork", + "scores": { + "IFEval": 0.7807, + "BBH": 0.636, + "MATH Level 5": 0.2273, + "GPQA": 0.344, + "MUSR": 0.4231, + "MMLU-PRO": 0.4103 + } + }, + { + "model_id": "Skywork/Skywork-o1-Open-Llama-3.1-8B", + "name": "Skywork-o1-Open-Llama-3.1-8B", + "developer": "Skywork", + "scores": { + "IFEval": 0.3518, + "BBH": 0.4516, + "MATH Level 5": 0.5211, + "GPQA": 0.2592, + "MUSR": 0.3156, + "MMLU-PRO": 0.203 + } + }, + { + "model_id": "Solshine/Brimful-merged-replete", + "name": "Brimful-merged-replete", + "developer": "Solshine", + "scores": { + "IFEval": 0.1761, + "BBH": 0.2883, + "MATH Level 5": 0.003, + "GPQA": 0.2576, + "MUSR": 0.3421, + "MMLU-PRO": 0.1085 + } + }, + { + "model_id": "Solshine/Llama-3-1-big-thoughtful-passthrough-merge-2", + "name": "Llama-3-1-big-thoughtful-passthrough-merge-2", + "developer": "Solshine", + "scores": { + "IFEval": 0.2547, + "BBH": 0.3209, + "MATH Level 5": 0.0106, + "GPQA": 0.2592, + "MUSR": 0.3889, + "MMLU-PRO": 0.1185 + } + }, + { + "model_id": "Sorawiz/Gemma-9B-Base", + "name": "Gemma-9B-Base", + "developer": "Sorawiz", + "scores": { + "IFEval": 0.1667, + "BBH": 0.593, + "MATH Level 5": 0.0982, + "GPQA": 0.3398, + "MUSR": 0.4045, + "MMLU-PRO": 0.4235 + } + }, + { + "model_id": "Sorawiz/Gemma-Creative-9B-Base", + "name": "Gemma-Creative-9B-Base", + "developer": "Sorawiz", + "scores": { + "IFEval": 0.1515, + "BBH": 0.5459, + "MATH Level 5": 0.0778, + "GPQA": 0.3297, + "MUSR": 0.4019, + "MMLU-PRO": 0.4008 + } + }, + { + "model_id": "Sourjayon/DeepSeek-R1-8b-Sify", + "name": "DeepSeek-R1-8b-Sify", + "developer": "Sourjayon", + "scores": { + "IFEval": 0.3679, + "BBH": 0.3379, + "MATH Level 5": 0.2447, + "GPQA": 0.2525, + "MUSR": 0.3303, + "MMLU-PRO": 0.1981 + } + }, + { + "model_id": "Sourjayon/DeepSeek-R1-ForumNXT", + "name": "DeepSeek-R1-ForumNXT", + "developer": "Sourjayon", + "scores": { + "IFEval": 0.2603, + "BBH": 0.331, + "MATH Level 5": 0.2576, + "GPQA": 0.2743, + "MUSR": 0.3392, + "MMLU-PRO": 0.1648 + } + }, + { + "model_id": "SpaceYL/ECE_Poirot", + "name": "ECE_Poirot", + "developer": "SpaceYL", + "scores": { + "IFEval": 0.3107, + "BBH": 0.4262, + "MATH Level 5": 0.0914, + "GPQA": 0.2978, + "MUSR": 0.4026, + "MMLU-PRO": 0.2883 + } + }, + { + "model_id": "Spestly/Athena-1-3B", + "name": "Athena-1-3B", + "developer": "Spestly", + "scores": { + "IFEval": 0.5569, + "BBH": 0.4702, + "MATH Level 5": 0.2379, + "GPQA": 0.2936, + "MUSR": 0.4362, + "MMLU-PRO": 0.3519 + } + }, + { + "model_id": "Spestly/Atlas-Pro-1.5B-Preview", + "name": "Atlas-Pro-1.5B-Preview", + "developer": "Spestly", + "scores": { + "IFEval": 0.243, + "BBH": 0.3499, + "MATH Level 5": 0.3195, + "GPQA": 0.297, + "MUSR": 0.3354, + "MMLU-PRO": 0.1925 + } + }, + { + "model_id": "Spestly/Atlas-Pro-7B-Preview", + "name": "Atlas-Pro-7B-Preview", + "developer": "Spestly", + "scores": { + "IFEval": 0.3154, + "BBH": 0.4668, + "MATH Level 5": 0.5083, + "GPQA": 0.3372, + "MUSR": 0.3911, + "MMLU-PRO": 0.297 + } + }, + { + "model_id": "Stark2008/GutenLaserPi", + "name": "GutenLaserPi", + "developer": "Stark2008", + "scores": { + "IFEval": 0.4227, + "BBH": 0.5212, + "MATH Level 5": 0.0785, + "GPQA": 0.2869, + "MUSR": 0.462, + "MMLU-PRO": 0.3106 + } + }, + { + "model_id": "Stark2008/LayleleFlamPi", + "name": "LayleleFlamPi", + "developer": "Stark2008", + "scores": { + "IFEval": 0.4284, + "BBH": 0.5116, + "MATH Level 5": 0.0665, + "GPQA": 0.2852, + "MUSR": 0.4608, + "MMLU-PRO": 0.3093 + } + }, + { + "model_id": "Stark2008/VisFlamCat", + "name": "VisFlamCat", + "developer": "Stark2008", + "scores": { + "IFEval": 0.4366, + "BBH": 0.5217, + "MATH Level 5": 0.0763, + "GPQA": 0.2903, + "MUSR": 0.4463, + "MMLU-PRO": 0.3144 + } + }, + { + "model_id": "Steelskull/L3.3-MS-Nevoria-70b", + "name": "L3.3-MS-Nevoria-70b", + "developer": "Steelskull", + "scores": { + "IFEval": 0.6963, + "BBH": 0.6998, + "MATH Level 5": 0.3958, + "GPQA": 0.4706, + "MUSR": 0.4682, + "MMLU-PRO": 0.5535 + } + }, + { + "model_id": "Steelskull/L3.3-Nevoria-R1-70b", + "name": "L3.3-Nevoria-R1-70b", + "developer": "Steelskull", + "scores": { + "IFEval": 0.6024, + "BBH": 0.6972, + "MATH Level 5": 0.463, + "GPQA": 0.469, + "MUSR": 0.4775, + "MMLU-PRO": 0.5463 + } + }, + { + "model_id": "StelleX/Qwen2.5_Math_7B_Cot", + "name": "Qwen2.5_Math_7B_Cot", + "developer": "StelleX", + "scores": { + "IFEval": 0.2143, + "BBH": 0.4313, + "MATH Level 5": 0.3263, + "GPQA": 0.2945, + "MUSR": 0.3924, + "MMLU-PRO": 0.281 + } + }, + { + "model_id": "StelleX/Vorisatex-7B-preview", + "name": "Vorisatex-7B-preview", + "developer": "StelleX", + "scores": { + "IFEval": 0.1515, + "BBH": 0.3112, + "MATH Level 5": 0.0287, + "GPQA": 0.2517, + "MUSR": 0.4192, + "MMLU-PRO": 0.1166 + } + }, + { + "model_id": "SultanR/SmolTulu-1.7b-Instruct", + "name": "SmolTulu-1.7b-Instruct", + "developer": "SultanR", + "scores": { + "IFEval": 0.6541, + "BBH": 0.3713, + "MATH Level 5": 0.0793, + "GPQA": 0.2693, + "MUSR": 0.354, + "MMLU-PRO": 0.171 + } + }, + { + "model_id": "SultanR/SmolTulu-1.7b-Reinforced", + "name": "SmolTulu-1.7b-Reinforced", + "developer": "SultanR", + "scores": { + "IFEval": 0.6791, + "BBH": 0.3552, + "MATH Level 5": 0.0718, + "GPQA": 0.276, + "MUSR": 0.3406, + "MMLU-PRO": 0.1763 + } + }, + { + "model_id": "SultanR/SmolTulu-1.7b-it-v0", + "name": "SmolTulu-1.7b-it-v0", + "developer": "SultanR", + "scores": { + "IFEval": 0.6541, + "BBH": 0.3713, + "MATH Level 5": 0.0793, + "GPQA": 0.2693, + "MUSR": 0.354, + "MMLU-PRO": 0.171 + } + }, + { + "model_id": "Supichi/BBA-123", + "name": "BBA-123", + "developer": "Supichi", + "scores": { + "IFEval": 0.208, + "BBH": 0.292, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.3499, + "MMLU-PRO": 0.1167 + } + }, + { + "model_id": "Supichi/BBA99", + "name": "BBA99", + "developer": "Supichi", + "scores": { + "IFEval": 0.1407, + "BBH": 0.2769, + "MATH Level 5": 0.0, + "GPQA": 0.2634, + "MUSR": 0.3218, + "MMLU-PRO": 0.1112 + } + }, + { + "model_id": "Supichi/BBAIK29", + "name": "BBAIK29", + "developer": "Supichi", + "scores": { + "IFEval": 0.4588, + "BBH": 0.559, + "MATH Level 5": 0.3678, + "GPQA": 0.3121, + "MUSR": 0.4501, + "MMLU-PRO": 0.4469 + } + }, + { + "model_id": "Supichi/BBAI_135_Gemma", + "name": "BBAI_135_Gemma", + "developer": "Supichi", + "scores": { + "IFEval": 0.0656, + "BBH": 0.3568, + "MATH Level 5": 0.0, + "GPQA": 0.2676, + "MUSR": 0.3805, + "MMLU-PRO": 0.1672 + } + }, + { + "model_id": "Supichi/BBAI_250_Xia0_gZ", + "name": "BBAI_250_Xia0_gZ", + "developer": "Supichi", + "scores": { + "IFEval": 0.4685, + "BBH": 0.5568, + "MATH Level 5": 0.364, + "GPQA": 0.3213, + "MUSR": 0.4579, + "MMLU-PRO": 0.4465 + } + }, + { + "model_id": "Supichi/BBAI_275_Tsunami_gZ", + "name": "BBAI_275_Tsunami_gZ", + "developer": "Supichi", + "scores": { + "IFEval": 0.537, + "BBH": 0.5531, + "MATH Level 5": 0.3285, + "GPQA": 0.3213, + "MUSR": 0.4448, + "MMLU-PRO": 0.4492 + } + }, + { + "model_id": "Supichi/BBAI_525_Tsu_gZ_Xia0", + "name": "BBAI_525_Tsu_gZ_Xia0", + "developer": "Supichi", + "scores": { + "IFEval": 0.5339, + "BBH": 0.5562, + "MATH Level 5": 0.3429, + "GPQA": 0.3121, + "MUSR": 0.4474, + "MMLU-PRO": 0.4477 + } + }, + { + "model_id": "Supichi/BBAI_78B_Calme_3_1_Ties", + "name": "BBAI_78B_Calme_3_1_Ties", + "developer": "Supichi", + "scores": { + "IFEval": 0.1828, + "BBH": 0.2828, + "MATH Level 5": 0.0, + "GPQA": 0.229, + "MUSR": 0.31, + "MMLU-PRO": 0.1144 + } + }, + { + "model_id": "Supichi/BBAI_QWEEN_V000000_LUMEN_14B", + "name": "BBAI_QWEEN_V000000_LUMEN_14B", + "developer": "Supichi", + "scores": { + "IFEval": 0.1815, + "BBH": 0.2297, + "MATH Level 5": 0.0, + "GPQA": 0.2315, + "MUSR": 0.3445, + "MMLU-PRO": 0.116 + } + }, + { + "model_id": "Supichi/HF_TOKEN", + "name": "HF_TOKEN", + "developer": "Supichi", + "scores": { + "IFEval": 0.138, + "BBH": 0.2764, + "MATH Level 5": 0.0008, + "GPQA": 0.2634, + "MUSR": 0.3272, + "MMLU-PRO": 0.111 + } + }, + { + "model_id": "Supichi/NJS26", + "name": "NJS26", + "developer": "Supichi", + "scores": { + "IFEval": 0.0448, + "BBH": 0.478, + "MATH Level 5": 0.0325, + "GPQA": 0.318, + "MUSR": 0.3854, + "MMLU-PRO": 0.3037 + } + }, + { + "model_id": "Svak/MN-12B-Inferor-v0.0", + "name": "MN-12B-Inferor-v0.0", + "developer": "Svak", + "scores": { + "IFEval": 0.5708, + "BBH": 0.5195, + "MATH Level 5": 0.102, + "GPQA": 0.3087, + "MUSR": 0.4639, + "MMLU-PRO": 0.3559 + } + }, + { + "model_id": "Svak/MN-12B-Inferor-v0.1", + "name": "MN-12B-Inferor-v0.1", + "developer": "Svak", + "scores": { + "IFEval": 0.6347, + "BBH": 0.5147, + "MATH Level 5": 0.1261, + "GPQA": 0.3255, + "MUSR": 0.4351, + "MMLU-PRO": 0.3662 + } + }, + { + "model_id": "Syed-Hasan-8503/Phi-3-mini-4K-instruct-cpo-simpo", + "name": "Phi-3-mini-4K-instruct-cpo-simpo", + "developer": "Syed-Hasan-8503", + "scores": { + "IFEval": 0.5714, + "BBH": 0.5682, + "MATH Level 5": 0.1571, + "GPQA": 0.3305, + "MUSR": 0.3964, + "MMLU-PRO": 0.3861 + } + }, + { + "model_id": "T145/KRONOS-8B-V1-P1", + "name": "KRONOS-8B-V1-P1", + "developer": "T145", + "scores": { + "IFEval": 0.785, + "BBH": 0.5085, + "MATH Level 5": 0.1979, + "GPQA": 0.2953, + "MUSR": 0.3881, + "MMLU-PRO": 0.376 + } + }, + { + "model_id": "T145/KRONOS-8B-V1-P2", + "name": "KRONOS-8B-V1-P2", + "developer": "T145", + "scores": { + "IFEval": 0.6724, + "BBH": 0.4772, + "MATH Level 5": 0.1601, + "GPQA": 0.2919, + "MUSR": 0.3568, + "MMLU-PRO": 0.3453 + } + }, + { + "model_id": "T145/KRONOS-8B-V1-P3", + "name": "KRONOS-8B-V1-P3", + "developer": "T145", + "scores": { + "IFEval": 0.7137, + "BBH": 0.5128, + "MATH Level 5": 0.1926, + "GPQA": 0.2601, + "MUSR": 0.3616, + "MMLU-PRO": 0.3405 + } + }, + { + "model_id": "T145/KRONOS-8B-V2", + "name": "KRONOS-8B-V2", + "developer": "T145", + "scores": { + "IFEval": 0.518, + "BBH": 0.5133, + "MATH Level 5": 0.2266, + "GPQA": 0.2987, + "MUSR": 0.3829, + "MMLU-PRO": 0.3738 + } + }, + { + "model_id": "T145/KRONOS-8B-V3", + "name": "KRONOS-8B-V3", + "developer": "T145", + "scores": { + "IFEval": 0.5475, + "BBH": 0.5119, + "MATH Level 5": 0.2598, + "GPQA": 0.2886, + "MUSR": 0.3922, + "MMLU-PRO": 0.3738 + } + }, + { + "model_id": "T145/KRONOS-8B-V4", + "name": "KRONOS-8B-V4", + "developer": "T145", + "scores": { + "IFEval": 0.7889, + "BBH": 0.5092, + "MATH Level 5": 0.1949, + "GPQA": 0.2894, + "MUSR": 0.383, + "MMLU-PRO": 0.3786 + } + }, + { + "model_id": "T145/KRONOS-8B-V5", + "name": "KRONOS-8B-V5", + "developer": "T145", + "scores": { + "IFEval": 0.5405, + "BBH": 0.5089, + "MATH Level 5": 0.2689, + "GPQA": 0.2903, + "MUSR": 0.4055, + "MMLU-PRO": 0.3759 + } + }, + { + "model_id": "T145/KRONOS-8B-V6", + "name": "KRONOS-8B-V6", + "developer": "T145", + "scores": { + "IFEval": 0.7022, + "BBH": 0.5034, + "MATH Level 5": 0.2598, + "GPQA": 0.2794, + "MUSR": 0.4121, + "MMLU-PRO": 0.3501 + } + }, + { + "model_id": "T145/KRONOS-8B-V7", + "name": "KRONOS-8B-V7", + "developer": "T145", + "scores": { + "IFEval": 0.3529, + "BBH": 0.4526, + "MATH Level 5": 0.111, + "GPQA": 0.2668, + "MUSR": 0.3671, + "MMLU-PRO": 0.2697 + } + }, + { + "model_id": "T145/KRONOS-8B-V8", + "name": "KRONOS-8B-V8", + "developer": "T145", + "scores": { + "IFEval": 0.777, + "BBH": 0.5094, + "MATH Level 5": 0.2047, + "GPQA": 0.2894, + "MUSR": 0.3869, + "MMLU-PRO": 0.3782 + } + }, + { + "model_id": "T145/KRONOS-8B-V9", + "name": "KRONOS-8B-V9", + "developer": "T145", + "scores": { + "IFEval": 0.7856, + "BBH": 0.5099, + "MATH Level 5": 0.1986, + "GPQA": 0.2961, + "MUSR": 0.3868, + "MMLU-PRO": 0.3752 + } + }, + { + "model_id": "T145/Llama-3.1-8B-Instruct-Zeus", + "name": "Llama-3.1-8B-Instruct-Zeus", + "developer": "T145", + "scores": { + "IFEval": 0.7941, + "BBH": 0.5174, + "MATH Level 5": 0.1956, + "GPQA": 0.3012, + "MUSR": 0.3976, + "MMLU-PRO": 0.3893 + } + }, + { + "model_id": "T145/Llama-3.1-8B-Zeus", + "name": "Llama-3.1-8B-Zeus", + "developer": "T145", + "scores": { + "IFEval": 0.3518, + "BBH": 0.3671, + "MATH Level 5": 0.0144, + "GPQA": 0.2651, + "MUSR": 0.3316, + "MMLU-PRO": 0.1332 + } + }, + { + "model_id": "T145/Meta-Llama-3.1-8B-Instruct-TIES", + "name": "Meta-Llama-3.1-8B-Instruct-TIES", + "developer": "T145", + "scores": { + "IFEval": 0.5424, + "BBH": 0.507, + "MATH Level 5": 0.21, + "GPQA": 0.2945, + "MUSR": 0.3843, + "MMLU-PRO": 0.378 + } + }, + { + "model_id": "T145/ZEUS-8B-V10", + "name": "ZEUS-8B-V10", + "developer": "T145", + "scores": { + "IFEval": 0.7707, + "BBH": 0.527, + "MATH Level 5": 0.2115, + "GPQA": 0.3247, + "MUSR": 0.3898, + "MMLU-PRO": 0.3904 + } + }, + { + "model_id": "T145/ZEUS-8B-V11", + "name": "ZEUS-8B-V11", + "developer": "T145", + "scores": { + "IFEval": 0.81, + "BBH": 0.5162, + "MATH Level 5": 0.1964, + "GPQA": 0.3146, + "MUSR": 0.3807, + "MMLU-PRO": 0.3884 + } + }, + { + "model_id": "T145/ZEUS-8B-V12", + "name": "ZEUS-8B-V12", + "developer": "T145", + "scores": { + "IFEval": 0.7816, + "BBH": 0.5254, + "MATH Level 5": 0.2115, + "GPQA": 0.3205, + "MUSR": 0.3858, + "MMLU-PRO": 0.3912 + } + }, + { + "model_id": "T145/ZEUS-8B-V13", + "name": "ZEUS-8B-V13", + "developer": "T145", + "scores": { + "IFEval": 0.7904, + "BBH": 0.5277, + "MATH Level 5": 0.2137, + "GPQA": 0.3238, + "MUSR": 0.3845, + "MMLU-PRO": 0.3911 + } + }, + { + "model_id": "T145/ZEUS-8B-V13-abliterated", + "name": "ZEUS-8B-V13-abliterated", + "developer": "T145", + "scores": { + "IFEval": 0.7878, + "BBH": 0.5198, + "MATH Level 5": 0.179, + "GPQA": 0.3112, + "MUSR": 0.3871, + "MMLU-PRO": 0.3872 + } + }, + { + "model_id": "T145/ZEUS-8B-V14", + "name": "ZEUS-8B-V14", + "developer": "T145", + "scores": { + "IFEval": 0.7709, + "BBH": 0.5275, + "MATH Level 5": 0.213, + "GPQA": 0.3205, + "MUSR": 0.3844, + "MMLU-PRO": 0.3914 + } + }, + { + "model_id": "T145/ZEUS-8B-V15", + "name": "ZEUS-8B-V15", + "developer": "T145", + "scores": { + "IFEval": 0.7013, + "BBH": 0.5538, + "MATH Level 5": 0.2304, + "GPQA": 0.276, + "MUSR": 0.402, + "MMLU-PRO": 0.4059 + } + }, + { + "model_id": "T145/ZEUS-8B-V16", + "name": "ZEUS-8B-V16", + "developer": "T145", + "scores": { + "IFEval": 0.7925, + "BBH": 0.5266, + "MATH Level 5": 0.2205, + "GPQA": 0.307, + "MUSR": 0.3951, + "MMLU-PRO": 0.3926 + } + }, + { + "model_id": "T145/ZEUS-8B-V17", + "name": "ZEUS-8B-V17", + "developer": "T145", + "scores": { + "IFEval": 0.7941, + "BBH": 0.5251, + "MATH Level 5": 0.2243, + "GPQA": 0.3221, + "MUSR": 0.4016, + "MMLU-PRO": 0.3935 + } + }, + { + "model_id": "T145/ZEUS-8B-V17-abliterated", + "name": "ZEUS-8B-V17-abliterated", + "developer": "T145", + "scores": { + "IFEval": 0.7576, + "BBH": 0.52, + "MATH Level 5": 0.0438, + "GPQA": 0.3037, + "MUSR": 0.4269, + "MMLU-PRO": 0.3622 + } + }, + { + "model_id": "T145/ZEUS-8B-V17-abliterated-V2", + "name": "ZEUS-8B-V17-abliterated-V2", + "developer": "T145", + "scores": { + "IFEval": 0.6532, + "BBH": 0.4928, + "MATH Level 5": 0.1118, + "GPQA": 0.2735, + "MUSR": 0.3407, + "MMLU-PRO": 0.3402 + } + }, + { + "model_id": "T145/ZEUS-8B-V17-abliterated-V4", + "name": "ZEUS-8B-V17-abliterated-V4", + "developer": "T145", + "scores": { + "IFEval": 0.7228, + "BBH": 0.5169, + "MATH Level 5": 0.0937, + "GPQA": 0.2836, + "MUSR": 0.4187, + "MMLU-PRO": 0.3774 + } + }, + { + "model_id": "T145/ZEUS-8B-V18", + "name": "ZEUS-8B-V18", + "developer": "T145", + "scores": { + "IFEval": 0.7834, + "BBH": 0.527, + "MATH Level 5": 0.2183, + "GPQA": 0.3213, + "MUSR": 0.4043, + "MMLU-PRO": 0.3942 + } + }, + { + "model_id": "T145/ZEUS-8B-V19", + "name": "ZEUS-8B-V19", + "developer": "T145", + "scores": { + "IFEval": 0.7883, + "BBH": 0.5276, + "MATH Level 5": 0.2205, + "GPQA": 0.3221, + "MUSR": 0.4043, + "MMLU-PRO": 0.3934 + } + }, + { + "model_id": "T145/ZEUS-8B-V2", + "name": "ZEUS-8B-V2", + "developer": "T145", + "scores": { + "IFEval": 0.8029, + "BBH": 0.5194, + "MATH Level 5": 0.216, + "GPQA": 0.302, + "MUSR": 0.391, + "MMLU-PRO": 0.3896 + } + }, + { + "model_id": "T145/ZEUS-8B-V2-ORPO", + "name": "ZEUS-8B-V2-ORPO", + "developer": "T145", + "scores": { + "IFEval": 0.7187, + "BBH": 0.5075, + "MATH Level 5": 0.1828, + "GPQA": 0.3104, + "MUSR": 0.3935, + "MMLU-PRO": 0.3678 + } + }, + { + "model_id": "T145/ZEUS-8B-V2-abliterated", + "name": "ZEUS-8B-V2-abliterated", + "developer": "T145", + "scores": { + "IFEval": 0.7895, + "BBH": 0.5129, + "MATH Level 5": 0.2115, + "GPQA": 0.3129, + "MUSR": 0.3911, + "MMLU-PRO": 0.3825 + } + }, + { + "model_id": "T145/ZEUS-8B-V20", + "name": "ZEUS-8B-V20", + "developer": "T145", + "scores": { + "IFEval": 0.7956, + "BBH": 0.5244, + "MATH Level 5": 0.219, + "GPQA": 0.323, + "MUSR": 0.4043, + "MMLU-PRO": 0.393 + } + }, + { + "model_id": "T145/ZEUS-8B-V21", + "name": "ZEUS-8B-V21", + "developer": "T145", + "scores": { + "IFEval": 0.3785, + "BBH": 0.3398, + "MATH Level 5": 0.1594, + "GPQA": 0.2643, + "MUSR": 0.3262, + "MMLU-PRO": 0.1714 + } + }, + { + "model_id": "T145/ZEUS-8B-V22", + "name": "ZEUS-8B-V22", + "developer": "T145", + "scores": { + "IFEval": 0.7995, + "BBH": 0.5245, + "MATH Level 5": 0.2228, + "GPQA": 0.328, + "MUSR": 0.399, + "MMLU-PRO": 0.3938 + } + }, + { + "model_id": "T145/ZEUS-8B-V23", + "name": "ZEUS-8B-V23", + "developer": "T145", + "scores": { + "IFEval": 0.7621, + "BBH": 0.5195, + "MATH Level 5": 0.182, + "GPQA": 0.3096, + "MUSR": 0.3922, + "MMLU-PRO": 0.3666 + } + }, + { + "model_id": "T145/ZEUS-8B-V24", + "name": "ZEUS-8B-V24", + "developer": "T145", + "scores": { + "IFEval": 0.6, + "BBH": 0.4778, + "MATH Level 5": 0.1458, + "GPQA": 0.2617, + "MUSR": 0.3729, + "MMLU-PRO": 0.3285 + } + }, + { + "model_id": "T145/ZEUS-8B-V25", + "name": "ZEUS-8B-V25", + "developer": "T145", + "scores": { + "IFEval": 0.332, + "BBH": 0.4547, + "MATH Level 5": 0.2039, + "GPQA": 0.2643, + "MUSR": 0.3488, + "MMLU-PRO": 0.2885 + } + }, + { + "model_id": "T145/ZEUS-8B-V26", + "name": "ZEUS-8B-V26", + "developer": "T145", + "scores": { + "IFEval": 0.6708, + "BBH": 0.5232, + "MATH Level 5": 0.1246, + "GPQA": 0.2953, + "MUSR": 0.4016, + "MMLU-PRO": 0.3907 + } + }, + { + "model_id": "T145/ZEUS-8B-V27", + "name": "ZEUS-8B-V27", + "developer": "T145", + "scores": { + "IFEval": 0.6544, + "BBH": 0.523, + "MATH Level 5": 0.1344, + "GPQA": 0.3079, + "MUSR": 0.3977, + "MMLU-PRO": 0.3902 + } + }, + { + "model_id": "T145/ZEUS-8B-V28", + "name": "ZEUS-8B-V28", + "developer": "T145", + "scores": { + "IFEval": 0.6353, + "BBH": 0.5254, + "MATH Level 5": 0.1269, + "GPQA": 0.3037, + "MUSR": 0.3896, + "MMLU-PRO": 0.3902 + } + }, + { + "model_id": "T145/ZEUS-8B-V29", + "name": "ZEUS-8B-V29", + "developer": "T145", + "scores": { + "IFEval": 0.7418, + "BBH": 0.5253, + "MATH Level 5": 0.1601, + "GPQA": 0.3263, + "MUSR": 0.4003, + "MMLU-PRO": 0.392 + } + }, + { + "model_id": "T145/ZEUS-8B-V2L1", + "name": "ZEUS-8B-V2L1", + "developer": "T145", + "scores": { + "IFEval": 0.3192, + "BBH": 0.5013, + "MATH Level 5": 0.1239, + "GPQA": 0.3129, + "MUSR": 0.3882, + "MMLU-PRO": 0.3638 + } + }, + { + "model_id": "T145/ZEUS-8B-V2L2", + "name": "ZEUS-8B-V2L2", + "developer": "T145", + "scores": { + "IFEval": 0.8021, + "BBH": 0.5203, + "MATH Level 5": 0.2017, + "GPQA": 0.2995, + "MUSR": 0.3975, + "MMLU-PRO": 0.3884 + } + }, + { + "model_id": "T145/ZEUS-8B-V3", + "name": "ZEUS-8B-V3", + "developer": "T145", + "scores": { + "IFEval": 0.7887, + "BBH": 0.5265, + "MATH Level 5": 0.1677, + "GPQA": 0.3221, + "MUSR": 0.4017, + "MMLU-PRO": 0.3804 + } + }, + { + "model_id": "T145/ZEUS-8B-V30", + "name": "ZEUS-8B-V30", + "developer": "T145", + "scores": { + "IFEval": 0.7436, + "BBH": 0.5243, + "MATH Level 5": 0.1586, + "GPQA": 0.3205, + "MUSR": 0.4029, + "MMLU-PRO": 0.3944 + } + }, + { + "model_id": "T145/ZEUS-8B-V4", + "name": "ZEUS-8B-V4", + "developer": "T145", + "scores": { + "IFEval": 0.7807, + "BBH": 0.5246, + "MATH Level 5": 0.1926, + "GPQA": 0.307, + "MUSR": 0.4029, + "MMLU-PRO": 0.3788 + } + }, + { + "model_id": "T145/ZEUS-8B-V6", + "name": "ZEUS-8B-V6", + "developer": "T145", + "scores": { + "IFEval": 0.7838, + "BBH": 0.524, + "MATH Level 5": 0.2024, + "GPQA": 0.3045, + "MUSR": 0.4068, + "MMLU-PRO": 0.3759 + } + }, + { + "model_id": "T145/ZEUS-8B-V7", + "name": "ZEUS-8B-V7", + "developer": "T145", + "scores": { + "IFEval": 0.7786, + "BBH": 0.507, + "MATH Level 5": 0.148, + "GPQA": 0.297, + "MUSR": 0.4162, + "MMLU-PRO": 0.3812 + } + }, + { + "model_id": "T145/ZEUS-8B-V8", + "name": "ZEUS-8B-V8", + "developer": "T145", + "scores": { + "IFEval": 0.7914, + "BBH": 0.5065, + "MATH Level 5": 0.1329, + "GPQA": 0.2878, + "MUSR": 0.4214, + "MMLU-PRO": 0.3761 + } + }, + { + "model_id": "T145/ZEUS-8B-V9", + "name": "ZEUS-8B-V9", + "developer": "T145", + "scores": { + "IFEval": 0.5551, + "BBH": 0.5207, + "MATH Level 5": 0.2137, + "GPQA": 0.2911, + "MUSR": 0.3949, + "MMLU-PRO": 0.3901 + } + }, + { + "model_id": "T145/qwen-2.5-3B-merge-test", + "name": "qwen-2.5-3B-merge-test", + "developer": "T145", + "scores": { + "IFEval": 0.5751, + "BBH": 0.4842, + "MATH Level 5": 0.3202, + "GPQA": 0.2852, + "MUSR": 0.4007, + "MMLU-PRO": 0.329 + } + }, + { + "model_id": "THUDM/glm-4-9b", + "name": "glm-4-9b", + "developer": "THUDM", + "scores": { + "IFEval": 0.1426, + "BBH": 0.5528, + "MATH Level 5": 0.0, + "GPQA": 0.3163, + "MUSR": 0.4386, + "MMLU-PRO": 0.4145 + } + }, + { + "model_id": "THUDM/glm-4-9b-chat", + "name": "glm-4-9b-chat", + "developer": "THUDM", + "scores": { + "IFEval": 0.0, + "BBH": 0.4736, + "MATH Level 5": 0.0, + "GPQA": 0.3138, + "MUSR": 0.3994, + "MMLU-PRO": 0.3167 + } + }, + { + "model_id": "THUDM/glm-4-9b-chat-1m", + "name": "glm-4-9b-chat-1m", + "developer": "THUDM", + "scores": { + "IFEval": 0.0, + "BBH": 0.418, + "MATH Level 5": 0.0, + "GPQA": 0.3037, + "MUSR": 0.3795, + "MMLU-PRO": 0.3163 + } + }, + { + "model_id": "THUDM/glm-4-9b-chat-1m-hf", + "name": "glm-4-9b-chat-1m-hf", + "developer": "THUDM", + "scores": { + "IFEval": 0.5341, + "BBH": 0.3901, + "MATH Level 5": 0.0483, + "GPQA": 0.2919, + "MUSR": 0.3689, + "MMLU-PRO": 0.1814 + } + }, + { + "model_id": "THUDM/glm-4-9b-chat-hf", + "name": "glm-4-9b-chat-hf", + "developer": "THUDM", + "scores": { + "IFEval": 0.6513, + "BBH": 0.4432, + "MATH Level 5": 0.0846, + "GPQA": 0.3029, + "MUSR": 0.3593, + "MMLU-PRO": 0.2774 + } + }, + { + "model_id": "TIGER-Lab/AceCodeRM-7B", + "name": "AceCodeRM-7B", + "developer": "TIGER-Lab", + "scores": { + "IFEval": 0.5855, + "BBH": 0.4773, + "MATH Level 5": 0.3467, + "GPQA": 0.3045, + "MUSR": 0.4192, + "MMLU-PRO": 0.3361 + } + }, + { + "model_id": "TIGER-Lab/AceCoder-Qwen2.5-7B-Ins-Rule", + "name": "AceCoder-Qwen2.5-7B-Ins-Rule", + "developer": "TIGER-Lab", + "scores": { + "IFEval": 0.7424, + "BBH": 0.5404, + "MATH Level 5": 0.4992, + "GPQA": 0.3012, + "MUSR": 0.398, + "MMLU-PRO": 0.4322 + } + }, + { + "model_id": "TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Base-Rule", + "name": "AceCoder-Qwen2.5-Coder-7B-Base-Rule", + "developer": "TIGER-Lab", + "scores": { + "IFEval": 0.4408, + "BBH": 0.4902, + "MATH Level 5": 0.2017, + "GPQA": 0.2718, + "MUSR": 0.3449, + "MMLU-PRO": 0.3745 + } + }, + { + "model_id": "TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Ins-Rule", + "name": "AceCoder-Qwen2.5-Coder-7B-Ins-Rule", + "developer": "TIGER-Lab", + "scores": { + "IFEval": 0.6222, + "BBH": 0.5089, + "MATH Level 5": 0.3603, + "GPQA": 0.2777, + "MUSR": 0.4046, + "MMLU-PRO": 0.3428 + } + }, + { + "model_id": "TIGER-Lab/MAmmoTH2-7B-Plus", + "name": "MAmmoTH2-7B-Plus", + "developer": "TIGER-Lab", + "scores": { + "IFEval": 0.5575, + "BBH": 0.4235, + "MATH Level 5": 0.1858, + "GPQA": 0.2802, + "MUSR": 0.4124, + "MMLU-PRO": 0.3017 + } + }, + { + "model_id": "TIGER-Lab/Qwen2.5-Math-7B-CFT", + "name": "Qwen2.5-Math-7B-CFT", + "developer": "TIGER-Lab", + "scores": { + "IFEval": 0.2777, + "BBH": 0.4637, + "MATH Level 5": 0.5574, + "GPQA": 0.2861, + "MUSR": 0.3887, + "MMLU-PRO": 0.2945 + } + }, + { + "model_id": "TTTXXX01/Mistral-7B-Base-SimPO2-5e-7", + "name": "Mistral-7B-Base-SimPO2-5e-7", + "developer": "TTTXXX01", + "scores": { + "IFEval": 0.4392, + "BBH": 0.432, + "MATH Level 5": 0.0264, + "GPQA": 0.2978, + "MUSR": 0.3604, + "MMLU-PRO": 0.2766 + } + }, + { + "model_id": "Tarek07/Progenitor-V1.1-LLaMa-70B", + "name": "Progenitor-V1.1-LLaMa-70B", + "developer": "Tarek07", + "scores": { + "IFEval": 0.6906, + "BBH": 0.6971, + "MATH Level 5": 0.3573, + "GPQA": 0.4581, + "MUSR": 0.4736, + "MMLU-PRO": 0.5465 + } + }, + { + "model_id": "Tarek07/Thalassic-Alpha-LLaMa-70B", + "name": "Thalassic-Alpha-LLaMa-70B", + "developer": "Tarek07", + "scores": { + "IFEval": 0.7003, + "BBH": 0.694, + "MATH Level 5": 0.315, + "GPQA": 0.4438, + "MUSR": 0.4802, + "MMLU-PRO": 0.5435 + } + }, + { + "model_id": "TeeZee/DoubleBagel-57B-v1.0", + "name": "DoubleBagel-57B-v1.0", + "developer": "TeeZee", + "scores": { + "IFEval": 0.2336, + "BBH": 0.3251, + "MATH Level 5": 0.0098, + "GPQA": 0.276, + "MUSR": 0.4315, + "MMLU-PRO": 0.1478 + } + }, + { + "model_id": "Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa-2.0", + "name": "Indic-gemma-2b-finetuned-sft-Navarasa-2.0", + "developer": "Telugu-LLM-Labs", + "scores": { + "IFEval": 0.2103, + "BBH": 0.3241, + "MATH Level 5": 0.0272, + "GPQA": 0.2433, + "MUSR": 0.3899, + "MMLU-PRO": 0.1279 + } + }, + { + "model_id": "Telugu-LLM-Labs/Indic-gemma-7b-finetuned-sft-Navarasa-2.0", + "name": "Indic-gemma-7b-finetuned-sft-Navarasa-2.0", + "developer": "Telugu-LLM-Labs", + "scores": { + "IFEval": 0.3237, + "BBH": 0.4023, + "MATH Level 5": 0.0257, + "GPQA": 0.2701, + "MUSR": 0.4083, + "MMLU-PRO": 0.235 + } + }, + { + "model_id": "TencentARC/LLaMA-Pro-8B", + "name": "LLaMA-Pro-8B", + "developer": "TencentARC", + "scores": { + "IFEval": 0.2277, + "BBH": 0.3484, + "MATH Level 5": 0.0189, + "GPQA": 0.2601, + "MUSR": 0.4018, + "MMLU-PRO": 0.1811 + } + }, + { + "model_id": "TencentARC/LLaMA-Pro-8B-Instruct", + "name": "LLaMA-Pro-8B-Instruct", + "developer": "TencentARC", + "scores": { + "IFEval": 0.4486, + "BBH": 0.4224, + "MATH Level 5": 0.0249, + "GPQA": 0.2743, + "MUSR": 0.419, + "MMLU-PRO": 0.1946 + } + }, + { + "model_id": "TencentARC/MetaMath-Mistral-Pro", + "name": "MetaMath-Mistral-Pro", + "developer": "TencentARC", + "scores": { + "IFEval": 0.2119, + "BBH": 0.4413, + "MATH Level 5": 0.0763, + "GPQA": 0.2693, + "MUSR": 0.3524, + "MMLU-PRO": 0.2472 + } + }, + { + "model_id": "TencentARC/Mistral_Pro_8B_v0.1", + "name": "Mistral_Pro_8B_v0.1", + "developer": "TencentARC", + "scores": { + "IFEval": 0.2115, + "BBH": 0.4526, + "MATH Level 5": 0.0566, + "GPQA": 0.2802, + "MUSR": 0.4242, + "MMLU-PRO": 0.2765 + } + }, + { + "model_id": "TheDrummer/Cydonia-22B-v1.2", + "name": "Cydonia-22B-v1.2", + "developer": "TheDrummer", + "scores": { + "IFEval": 0.5635, + "BBH": 0.5809, + "MATH Level 5": 0.2032, + "GPQA": 0.3305, + "MUSR": 0.4022, + "MMLU-PRO": 0.4141 + } + }, + { + "model_id": "TheDrummer/Gemmasutra-9B-v1", + "name": "Gemmasutra-9B-v1", + "developer": "TheDrummer", + "scores": { + "IFEval": 0.2416, + "BBH": 0.5887, + "MATH Level 5": 0.0831, + "GPQA": 0.3104, + "MUSR": 0.4846, + "MMLU-PRO": 0.4045 + } + }, + { + "model_id": "TheDrummer/Gemmasutra-Mini-2B-v1", + "name": "Gemmasutra-Mini-2B-v1", + "developer": "TheDrummer", + "scores": { + "IFEval": 0.2549, + "BBH": 0.3575, + "MATH Level 5": 0.0378, + "GPQA": 0.271, + "MUSR": 0.349, + "MMLU-PRO": 0.2055 + } + }, + { + "model_id": "TheDrummer/Llama-3SOME-8B-v2", + "name": "Llama-3SOME-8B-v2", + "developer": "TheDrummer", + "scores": { + "IFEval": 0.4508, + "BBH": 0.5203, + "MATH Level 5": 0.0937, + "GPQA": 0.302, + "MUSR": 0.3833, + "MMLU-PRO": 0.3753 + } + }, + { + "model_id": "TheDrummer/Ministrations-8B-v1", + "name": "Ministrations-8B-v1", + "developer": "TheDrummer", + "scores": { + "IFEval": 0.2822, + "BBH": 0.4877, + "MATH Level 5": 0.1843, + "GPQA": 0.3247, + "MUSR": 0.4449, + "MMLU-PRO": 0.3644 + } + }, + { + "model_id": "TheDrummer/Rocinante-12B-v1", + "name": "Rocinante-12B-v1", + "developer": "TheDrummer", + "scores": { + "IFEval": 0.6076, + "BBH": 0.5065, + "MATH Level 5": 0.1269, + "GPQA": 0.2911, + "MUSR": 0.4017, + "MMLU-PRO": 0.3477 + } + }, + { + "model_id": "TheDrummer/Tiger-Gemma-9B-v1", + "name": "Tiger-Gemma-9B-v1", + "developer": "TheDrummer", + "scores": { + "IFEval": 0.7282, + "BBH": 0.5704, + "MATH Level 5": 0.1835, + "GPQA": 0.3389, + "MUSR": 0.4162, + "MMLU-PRO": 0.4118 + } + }, + { + "model_id": "TheDrummer/Tiger-Gemma-9B-v2", + "name": "Tiger-Gemma-9B-v2", + "developer": "TheDrummer", + "scores": { + "IFEval": 0.6986, + "BBH": 0.5617, + "MATH Level 5": 0.182, + "GPQA": 0.3398, + "MUSR": 0.4084, + "MMLU-PRO": 0.4112 + } + }, + { + "model_id": "TheDrummer/Tiger-Gemma-9B-v3", + "name": "Tiger-Gemma-9B-v3", + "developer": "TheDrummer", + "scores": { + "IFEval": 0.6821, + "BBH": 0.5812, + "MATH Level 5": 0.1624, + "GPQA": 0.3389, + "MUSR": 0.4004, + "MMLU-PRO": 0.4059 + } + }, + { + "model_id": "TheDrunkenSnail/Daughter-of-Rhodia-12B", + "name": "Daughter-of-Rhodia-12B", + "developer": "TheDrunkenSnail", + "scores": { + "IFEval": 0.6904, + "BBH": 0.5179, + "MATH Level 5": 0.1224, + "GPQA": 0.3171, + "MUSR": 0.4348, + "MMLU-PRO": 0.3641 + } + }, + { + "model_id": "TheDrunkenSnail/Mother-of-Rhodia-12B", + "name": "Mother-of-Rhodia-12B", + "developer": "TheDrunkenSnail", + "scores": { + "IFEval": 0.6505, + "BBH": 0.4948, + "MATH Level 5": 0.1224, + "GPQA": 0.2987, + "MUSR": 0.4124, + "MMLU-PRO": 0.3551 + } + }, + { + "model_id": "TheDrunkenSnail/Son-of-Rhodia", + "name": "Son-of-Rhodia", + "developer": "TheDrunkenSnail", + "scores": { + "IFEval": 0.7046, + "BBH": 0.5097, + "MATH Level 5": 0.1314, + "GPQA": 0.3129, + "MUSR": 0.4203, + "MMLU-PRO": 0.3608 + } + }, + { + "model_id": "TheHierophant/Underground-Cognitive-V0.3-test", + "name": "Underground-Cognitive-V0.3-test", + "developer": "TheHierophant", + "scores": { + "IFEval": 0.4808, + "BBH": 0.529, + "MATH Level 5": 0.0589, + "GPQA": 0.2987, + "MUSR": 0.4351, + "MMLU-PRO": 0.3318 + } + }, + { + "model_id": "TheTsar1209/nemo-carpmuscle-v0.1", + "name": "nemo-carpmuscle-v0.1", + "developer": "TheTsar1209", + "scores": { + "IFEval": 0.2276, + "BBH": 0.5084, + "MATH Level 5": 0.0476, + "GPQA": 0.297, + "MUSR": 0.4135, + "MMLU-PRO": 0.3406 + } + }, + { + "model_id": "TheTsar1209/qwen-carpmuscle-r-v0.3", + "name": "qwen-carpmuscle-r-v0.3", + "developer": "TheTsar1209", + "scores": { + "IFEval": 0.4455, + "BBH": 0.6227, + "MATH Level 5": 0.3006, + "GPQA": 0.3507, + "MUSR": 0.4278, + "MMLU-PRO": 0.5103 + } + }, + { + "model_id": "TheTsar1209/qwen-carpmuscle-v0.1", + "name": "qwen-carpmuscle-v0.1", + "developer": "TheTsar1209", + "scores": { + "IFEval": 0.5622, + "BBH": 0.6434, + "MATH Level 5": 0.2628, + "GPQA": 0.344, + "MUSR": 0.4161, + "MMLU-PRO": 0.52 + } + }, + { + "model_id": "TheTsar1209/qwen-carpmuscle-v0.2", + "name": "qwen-carpmuscle-v0.2", + "developer": "TheTsar1209", + "scores": { + "IFEval": 0.5257, + "BBH": 0.6387, + "MATH Level 5": 0.2832, + "GPQA": 0.3557, + "MUSR": 0.4346, + "MMLU-PRO": 0.5147 + } + }, + { + "model_id": "TheTsar1209/qwen-carpmuscle-v0.3", + "name": "qwen-carpmuscle-v0.3", + "developer": "TheTsar1209", + "scores": { + "IFEval": 0.4476, + "BBH": 0.6152, + "MATH Level 5": 0.3134, + "GPQA": 0.3565, + "MUSR": 0.4132, + "MMLU-PRO": 0.5062 + } + }, + { + "model_id": "TheTsar1209/qwen-carpmuscle-v0.4", + "name": "qwen-carpmuscle-v0.4", + "developer": "TheTsar1209", + "scores": { + "IFEval": 0.7202, + "BBH": 0.6454, + "MATH Level 5": 0.2772, + "GPQA": 0.3523, + "MUSR": 0.4516, + "MMLU-PRO": 0.5144 + } + }, + { + "model_id": "TheTsar1209/qwen-carpmuscle-v0.4.1", + "name": "qwen-carpmuscle-v0.4.1", + "developer": "TheTsar1209", + "scores": { + "IFEval": 0.736, + "BBH": 0.6507, + "MATH Level 5": 0.2779, + "GPQA": 0.3456, + "MUSR": 0.4489, + "MMLU-PRO": 0.5191 + } + }, + { + "model_id": "Tijmen2/cosmosage-v3", + "name": "cosmosage-v3", + "developer": "Tijmen2", + "scores": { + "IFEval": 0.4482, + "BBH": 0.4551, + "MATH Level 5": 0.0506, + "GPQA": 0.2827, + "MUSR": 0.4199, + "MMLU-PRO": 0.2486 + } + }, + { + "model_id": "TinyLlama/TinyLlama-1.1B-Chat-v0.1", + "name": "TinyLlama-1.1B-Chat-v0.1", + "developer": "TinyLlama", + "scores": { + "IFEval": 0.1479, + "BBH": 0.3084, + "MATH Level 5": 0.006, + "GPQA": 0.229, + "MUSR": 0.3592, + "MMLU-PRO": 0.1098 + } + }, + { + "model_id": "TinyLlama/TinyLlama-1.1B-Chat-v0.5", + "name": "TinyLlama-1.1B-Chat-v0.5", + "developer": "TinyLlama", + "scores": { + "IFEval": 0.1634, + "BBH": 0.3105, + "MATH Level 5": 0.0038, + "GPQA": 0.2483, + "MUSR": 0.3661, + "MMLU-PRO": 0.1096 + } + }, + { + "model_id": "TinyLlama/TinyLlama-1.1B-Chat-v0.6", + "name": "TinyLlama-1.1B-Chat-v0.6", + "developer": "TinyLlama", + "scores": { + "IFEval": 0.1574, + "BBH": 0.3067, + "MATH Level 5": 0.0159, + "GPQA": 0.2584, + "MUSR": 0.3422, + "MMLU-PRO": 0.1149 + } + }, + { + "model_id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "name": "TinyLlama-1.1B-Chat-v1.0", + "developer": "TinyLlama", + "scores": { + "IFEval": 0.0596, + "BBH": 0.3104, + "MATH Level 5": 0.0151, + "GPQA": 0.25, + "MUSR": 0.3515, + "MMLU-PRO": 0.1101 + } + }, + { + "model_id": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "name": "TinyLlama-1.1B-intermediate-step-1431k-3T", + "developer": "TinyLlama", + "scores": { + "IFEval": 0.2277, + "BBH": 0.3071, + "MATH Level 5": 0.0121, + "GPQA": 0.2525, + "MUSR": 0.338, + "MMLU-PRO": 0.112 + } + }, + { + "model_id": "TinyLlama/TinyLlama_v1.1", + "name": "TinyLlama_v1.1", + "developer": "TinyLlama", + "scores": { + "IFEval": 0.2001, + "BBH": 0.3024, + "MATH Level 5": 0.0121, + "GPQA": 0.2458, + "MUSR": 0.37, + "MMLU-PRO": 0.1049 + } + }, + { + "model_id": "ToastyPigeon/Sto-vo-kor-12B", + "name": "Sto-vo-kor-12B", + "developer": "ToastyPigeon", + "scores": { + "IFEval": 0.5501, + "BBH": 0.5065, + "MATH Level 5": 0.1088, + "GPQA": 0.3054, + "MUSR": 0.3938, + "MMLU-PRO": 0.3398 + } + }, + { + "model_id": "Trappu/Magnum-Picaro-0.7-v2-12b", + "name": "Magnum-Picaro-0.7-v2-12b", + "developer": "Trappu", + "scores": { + "IFEval": 0.3003, + "BBH": 0.5507, + "MATH Level 5": 0.0665, + "GPQA": 0.323, + "MUSR": 0.4727, + "MMLU-PRO": 0.358 + } + }, + { + "model_id": "Trappu/Nemo-Picaro-12B", + "name": "Nemo-Picaro-12B", + "developer": "Trappu", + "scores": { + "IFEval": 0.2577, + "BBH": 0.549, + "MATH Level 5": 0.0846, + "GPQA": 0.3272, + "MUSR": 0.4726, + "MMLU-PRO": 0.3605 + } + }, + { + "model_id": "Tremontaine/L3-12B-Lunaris-v1", + "name": "L3-12B-Lunaris-v1", + "developer": "Tremontaine", + "scores": { + "IFEval": 0.6909, + "BBH": 0.523, + "MATH Level 5": 0.0876, + "GPQA": 0.3096, + "MUSR": 0.3674, + "MMLU-PRO": 0.3775 + } + }, + { + "model_id": "Triangle104/Annunaki-12b", + "name": "Annunaki-12b", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3872, + "BBH": 0.5499, + "MATH Level 5": 0.1216, + "GPQA": 0.3213, + "MUSR": 0.4409, + "MMLU-PRO": 0.3721 + } + }, + { + "model_id": "Triangle104/BigTalker-Lite-8B", + "name": "BigTalker-Lite-8B", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3689, + "BBH": 0.5308, + "MATH Level 5": 0.102, + "GPQA": 0.3104, + "MUSR": 0.4208, + "MMLU-PRO": 0.3431 + } + }, + { + "model_id": "Triangle104/Chatty-Harry_V2.0", + "name": "Chatty-Harry_V2.0", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3326, + "BBH": 0.5319, + "MATH Level 5": 0.139, + "GPQA": 0.323, + "MUSR": 0.4078, + "MMLU-PRO": 0.3683 + } + }, + { + "model_id": "Triangle104/Chatty-Harry_V3.0", + "name": "Chatty-Harry_V3.0", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3675, + "BBH": 0.5526, + "MATH Level 5": 0.1125, + "GPQA": 0.323, + "MUSR": 0.4408, + "MMLU-PRO": 0.3702 + } + }, + { + "model_id": "Triangle104/Chronos-Prism_V1.0", + "name": "Chronos-Prism_V1.0", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3259, + "BBH": 0.5554, + "MATH Level 5": 0.1201, + "GPQA": 0.3096, + "MUSR": 0.4263, + "MMLU-PRO": 0.3673 + } + }, + { + "model_id": "Triangle104/DS-Distilled-Hermes-Llama-3.1", + "name": "DS-Distilled-Hermes-Llama-3.1", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3229, + "BBH": 0.5117, + "MATH Level 5": 0.2931, + "GPQA": 0.3188, + "MUSR": 0.4039, + "MMLU-PRO": 0.311 + } + }, + { + "model_id": "Triangle104/DS-Distilled-Hermes-Llama-3.1_TIES", + "name": "DS-Distilled-Hermes-Llama-3.1_TIES", + "developer": "Triangle104", + "scores": { + "IFEval": 0.1364, + "BBH": 0.2928, + "MATH Level 5": 0.0091, + "GPQA": 0.245, + "MUSR": 0.3621, + "MMLU-PRO": 0.1104 + } + }, + { + "model_id": "Triangle104/DS-R1-Distill-Q2.5-10B-Harmony", + "name": "DS-R1-Distill-Q2.5-10B-Harmony", + "developer": "Triangle104", + "scores": { + "IFEval": 0.1751, + "BBH": 0.2643, + "MATH Level 5": 0.0, + "GPQA": 0.2106, + "MUSR": 0.3128, + "MMLU-PRO": 0.1173 + } + }, + { + "model_id": "Triangle104/DS-R1-Distill-Q2.5-14B-Harmony_V0.1", + "name": "DS-R1-Distill-Q2.5-14B-Harmony_V0.1", + "developer": "Triangle104", + "scores": { + "IFEval": 0.4515, + "BBH": 0.5783, + "MATH Level 5": 0.5551, + "GPQA": 0.3935, + "MUSR": 0.5567, + "MMLU-PRO": 0.4601 + } + }, + { + "model_id": "Triangle104/DS-R1-Distill-Q2.5-7B-RP", + "name": "DS-R1-Distill-Q2.5-7B-RP", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3445, + "BBH": 0.4383, + "MATH Level 5": 0.4683, + "GPQA": 0.3138, + "MUSR": 0.403, + "MMLU-PRO": 0.2891 + } + }, + { + "model_id": "Triangle104/DS-R1-Llama-8B-Harmony", + "name": "DS-R1-Llama-8B-Harmony", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3566, + "BBH": 0.4154, + "MATH Level 5": 0.4282, + "GPQA": 0.2919, + "MUSR": 0.3762, + "MMLU-PRO": 0.2744 + } + }, + { + "model_id": "Triangle104/DSR1-Distill-Llama-Lit-8B", + "name": "DSR1-Distill-Llama-Lit-8B", + "developer": "Triangle104", + "scores": { + "IFEval": 0.1885, + "BBH": 0.4284, + "MATH Level 5": 0.352, + "GPQA": 0.3029, + "MUSR": 0.3535, + "MMLU-PRO": 0.2798 + } + }, + { + "model_id": "Triangle104/DSR1-Distill-Qwen-7B-RP", + "name": "DSR1-Distill-Qwen-7B-RP", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3609, + "BBH": 0.4326, + "MATH Level 5": 0.4804, + "GPQA": 0.3196, + "MUSR": 0.4045, + "MMLU-PRO": 0.3028 + } + }, + { + "model_id": "Triangle104/Dark-Chivalry_V1.0", + "name": "Dark-Chivalry_V1.0", + "developer": "Triangle104", + "scores": { + "IFEval": 0.4326, + "BBH": 0.4974, + "MATH Level 5": 0.1314, + "GPQA": 0.2936, + "MUSR": 0.4182, + "MMLU-PRO": 0.3444 + } + }, + { + "model_id": "Triangle104/Distilled-DarkPlanet-Allades-8B", + "name": "Distilled-DarkPlanet-Allades-8B", + "developer": "Triangle104", + "scores": { + "IFEval": 0.346, + "BBH": 0.4634, + "MATH Level 5": 0.4003, + "GPQA": 0.3054, + "MUSR": 0.3538, + "MMLU-PRO": 0.2901 + } + }, + { + "model_id": "Triangle104/Distilled-DarkPlanet-Allades-8B_TIES", + "name": "Distilled-DarkPlanet-Allades-8B_TIES", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3892, + "BBH": 0.5042, + "MATH Level 5": 0.0906, + "GPQA": 0.3146, + "MUSR": 0.3868, + "MMLU-PRO": 0.3401 + } + }, + { + "model_id": "Triangle104/Distilled-Whiskey-8b", + "name": "Distilled-Whiskey-8b", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3448, + "BBH": 0.5028, + "MATH Level 5": 0.2545, + "GPQA": 0.3314, + "MUSR": 0.4172, + "MMLU-PRO": 0.3367 + } + }, + { + "model_id": "Triangle104/Dolphin3-Llama3.2-Smart", + "name": "Dolphin3-Llama3.2-Smart", + "developer": "Triangle104", + "scores": { + "IFEval": 0.4137, + "BBH": 0.3975, + "MATH Level 5": 0.0438, + "GPQA": 0.2693, + "MUSR": 0.3922, + "MMLU-PRO": 0.2195 + } + }, + { + "model_id": "Triangle104/Gemmadevi-Stock-10B", + "name": "Gemmadevi-Stock-10B", + "developer": "Triangle104", + "scores": { + "IFEval": 0.1582, + "BBH": 0.6066, + "MATH Level 5": 0.0967, + "GPQA": 0.3532, + "MUSR": 0.4621, + "MMLU-PRO": 0.4262 + } + }, + { + "model_id": "Triangle104/Hermes-Llama-3.2-CoT", + "name": "Hermes-Llama-3.2-CoT", + "developer": "Triangle104", + "scores": { + "IFEval": 0.4178, + "BBH": 0.4616, + "MATH Level 5": 0.0952, + "GPQA": 0.2794, + "MUSR": 0.3698, + "MMLU-PRO": 0.2947 + } + }, + { + "model_id": "Triangle104/Hermes-Llama-3.2-CoT-Summary", + "name": "Hermes-Llama-3.2-CoT-Summary", + "developer": "Triangle104", + "scores": { + "IFEval": 0.483, + "BBH": 0.42, + "MATH Level 5": 0.0831, + "GPQA": 0.2559, + "MUSR": 0.3575, + "MMLU-PRO": 0.2901 + } + }, + { + "model_id": "Triangle104/Hermes3-L3.1-DirtyHarry-8B", + "name": "Hermes3-L3.1-DirtyHarry-8B", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3242, + "BBH": 0.5066, + "MATH Level 5": 0.0718, + "GPQA": 0.302, + "MUSR": 0.4069, + "MMLU-PRO": 0.3339 + } + }, + { + "model_id": "Triangle104/Herodotos-14B", + "name": "Herodotos-14B", + "developer": "Triangle104", + "scores": { + "IFEval": 0.4667, + "BBH": 0.6435, + "MATH Level 5": 0.5045, + "GPQA": 0.3733, + "MUSR": 0.4795, + "MMLU-PRO": 0.529 + } + }, + { + "model_id": "Triangle104/Herodotos-14B_V0.1", + "name": "Herodotos-14B_V0.1", + "developer": "Triangle104", + "scores": { + "IFEval": 0.1879, + "BBH": 0.3017, + "MATH Level 5": 0.0, + "GPQA": 0.224, + "MUSR": 0.3684, + "MMLU-PRO": 0.1164 + } + }, + { + "model_id": "Triangle104/L3.1-8B-Dusky-Ink", + "name": "L3.1-8B-Dusky-Ink", + "developer": "Triangle104", + "scores": { + "IFEval": 0.453, + "BBH": 0.5098, + "MATH Level 5": 0.1231, + "GPQA": 0.2894, + "MUSR": 0.4224, + "MMLU-PRO": 0.3683 + } + }, + { + "model_id": "Triangle104/L3.1-8B-Dusky-Ink_v0.r1", + "name": "L3.1-8B-Dusky-Ink_v0.r1", + "developer": "Triangle104", + "scores": { + "IFEval": 0.1985, + "BBH": 0.4337, + "MATH Level 5": 0.0431, + "GPQA": 0.3037, + "MUSR": 0.3988, + "MMLU-PRO": 0.3206 + } + }, + { + "model_id": "Triangle104/LThreePointOne-8B-HermesBlackroot", + "name": "LThreePointOne-8B-HermesBlackroot", + "developer": "Triangle104", + "scores": { + "IFEval": 0.1792, + "BBH": 0.4998, + "MATH Level 5": 0.0196, + "GPQA": 0.307, + "MUSR": 0.3586, + "MMLU-PRO": 0.3285 + } + }, + { + "model_id": "Triangle104/LThreePointOne-8B-HermesInk", + "name": "LThreePointOne-8B-HermesInk", + "developer": "Triangle104", + "scores": { + "IFEval": 0.4031, + "BBH": 0.5223, + "MATH Level 5": 0.1722, + "GPQA": 0.323, + "MUSR": 0.4129, + "MMLU-PRO": 0.3467 + } + }, + { + "model_id": "Triangle104/Llama3.1-Allades-Lit-8b", + "name": "Llama3.1-Allades-Lit-8b", + "developer": "Triangle104", + "scores": { + "IFEval": 0.2461, + "BBH": 0.4183, + "MATH Level 5": 0.0023, + "GPQA": 0.2844, + "MUSR": 0.3708, + "MMLU-PRO": 0.2724 + } + }, + { + "model_id": "Triangle104/Llama3.1-cc-Lit-8b", + "name": "Llama3.1-cc-Lit-8b", + "developer": "Triangle104", + "scores": { + "IFEval": 0.2993, + "BBH": 0.3848, + "MATH Level 5": 0.003, + "GPQA": 0.2777, + "MUSR": 0.3854, + "MMLU-PRO": 0.3004 + } + }, + { + "model_id": "Triangle104/Minerva-1.5b", + "name": "Minerva-1.5b", + "developer": "Triangle104", + "scores": { + "IFEval": 0.2694, + "BBH": 0.4026, + "MATH Level 5": 0.1027, + "GPQA": 0.3104, + "MUSR": 0.3655, + "MMLU-PRO": 0.2698 + } + }, + { + "model_id": "Triangle104/Minerva-1.5b_V0.2", + "name": "Minerva-1.5b_V0.2", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3083, + "BBH": 0.3989, + "MATH Level 5": 0.114, + "GPQA": 0.2852, + "MUSR": 0.396, + "MMLU-PRO": 0.2911 + } + }, + { + "model_id": "Triangle104/Minerva-10b", + "name": "Minerva-10b", + "developer": "Triangle104", + "scores": { + "IFEval": 0.1879, + "BBH": 0.4462, + "MATH Level 5": 0.0, + "GPQA": 0.281, + "MUSR": 0.3627, + "MMLU-PRO": 0.2318 + } + }, + { + "model_id": "Triangle104/Minerva-14b", + "name": "Minerva-14b", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3468, + "BBH": 0.6301, + "MATH Level 5": 0.3051, + "GPQA": 0.3742, + "MUSR": 0.4766, + "MMLU-PRO": 0.5194 + } + }, + { + "model_id": "Triangle104/Minerva-14b-V0.1", + "name": "Minerva-14b-V0.1", + "developer": "Triangle104", + "scores": { + "IFEval": 0.0861, + "BBH": 0.609, + "MATH Level 5": 0.3051, + "GPQA": 0.3658, + "MUSR": 0.47, + "MMLU-PRO": 0.5118 + } + }, + { + "model_id": "Triangle104/Minerva-7b", + "name": "Minerva-7b", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3724, + "BBH": 0.5498, + "MATH Level 5": 0.284, + "GPQA": 0.323, + "MUSR": 0.4143, + "MMLU-PRO": 0.4444 + } + }, + { + "model_id": "Triangle104/Minerva-8b", + "name": "Minerva-8b", + "developer": "Triangle104", + "scores": { + "IFEval": 0.1721, + "BBH": 0.4669, + "MATH Level 5": 0.0045, + "GPQA": 0.3121, + "MUSR": 0.4273, + "MMLU-PRO": 0.3089 + } + }, + { + "model_id": "Triangle104/Mistral-Redemption-Arc", + "name": "Mistral-Redemption-Arc", + "developer": "Triangle104", + "scores": { + "IFEval": 0.4029, + "BBH": 0.6255, + "MATH Level 5": 0.4101, + "GPQA": 0.3473, + "MUSR": 0.4595, + "MMLU-PRO": 0.451 + } + }, + { + "model_id": "Triangle104/Mistral-Small-24b-Harmony", + "name": "Mistral-Small-24b-Harmony", + "developer": "Triangle104", + "scores": { + "IFEval": 0.1687, + "BBH": 0.6434, + "MATH Level 5": 0.1911, + "GPQA": 0.3842, + "MUSR": 0.4276, + "MMLU-PRO": 0.5431 + } + }, + { + "model_id": "Triangle104/Pans_Gutenbergum_V0.1", + "name": "Pans_Gutenbergum_V0.1", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3097, + "BBH": 0.5541, + "MATH Level 5": 0.1057, + "GPQA": 0.323, + "MUSR": 0.4528, + "MMLU-PRO": 0.3697 + } + }, + { + "model_id": "Triangle104/Pans_Gutenbergum_V0.2", + "name": "Pans_Gutenbergum_V0.2", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3215, + "BBH": 0.5526, + "MATH Level 5": 0.0687, + "GPQA": 0.3121, + "MUSR": 0.4673, + "MMLU-PRO": 0.3585 + } + }, + { + "model_id": "Triangle104/Pantheon_ChatWaifu_V0.2", + "name": "Pantheon_ChatWaifu_V0.2", + "developer": "Triangle104", + "scores": { + "IFEval": 0.2683, + "BBH": 0.5532, + "MATH Level 5": 0.0566, + "GPQA": 0.318, + "MUSR": 0.4755, + "MMLU-PRO": 0.3442 + } + }, + { + "model_id": "Triangle104/Phi-4-AbliteratedRP", + "name": "Phi-4-AbliteratedRP", + "developer": "Triangle104", + "scores": { + "IFEval": 0.4923, + "BBH": 0.6709, + "MATH Level 5": 0.3074, + "GPQA": 0.3951, + "MUSR": 0.5098, + "MMLU-PRO": 0.5308 + } + }, + { + "model_id": "Triangle104/Phi4-RP-o1", + "name": "Phi4-RP-o1", + "developer": "Triangle104", + "scores": { + "IFEval": 0.022, + "BBH": 0.6653, + "MATH Level 5": 0.3776, + "GPQA": 0.3733, + "MUSR": 0.4756, + "MMLU-PRO": 0.5111 + } + }, + { + "model_id": "Triangle104/Phi4-RP-o1-Ablit", + "name": "Phi4-RP-o1-Ablit", + "developer": "Triangle104", + "scores": { + "IFEval": 0.0239, + "BBH": 0.663, + "MATH Level 5": 0.3882, + "GPQA": 0.3633, + "MUSR": 0.4754, + "MMLU-PRO": 0.5105 + } + }, + { + "model_id": "Triangle104/Porpoise-R1-Llama3.2-3b", + "name": "Porpoise-R1-Llama3.2-3b", + "developer": "Triangle104", + "scores": { + "IFEval": 0.4352, + "BBH": 0.3824, + "MATH Level 5": 0.0423, + "GPQA": 0.2668, + "MUSR": 0.3576, + "MMLU-PRO": 0.2117 + } + }, + { + "model_id": "Triangle104/Q2.5-14B-Instruct-1M-Harmony", + "name": "Q2.5-14B-Instruct-1M-Harmony", + "developer": "Triangle104", + "scores": { + "IFEval": 0.5986, + "BBH": 0.6339, + "MATH Level 5": 0.3769, + "GPQA": 0.375, + "MUSR": 0.4795, + "MMLU-PRO": 0.5075 + } + }, + { + "model_id": "Triangle104/Q2.5-AthensCOT", + "name": "Q2.5-AthensCOT", + "developer": "Triangle104", + "scores": { + "IFEval": 0.4573, + "BBH": 0.5542, + "MATH Level 5": 0.2915, + "GPQA": 0.3003, + "MUSR": 0.4578, + "MMLU-PRO": 0.4379 + } + }, + { + "model_id": "Triangle104/Q2.5-CodeR1-3B", + "name": "Q2.5-CodeR1-3B", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3588, + "BBH": 0.4661, + "MATH Level 5": 0.1639, + "GPQA": 0.3037, + "MUSR": 0.4315, + "MMLU-PRO": 0.2979 + } + }, + { + "model_id": "Triangle104/Q2.5-EVACOT-7b", + "name": "Q2.5-EVACOT-7b", + "developer": "Triangle104", + "scores": { + "IFEval": 0.5784, + "BBH": 0.5506, + "MATH Level 5": 0.2825, + "GPQA": 0.318, + "MUSR": 0.4499, + "MMLU-PRO": 0.4331 + } + }, + { + "model_id": "Triangle104/Q2.5-EvaHumane-RP", + "name": "Q2.5-EvaHumane-RP", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3676, + "BBH": 0.5328, + "MATH Level 5": 0.2923, + "GPQA": 0.3188, + "MUSR": 0.4276, + "MMLU-PRO": 0.4412 + } + }, + { + "model_id": "Triangle104/Q2.5-Humane-RP", + "name": "Q2.5-Humane-RP", + "developer": "Triangle104", + "scores": { + "IFEval": 0.4412, + "BBH": 0.5649, + "MATH Level 5": 0.3391, + "GPQA": 0.3188, + "MUSR": 0.4528, + "MMLU-PRO": 0.4492 + } + }, + { + "model_id": "Triangle104/Q2.5-Instruct-1M_Harmony", + "name": "Q2.5-Instruct-1M_Harmony", + "developer": "Triangle104", + "scores": { + "IFEval": 0.6038, + "BBH": 0.5373, + "MATH Level 5": 0.3323, + "GPQA": 0.323, + "MUSR": 0.4688, + "MMLU-PRO": 0.4366 + } + }, + { + "model_id": "Triangle104/Q2.5-R1-3B", + "name": "Q2.5-R1-3B", + "developer": "Triangle104", + "scores": { + "IFEval": 0.4214, + "BBH": 0.4812, + "MATH Level 5": 0.2674, + "GPQA": 0.3096, + "MUSR": 0.432, + "MMLU-PRO": 0.3813 + } + }, + { + "model_id": "Triangle104/Q2.5-R1-7B", + "name": "Q2.5-R1-7B", + "developer": "Triangle104", + "scores": { + "IFEval": 0.1346, + "BBH": 0.3007, + "MATH Level 5": 0.0166, + "GPQA": 0.2525, + "MUSR": 0.3607, + "MMLU-PRO": 0.118 + } + }, + { + "model_id": "Triangle104/Robo-Gutenberg_V1.0", + "name": "Robo-Gutenberg_V1.0", + "developer": "Triangle104", + "scores": { + "IFEval": 0.6008, + "BBH": 0.6537, + "MATH Level 5": 0.4562, + "GPQA": 0.3859, + "MUSR": 0.4744, + "MMLU-PRO": 0.5391 + } + }, + { + "model_id": "Triangle104/Rocinante-Prism_V2.0", + "name": "Rocinante-Prism_V2.0", + "developer": "Triangle104", + "scores": { + "IFEval": 0.2616, + "BBH": 0.5361, + "MATH Level 5": 0.111, + "GPQA": 0.3205, + "MUSR": 0.445, + "MMLU-PRO": 0.364 + } + }, + { + "model_id": "Triangle104/Rocinante-Prism_V2.1", + "name": "Rocinante-Prism_V2.1", + "developer": "Triangle104", + "scores": { + "IFEval": 0.2558, + "BBH": 0.5333, + "MATH Level 5": 0.1125, + "GPQA": 0.3196, + "MUSR": 0.449, + "MMLU-PRO": 0.3651 + } + }, + { + "model_id": "Triangle104/RomboHermes3-R1-Llama3.2-3b", + "name": "RomboHermes3-R1-Llama3.2-3b", + "developer": "Triangle104", + "scores": { + "IFEval": 0.3007, + "BBH": 0.4264, + "MATH Level 5": 0.0816, + "GPQA": 0.2836, + "MUSR": 0.3657, + "MMLU-PRO": 0.2957 + } + }, + { + "model_id": "Triangle104/Rombos-Novasky-7B_V1c", + "name": "Rombos-Novasky-7B_V1c", + "developer": "Triangle104", + "scores": { + "IFEval": 0.408, + "BBH": 0.4349, + "MATH Level 5": 0.0853, + "GPQA": 0.2961, + "MUSR": 0.4465, + "MMLU-PRO": 0.2738 + } + }, + { + "model_id": "Triangle104/Set-70b", + "name": "Set-70b", + "developer": "Triangle104", + "scores": { + "IFEval": 0.7643, + "BBH": 0.7014, + "MATH Level 5": 0.364, + "GPQA": 0.4463, + "MUSR": 0.4696, + "MMLU-PRO": 0.5442 + } + }, + { + "model_id": "Tsunami-th/Tsunami-0.5-7B-Instruct", + "name": "Tsunami-0.5-7B-Instruct", + "developer": "Tsunami-th", + "scores": { + "IFEval": 0.74, + "BBH": 0.5524, + "MATH Level 5": 0.5045, + "GPQA": 0.3087, + "MUSR": 0.4257, + "MMLU-PRO": 0.4413 + } + }, + { + "model_id": "Tsunami-th/Tsunami-0.5x-7B-Instruct", + "name": "Tsunami-0.5x-7B-Instruct", + "developer": "Tsunami-th", + "scores": { + "IFEval": 0.7099, + "BBH": 0.5593, + "MATH Level 5": 0.4207, + "GPQA": 0.3146, + "MUSR": 0.4667, + "MMLU-PRO": 0.4458 + } + }, + { + "model_id": "Tsunami-th/Tsunami-1.0-14B-Instruct", + "name": "Tsunami-1.0-14B-Instruct", + "developer": "Tsunami-th", + "scores": { + "IFEval": 0.7829, + "BBH": 0.6439, + "MATH Level 5": 0.4585, + "GPQA": 0.3565, + "MUSR": 0.4459, + "MMLU-PRO": 0.5249 + } + }, + { + "model_id": "Tsunami-th/Tsunami-1.0-7B-Instruct", + "name": "Tsunami-1.0-7B-Instruct", + "developer": "Tsunami-th", + "scores": { + "IFEval": 0.7309, + "BBH": 0.5491, + "MATH Level 5": 0.4335, + "GPQA": 0.3129, + "MUSR": 0.4493, + "MMLU-PRO": 0.4424 + } + }, + { + "model_id": "UCLA-AGI/Gemma-2-9B-It-SPPO-Iter1", + "name": "Gemma-2-9B-It-SPPO-Iter1", + "developer": "UCLA-AGI", + "scores": { + "IFEval": 0.3082, + "BBH": 0.5969, + "MATH Level 5": 0.0899, + "GPQA": 0.3364, + "MUSR": 0.4099, + "MMLU-PRO": 0.3907 + } + }, + { + "model_id": "UCLA-AGI/Gemma-2-9B-It-SPPO-Iter2", + "name": "Gemma-2-9B-It-SPPO-Iter2", + "developer": "UCLA-AGI", + "scores": { + "IFEval": 0.31, + "BBH": 0.599, + "MATH Level 5": 0.0808, + "GPQA": 0.3347, + "MUSR": 0.4139, + "MMLU-PRO": 0.387 + } + }, + { + "model_id": "UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3", + "name": "Gemma-2-9B-It-SPPO-Iter3", + "developer": "UCLA-AGI", + "scores": { + "IFEval": 0.3167, + "BBH": 0.6007, + "MATH Level 5": 0.071, + "GPQA": 0.3389, + "MUSR": 0.4166, + "MMLU-PRO": 0.3826 + } + }, + { + "model_id": "UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter1", + "name": "Llama-3-Instruct-8B-SPPO-Iter1", + "developer": "UCLA-AGI", + "scores": { + "IFEval": 0.7299, + "BBH": 0.5058, + "MATH Level 5": 0.1148, + "GPQA": 0.2676, + "MUSR": 0.3568, + "MMLU-PRO": 0.3711 + } + }, + { + "model_id": "UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter2", + "name": "Llama-3-Instruct-8B-SPPO-Iter2", + "developer": "UCLA-AGI", + "scores": { + "IFEval": 0.6989, + "BBH": 0.5089, + "MATH Level 5": 0.1035, + "GPQA": 0.2668, + "MUSR": 0.3594, + "MMLU-PRO": 0.3692 + } + }, + { + "model_id": "UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3", + "name": "Llama-3-Instruct-8B-SPPO-Iter3", + "developer": "UCLA-AGI", + "scores": { + "IFEval": 0.6703, + "BBH": 0.5076, + "MATH Level 5": 0.0718, + "GPQA": 0.2651, + "MUSR": 0.3647, + "MMLU-PRO": 0.3658 + } + }, + { + "model_id": "UCLA-AGI/Mistral7B-PairRM-SPPO", + "name": "Mistral7B-PairRM-SPPO", + "developer": "UCLA-AGI", + "scores": { + "IFEval": 0.4355, + "BBH": 0.4439, + "MATH Level 5": 0.031, + "GPQA": 0.281, + "MUSR": 0.3965, + "MMLU-PRO": 0.2621 + } + }, + { + "model_id": "UCLA-AGI/Mistral7B-PairRM-SPPO-Iter1", + "name": "Mistral7B-PairRM-SPPO-Iter1", + "developer": "UCLA-AGI", + "scores": { + "IFEval": 0.5047, + "BBH": 0.4468, + "MATH Level 5": 0.0249, + "GPQA": 0.2836, + "MUSR": 0.3992, + "MMLU-PRO": 0.2695 + } + }, + { + "model_id": "UCLA-AGI/Mistral7B-PairRM-SPPO-Iter2", + "name": "Mistral7B-PairRM-SPPO-Iter2", + "developer": "UCLA-AGI", + "scores": { + "IFEval": 0.4446, + "BBH": 0.4466, + "MATH Level 5": 0.0219, + "GPQA": 0.2886, + "MUSR": 0.4085, + "MMLU-PRO": 0.2677 + } + }, + { + "model_id": "UCLA-AGI/Mistral7B-PairRM-SPPO-Iter3", + "name": "Mistral7B-PairRM-SPPO-Iter3", + "developer": "UCLA-AGI", + "scores": { + "IFEval": 0.4351, + "BBH": 0.4397, + "MATH Level 5": 0.0234, + "GPQA": 0.2752, + "MUSR": 0.4071, + "MMLU-PRO": 0.2658 + } + }, + { + "model_id": "UKzExecution/LlamaExecutor-8B-3.0.5", + "name": "LlamaExecutor-8B-3.0.5", + "developer": "UKzExecution", + "scores": { + "IFEval": 0.7403, + "BBH": 0.5006, + "MATH Level 5": 0.102, + "GPQA": 0.2559, + "MUSR": 0.3754, + "MMLU-PRO": 0.3625 + } + }, + { + "model_id": "Unbabel/TowerInstruct-Mistral-7B-v0.2", + "name": "TowerInstruct-Mistral-7B-v0.2", + "developer": "Unbabel", + "scores": { + "IFEval": 0.2843, + "BBH": 0.3882, + "MATH Level 5": 0.0204, + "GPQA": 0.2475, + "MUSR": 0.4522, + "MMLU-PRO": 0.1968 + } + }, + { + "model_id": "Undi95/MG-FinalMix-72B", + "name": "MG-FinalMix-72B", + "developer": "Undi95", + "scores": { + "IFEval": 0.8014, + "BBH": 0.6973, + "MATH Level 5": 0.3973, + "GPQA": 0.3851, + "MUSR": 0.4823, + "MMLU-PRO": 0.5427 + } + }, + { + "model_id": "Undi95/Phi4-abliterated", + "name": "Phi4-abliterated", + "developer": "Undi95", + "scores": { + "IFEval": 0.6618, + "BBH": 0.6809, + "MATH Level 5": 0.3701, + "GPQA": 0.3305, + "MUSR": 0.4034, + "MMLU-PRO": 0.5281 + } + }, + { + "model_id": "V3N0M/Jenna-Tiny-2.0", + "name": "Jenna-Tiny-2.0", + "developer": "V3N0M", + "scores": { + "IFEval": 0.2309, + "BBH": 0.3148, + "MATH Level 5": 0.0121, + "GPQA": 0.25, + "MUSR": 0.3367, + "MMLU-PRO": 0.1147 + } + }, + { + "model_id": "VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct", + "name": "Llama-3-SauerkrautLM-70b-Instruct", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.8045, + "BBH": 0.6663, + "MATH Level 5": 0.2281, + "GPQA": 0.328, + "MUSR": 0.4339, + "MMLU-PRO": 0.5392 + } + }, + { + "model_id": "VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct", + "name": "Llama-3-SauerkrautLM-8b-Instruct", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.7445, + "BBH": 0.4943, + "MATH Level 5": 0.0665, + "GPQA": 0.3087, + "MUSR": 0.4241, + "MMLU-PRO": 0.3857 + } + }, + { + "model_id": "VAGOsolutions/Llama-3.1-SauerkrautLM-70b-Instruct", + "name": "Llama-3.1-SauerkrautLM-70b-Instruct", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.8656, + "BBH": 0.7006, + "MATH Level 5": 0.3693, + "GPQA": 0.3414, + "MUSR": 0.4711, + "MMLU-PRO": 0.5335 + } + }, + { + "model_id": "VAGOsolutions/Llama-3.1-SauerkrautLM-8b-Instruct", + "name": "Llama-3.1-SauerkrautLM-8b-Instruct", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.8017, + "BBH": 0.5115, + "MATH Level 5": 0.1941, + "GPQA": 0.2903, + "MUSR": 0.4148, + "MMLU-PRO": 0.389 + } + }, + { + "model_id": "VAGOsolutions/SauerkrautLM-1.5b", + "name": "SauerkrautLM-1.5b", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.2404, + "BBH": 0.3704, + "MATH Level 5": 0.0363, + "GPQA": 0.271, + "MUSR": 0.3739, + "MMLU-PRO": 0.2151 + } + }, + { + "model_id": "VAGOsolutions/SauerkrautLM-7b-HerO", + "name": "SauerkrautLM-7b-HerO", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.5346, + "BBH": 0.4904, + "MATH Level 5": 0.0393, + "GPQA": 0.2727, + "MUSR": 0.3924, + "MMLU-PRO": 0.3046 + } + }, + { + "model_id": "VAGOsolutions/SauerkrautLM-7b-LaserChat", + "name": "SauerkrautLM-7b-LaserChat", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.5988, + "BBH": 0.4543, + "MATH Level 5": 0.0778, + "GPQA": 0.3003, + "MUSR": 0.4148, + "MMLU-PRO": 0.3305 + } + }, + { + "model_id": "VAGOsolutions/SauerkrautLM-Gemma-2b", + "name": "SauerkrautLM-Gemma-2b", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.2475, + "BBH": 0.3416, + "MATH Level 5": 0.0279, + "GPQA": 0.2567, + "MUSR": 0.3676, + "MMLU-PRO": 0.1469 + } + }, + { + "model_id": "VAGOsolutions/SauerkrautLM-Gemma-7b", + "name": "SauerkrautLM-Gemma-7b", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.3407, + "BBH": 0.4188, + "MATH Level 5": 0.0672, + "GPQA": 0.2861, + "MUSR": 0.3594, + "MMLU-PRO": 0.2961 + } + }, + { + "model_id": "VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct", + "name": "SauerkrautLM-Mixtral-8x7B-Instruct", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.5602, + "BBH": 0.5277, + "MATH Level 5": 0.0982, + "GPQA": 0.2978, + "MUSR": 0.4204, + "MMLU-PRO": 0.365 + } + }, + { + "model_id": "VAGOsolutions/SauerkrautLM-Nemo-12b-Instruct", + "name": "SauerkrautLM-Nemo-12b-Instruct", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.6113, + "BBH": 0.5214, + "MATH Level 5": 0.1224, + "GPQA": 0.3096, + "MUSR": 0.4469, + "MMLU-PRO": 0.3385 + } + }, + { + "model_id": "VAGOsolutions/SauerkrautLM-Phi-3-medium", + "name": "SauerkrautLM-Phi-3-medium", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.4409, + "BBH": 0.6433, + "MATH Level 5": 0.1601, + "GPQA": 0.3347, + "MUSR": 0.4845, + "MMLU-PRO": 0.4665 + } + }, + { + "model_id": "VAGOsolutions/SauerkrautLM-SOLAR-Instruct", + "name": "SauerkrautLM-SOLAR-Instruct", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.4917, + "BBH": 0.5169, + "MATH Level 5": 0.0634, + "GPQA": 0.3054, + "MUSR": 0.3965, + "MMLU-PRO": 0.3183 + } + }, + { + "model_id": "VAGOsolutions/SauerkrautLM-gemma-2-2b-it", + "name": "SauerkrautLM-gemma-2-2b-it", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.1321, + "BBH": 0.4241, + "MATH Level 5": 0.0219, + "GPQA": 0.2727, + "MUSR": 0.3995, + "MMLU-PRO": 0.2693 + } + }, + { + "model_id": "VAGOsolutions/SauerkrautLM-gemma-2-9b-it", + "name": "SauerkrautLM-gemma-2-9b-it", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.3024, + "BBH": 0.6073, + "MATH Level 5": 0.0838, + "GPQA": 0.3272, + "MUSR": 0.4318, + "MMLU-PRO": 0.4091 + } + }, + { + "model_id": "VAGOsolutions/SauerkrautLM-v2-14b-DPO", + "name": "SauerkrautLM-v2-14b-DPO", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.7412, + "BBH": 0.656, + "MATH Level 5": 0.3165, + "GPQA": 0.3196, + "MUSR": 0.4375, + "MMLU-PRO": 0.5117 + } + }, + { + "model_id": "VAGOsolutions/SauerkrautLM-v2-14b-SFT", + "name": "SauerkrautLM-v2-14b-SFT", + "developer": "VAGOsolutions", + "scores": { + "IFEval": 0.6949, + "BBH": 0.621, + "MATH Level 5": 0.3285, + "GPQA": 0.3356, + "MUSR": 0.4179, + "MMLU-PRO": 0.5205 + } + }, + { + "model_id": "VIRNECT/llama-3-Korean-8B", + "name": "llama-3-Korean-8B", + "developer": "VIRNECT", + "scores": { + "IFEval": 0.5021, + "BBH": 0.4918, + "MATH Level 5": 0.108, + "GPQA": 0.271, + "MUSR": 0.3648, + "MMLU-PRO": 0.3536 + } + }, + { + "model_id": "VIRNECT/llama-3-Korean-8B-r-v-0.1", + "name": "llama-3-Korean-8B-r-v-0.1", + "developer": "VIRNECT", + "scores": { + "IFEval": 0.4916, + "BBH": 0.4806, + "MATH Level 5": 0.0861, + "GPQA": 0.2424, + "MUSR": 0.3675, + "MMLU-PRO": 0.326 + } + }, + { + "model_id": "ValiantLabs/Llama3-70B-Fireplace", + "name": "Llama3-70B-Fireplace", + "developer": "ValiantLabs", + "scores": { + "IFEval": 0.7774, + "BBH": 0.6489, + "MATH Level 5": 0.2145, + "GPQA": 0.3549, + "MUSR": 0.4449, + "MMLU-PRO": 0.4893 + } + }, + { + "model_id": "ValiantLabs/Llama3-70B-ShiningValiant2", + "name": "Llama3-70B-ShiningValiant2", + "developer": "ValiantLabs", + "scores": { + "IFEval": 0.6122, + "BBH": 0.6338, + "MATH Level 5": 0.2077, + "GPQA": 0.3305, + "MUSR": 0.4326, + "MMLU-PRO": 0.4898 + } + }, + { + "model_id": "ValiantLabs/Llama3.1-70B-ShiningValiant2", + "name": "Llama3.1-70B-ShiningValiant2", + "developer": "ValiantLabs", + "scores": { + "IFEval": 0.5355, + "BBH": 0.6738, + "MATH Level 5": 0.2915, + "GPQA": 0.3926, + "MUSR": 0.4681, + "MMLU-PRO": 0.5173 + } + }, + { + "model_id": "ValiantLabs/Llama3.1-8B-Cobalt", + "name": "Llama3.1-8B-Cobalt", + "developer": "ValiantLabs", + "scores": { + "IFEval": 0.7168, + "BBH": 0.4911, + "MATH Level 5": 0.1533, + "GPQA": 0.2861, + "MUSR": 0.3512, + "MMLU-PRO": 0.3663 + } + }, + { + "model_id": "ValiantLabs/Llama3.1-8B-Enigma", + "name": "Llama3.1-8B-Enigma", + "developer": "ValiantLabs", + "scores": { + "IFEval": 0.2681, + "BBH": 0.4478, + "MATH Level 5": 0.0891, + "GPQA": 0.2878, + "MUSR": 0.4196, + "MMLU-PRO": 0.3409 + } + }, + { + "model_id": "ValiantLabs/Llama3.1-8B-Esper2", + "name": "Llama3.1-8B-Esper2", + "developer": "ValiantLabs", + "scores": { + "IFEval": 0.2567, + "BBH": 0.447, + "MATH Level 5": 0.0589, + "GPQA": 0.2727, + "MUSR": 0.3561, + "MMLU-PRO": 0.2904 + } + }, + { + "model_id": "ValiantLabs/Llama3.1-8B-Fireplace2", + "name": "Llama3.1-8B-Fireplace2", + "developer": "ValiantLabs", + "scores": { + "IFEval": 0.5483, + "BBH": 0.461, + "MATH Level 5": 0.0582, + "GPQA": 0.2886, + "MUSR": 0.3433, + "MMLU-PRO": 0.2407 + } + }, + { + "model_id": "ValiantLabs/Llama3.1-8B-ShiningValiant2", + "name": "Llama3.1-8B-ShiningValiant2", + "developer": "ValiantLabs", + "scores": { + "IFEval": 0.2678, + "BBH": 0.4429, + "MATH Level 5": 0.0521, + "GPQA": 0.302, + "MUSR": 0.3959, + "MMLU-PRO": 0.2927 + } + }, + { + "model_id": "ValiantLabs/Llama3.2-3B-Enigma", + "name": "Llama3.2-3B-Enigma", + "developer": "ValiantLabs", + "scores": { + "IFEval": 0.2786, + "BBH": 0.3723, + "MATH Level 5": 0.0438, + "GPQA": 0.2617, + "MUSR": 0.3921, + "MMLU-PRO": 0.2428 + } + }, + { + "model_id": "ValiantLabs/Llama3.2-3B-Esper2", + "name": "Llama3.2-3B-Esper2", + "developer": "ValiantLabs", + "scores": { + "IFEval": 0.275, + "BBH": 0.3808, + "MATH Level 5": 0.0363, + "GPQA": 0.2701, + "MUSR": 0.355, + "MMLU-PRO": 0.2257 + } + }, + { + "model_id": "ValiantLabs/Llama3.2-3B-ShiningValiant2", + "name": "Llama3.2-3B-ShiningValiant2", + "developer": "ValiantLabs", + "scores": { + "IFEval": 0.2625, + "BBH": 0.4226, + "MATH Level 5": 0.0823, + "GPQA": 0.2802, + "MUSR": 0.3866, + "MMLU-PRO": 0.2829 + } + }, + { + "model_id": "Vikhrmodels/Vikhr-Llama3.1-8B-Instruct-R-21-09-24", + "name": "Vikhr-Llama3.1-8B-Instruct-R-21-09-24", + "developer": "Vikhrmodels", + "scores": { + "IFEval": 0.6431, + "BBH": 0.5272, + "MATH Level 5": 0.2175, + "GPQA": 0.245, + "MUSR": 0.3754, + "MMLU-PRO": 0.3547 + } + }, + { + "model_id": "Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24", + "name": "Vikhr-Nemo-12B-Instruct-R-21-09-24", + "developer": "Vikhrmodels", + "scores": { + "IFEval": 0.5999, + "BBH": 0.5212, + "MATH Level 5": 0.1715, + "GPQA": 0.2911, + "MUSR": 0.4073, + "MMLU-PRO": 0.3398 + } + }, + { + "model_id": "Weyaxi/Bagel-Hermes-2x34B", + "name": "Bagel-Hermes-2x34B", + "developer": "Weyaxi", + "scores": { + "IFEval": 0.5432, + "BBH": 0.4917, + "MATH Level 5": 0.0604, + "GPQA": 0.328, + "MUSR": 0.4517, + "MMLU-PRO": 0.4589 + } + }, + { + "model_id": "Weyaxi/Bagel-Hermes-34B-Slerp", + "name": "Bagel-Hermes-34B-Slerp", + "developer": "Weyaxi", + "scores": { + "IFEval": 0.4603, + "BBH": 0.5922, + "MATH Level 5": 0.0604, + "GPQA": 0.3347, + "MUSR": 0.4622, + "MMLU-PRO": 0.4703 + } + }, + { + "model_id": "Weyaxi/Einstein-v4-7B", + "name": "Einstein-v4-7B", + "developer": "Weyaxi", + "scores": { + "IFEval": 0.4708, + "BBH": 0.3849, + "MATH Level 5": 0.0189, + "GPQA": 0.2819, + "MUSR": 0.4682, + "MMLU-PRO": 0.2259 + } + }, + { + "model_id": "Weyaxi/Einstein-v6.1-Llama3-8B", + "name": "Einstein-v6.1-Llama3-8B", + "developer": "Weyaxi", + "scores": { + "IFEval": 0.4568, + "BBH": 0.5008, + "MATH Level 5": 0.068, + "GPQA": 0.2819, + "MUSR": 0.4213, + "MMLU-PRO": 0.3131 + } + }, + { + "model_id": "Weyaxi/Einstein-v6.1-developed-by-Weyaxi-Llama3-8B", + "name": "Einstein-v6.1-developed-by-Weyaxi-Llama3-8B", + "developer": "Weyaxi", + "scores": { + "IFEval": 0.3927, + "BBH": 0.5044, + "MATH Level 5": 0.0718, + "GPQA": 0.2735, + "MUSR": 0.4332, + "MMLU-PRO": 0.3093 + } + }, + { + "model_id": "Weyaxi/Einstein-v7-Qwen2-7B", + "name": "Einstein-v7-Qwen2-7B", + "developer": "Weyaxi", + "scores": { + "IFEval": 0.41, + "BBH": 0.5161, + "MATH Level 5": 0.1994, + "GPQA": 0.2995, + "MUSR": 0.44, + "MMLU-PRO": 0.4096 + } + }, + { + "model_id": "Weyaxi/Einstein-v8-Llama3.2-1B", + "name": "Einstein-v8-Llama3.2-1B", + "developer": "Weyaxi", + "scores": { + "IFEval": 0.1862, + "BBH": 0.3018, + "MATH Level 5": 0.0008, + "GPQA": 0.2584, + "MUSR": 0.3618, + "MMLU-PRO": 0.1161 + } + }, + { + "model_id": "Weyaxi/SauerkrautLM-UNA-SOLAR-Instruct", + "name": "SauerkrautLM-UNA-SOLAR-Instruct", + "developer": "Weyaxi", + "scores": { + "IFEval": 0.4573, + "BBH": 0.5166, + "MATH Level 5": 0.0461, + "GPQA": 0.3112, + "MUSR": 0.3979, + "MMLU-PRO": 0.3153 + } + }, + { + "model_id": "WizardLMTeam/WizardLM-13B-V1.0", + "name": "WizardLM-13B-V1.0", + "developer": "WizardLMTeam", + "scores": { + "IFEval": 0.185, + "BBH": 0.2913, + "MATH Level 5": 0.0, + "GPQA": 0.2592, + "MUSR": 0.3497, + "MMLU-PRO": 0.1166 + } + }, + { + "model_id": "WizardLMTeam/WizardLM-13B-V1.2", + "name": "WizardLM-13B-V1.2", + "developer": "WizardLMTeam", + "scores": { + "IFEval": 0.3392, + "BBH": 0.4462, + "MATH Level 5": 0.0189, + "GPQA": 0.2609, + "MUSR": 0.4378, + "MMLU-PRO": 0.2519 + } + }, + { + "model_id": "WizardLMTeam/WizardLM-70B-V1.0", + "name": "WizardLM-70B-V1.0", + "developer": "WizardLMTeam", + "scores": { + "IFEval": 0.4951, + "BBH": 0.559, + "MATH Level 5": 0.0393, + "GPQA": 0.2659, + "MUSR": 0.4391, + "MMLU-PRO": 0.3447 + } + }, + { + "model_id": "Wladastic/Mini-Think-Base-1B", + "name": "Mini-Think-Base-1B", + "developer": "Wladastic", + "scores": { + "IFEval": 0.5588, + "BBH": 0.3574, + "MATH Level 5": 0.0733, + "GPQA": 0.2634, + "MUSR": 0.3275, + "MMLU-PRO": 0.1772 + } + }, + { + "model_id": "Xclbr7/Arcanum-12b", + "name": "Arcanum-12b", + "developer": "Xclbr7", + "scores": { + "IFEval": 0.2907, + "BBH": 0.5265, + "MATH Level 5": 0.1193, + "GPQA": 0.3205, + "MUSR": 0.417, + "MMLU-PRO": 0.3586 + } + }, + { + "model_id": "Xclbr7/Hyena-12b", + "name": "Hyena-12b", + "developer": "Xclbr7", + "scores": { + "IFEval": 0.3404, + "BBH": 0.5457, + "MATH Level 5": 0.1133, + "GPQA": 0.2978, + "MUSR": 0.3984, + "MMLU-PRO": 0.3439 + } + }, + { + "model_id": "Xclbr7/caliburn-12b", + "name": "caliburn-12b", + "developer": "Xclbr7", + "scores": { + "IFEval": 0.3576, + "BBH": 0.5519, + "MATH Level 5": 0.1125, + "GPQA": 0.3364, + "MUSR": 0.4292, + "MMLU-PRO": 0.3675 + } + }, + { + "model_id": "Xclbr7/caliburn-v2-12b", + "name": "caliburn-v2-12b", + "developer": "Xclbr7", + "scores": { + "IFEval": 0.2967, + "BBH": 0.5141, + "MATH Level 5": 0.105, + "GPQA": 0.3263, + "MUSR": 0.437, + "MMLU-PRO": 0.3784 + } + }, + { + "model_id": "Xiaojian9992024/Llama3.2-1B-THREADRIPPER", + "name": "Llama3.2-1B-THREADRIPPER", + "developer": "Xiaojian9992024", + "scores": { + "IFEval": 0.5576, + "BBH": 0.3544, + "MATH Level 5": 0.074, + "GPQA": 0.2609, + "MUSR": 0.313, + "MMLU-PRO": 0.1763 + } + }, + { + "model_id": "Xiaojian9992024/Llama3.2-1B-THREADRIPPER-v0.2", + "name": "Llama3.2-1B-THREADRIPPER-v0.2", + "developer": "Xiaojian9992024", + "scores": { + "IFEval": 0.5318, + "BBH": 0.3528, + "MATH Level 5": 0.0657, + "GPQA": 0.2659, + "MUSR": 0.3316, + "MMLU-PRO": 0.1745 + } + }, + { + "model_id": "Xiaojian9992024/Phi-4-Megatron-Empathetic", + "name": "Phi-4-Megatron-Empathetic", + "developer": "Xiaojian9992024", + "scores": { + "IFEval": 0.0173, + "BBH": 0.6673, + "MATH Level 5": 0.2696, + "GPQA": 0.3859, + "MUSR": 0.5071, + "MMLU-PRO": 0.5082 + } + }, + { + "model_id": "Xiaojian9992024/Phi-4-mini-UNOFFICAL", + "name": "Phi-4-mini-UNOFFICAL", + "developer": "Xiaojian9992024", + "scores": { + "IFEval": 0.1273, + "BBH": 0.2944, + "MATH Level 5": 0.0, + "GPQA": 0.2408, + "MUSR": 0.3368, + "MMLU-PRO": 0.1144 + } + }, + { + "model_id": "Xiaojian9992024/Qwen2.5-7B-MS-Destroyer", + "name": "Qwen2.5-7B-MS-Destroyer", + "developer": "Xiaojian9992024", + "scores": { + "IFEval": 0.7296, + "BBH": 0.547, + "MATH Level 5": 0.4592, + "GPQA": 0.3045, + "MUSR": 0.427, + "MMLU-PRO": 0.4412 + } + }, + { + "model_id": "Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview", + "name": "Qwen2.5-Dyanka-7B-Preview", + "developer": "Xiaojian9992024", + "scores": { + "IFEval": 0.764, + "BBH": 0.5543, + "MATH Level 5": 0.4879, + "GPQA": 0.3171, + "MUSR": 0.4481, + "MMLU-PRO": 0.4376 + } + }, + { + "model_id": "Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview-v0.2", + "name": "Qwen2.5-Dyanka-7B-Preview-v0.2", + "developer": "Xiaojian9992024", + "scores": { + "IFEval": 0.6702, + "BBH": 0.5374, + "MATH Level 5": 0.4721, + "GPQA": 0.2936, + "MUSR": 0.4467, + "MMLU-PRO": 0.4371 + } + }, + { + "model_id": "Xiaojian9992024/Qwen2.5-THREADRIPPER-Medium-Censored", + "name": "Qwen2.5-THREADRIPPER-Medium-Censored", + "developer": "Xiaojian9992024", + "scores": { + "IFEval": 0.8112, + "BBH": 0.6431, + "MATH Level 5": 0.534, + "GPQA": 0.3347, + "MUSR": 0.414, + "MMLU-PRO": 0.4929 + } + }, + { + "model_id": "Xiaojian9992024/Qwen2.5-THREADRIPPER-Small", + "name": "Qwen2.5-THREADRIPPER-Small", + "developer": "Xiaojian9992024", + "scores": { + "IFEval": 0.7689, + "BBH": 0.549, + "MATH Level 5": 0.4736, + "GPQA": 0.3104, + "MUSR": 0.4349, + "MMLU-PRO": 0.4357 + } + }, + { + "model_id": "Xiaojian9992024/Qwen2.5-THREADRIPPER-Small-AnniversaryEdition", + "name": "Qwen2.5-THREADRIPPER-Small-AnniversaryEdition", + "developer": "Xiaojian9992024", + "scores": { + "IFEval": 0.7404, + "BBH": 0.5465, + "MATH Level 5": 0.5076, + "GPQA": 0.2685, + "MUSR": 0.3807, + "MMLU-PRO": 0.4393 + } + }, + { + "model_id": "Xiaojian9992024/Qwen2.5-Ultra-1.5B-25.02-Exp", + "name": "Qwen2.5-Ultra-1.5B-25.02-Exp", + "developer": "Xiaojian9992024", + "scores": { + "IFEval": 0.4073, + "BBH": 0.4066, + "MATH Level 5": 0.0831, + "GPQA": 0.2584, + "MUSR": 0.3383, + "MMLU-PRO": 0.2641 + } + }, + { + "model_id": "Xiaojian9992024/Reflection-L3.2-JametMiniMix-3B", + "name": "Reflection-L3.2-JametMiniMix-3B", + "developer": "Xiaojian9992024", + "scores": { + "IFEval": 0.4619, + "BBH": 0.439, + "MATH Level 5": 0.1193, + "GPQA": 0.2945, + "MUSR": 0.3667, + "MMLU-PRO": 0.2988 + } + }, + { + "model_id": "Xkev/Llama-3.2V-11B-cot", + "name": "Llama-3.2V-11B-cot", + "developer": "Xkev", + "scores": { + "IFEval": 0.4158, + "BBH": 0.4959, + "MATH Level 5": 0.1556, + "GPQA": 0.2953, + "MUSR": 0.4159, + "MMLU-PRO": 0.3587 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-1M-YOYO-V3", + "name": "Qwen2.5-14B-1M-YOYO-V3", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.8398, + "BBH": 0.6448, + "MATH Level 5": 0.5355, + "GPQA": 0.3289, + "MUSR": 0.4141, + "MMLU-PRO": 0.5207 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-YOYO-0505", + "name": "Qwen2.5-14B-YOYO-0505", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.5883, + "BBH": 0.6539, + "MATH Level 5": 0.4434, + "GPQA": 0.3733, + "MUSR": 0.4757, + "MMLU-PRO": 0.5371 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-YOYO-0510-v2", + "name": "Qwen2.5-14B-YOYO-0510-v2", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.5947, + "BBH": 0.6553, + "MATH Level 5": 0.4441, + "GPQA": 0.3817, + "MUSR": 0.4744, + "MMLU-PRO": 0.5381 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-YOYO-0805", + "name": "Qwen2.5-14B-YOYO-0805", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.5883, + "BBH": 0.6539, + "MATH Level 5": 0.4434, + "GPQA": 0.3733, + "MUSR": 0.4757, + "MMLU-PRO": 0.5371 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-YOYO-1005", + "name": "Qwen2.5-14B-YOYO-1005", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.5972, + "BBH": 0.6542, + "MATH Level 5": 0.4524, + "GPQA": 0.3809, + "MUSR": 0.473, + "MMLU-PRO": 0.5382 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-YOYO-1005-v2", + "name": "Qwen2.5-14B-YOYO-1005-v2", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.5953, + "BBH": 0.6551, + "MATH Level 5": 0.4434, + "GPQA": 0.3842, + "MUSR": 0.4731, + "MMLU-PRO": 0.5372 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-YOYO-1010", + "name": "Qwen2.5-14B-YOYO-1010", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.7905, + "BBH": 0.6406, + "MATH Level 5": 0.0, + "GPQA": 0.3163, + "MUSR": 0.4181, + "MMLU-PRO": 0.4944 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-YOYO-1010-v2", + "name": "Qwen2.5-14B-YOYO-1010-v2", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.5947, + "BBH": 0.6553, + "MATH Level 5": 0.4441, + "GPQA": 0.3817, + "MUSR": 0.4744, + "MMLU-PRO": 0.5381 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-YOYO-SCE", + "name": "Qwen2.5-14B-YOYO-SCE", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.5844, + "BBH": 0.6489, + "MATH Level 5": 0.4615, + "GPQA": 0.3742, + "MUSR": 0.4704, + "MMLU-PRO": 0.5381 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-YOYO-V4", + "name": "Qwen2.5-14B-YOYO-V4", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.8398, + "BBH": 0.649, + "MATH Level 5": 0.5347, + "GPQA": 0.3221, + "MUSR": 0.4115, + "MMLU-PRO": 0.517 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-YOYO-V4-p1", + "name": "Qwen2.5-14B-YOYO-V4-p1", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.8203, + "BBH": 0.6516, + "MATH Level 5": 0.5332, + "GPQA": 0.3456, + "MUSR": 0.4194, + "MMLU-PRO": 0.502 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-YOYO-V4-p2", + "name": "Qwen2.5-14B-YOYO-V4-p2", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.8048, + "BBH": 0.6339, + "MATH Level 5": 0.5166, + "GPQA": 0.3272, + "MUSR": 0.4435, + "MMLU-PRO": 0.4968 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-YOYO-latest", + "name": "Qwen2.5-14B-YOYO-latest", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.5911, + "BBH": 0.6656, + "MATH Level 5": 0.4418, + "GPQA": 0.3826, + "MUSR": 0.4691, + "MMLU-PRO": 0.5371 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-YOYO-latest-V2", + "name": "Qwen2.5-14B-YOYO-latest-V2", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.7771, + "BBH": 0.6299, + "MATH Level 5": 0.5159, + "GPQA": 0.354, + "MUSR": 0.4299, + "MMLU-PRO": 0.5224 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-14B-it-restore", + "name": "Qwen2.5-14B-it-restore", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.8209, + "BBH": 0.6388, + "MATH Level 5": 0.537, + "GPQA": 0.3372, + "MUSR": 0.4087, + "MMLU-PRO": 0.49 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-7B-it-restore", + "name": "Qwen2.5-7B-it-restore", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.7531, + "BBH": 0.5407, + "MATH Level 5": 0.5, + "GPQA": 0.3012, + "MUSR": 0.4007, + "MMLU-PRO": 0.4288 + } + }, + { + "model_id": "YOYO-AI/Qwen2.5-Coder-14B-YOYO-1010", + "name": "Qwen2.5-Coder-14B-YOYO-1010", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.5336, + "BBH": 0.6187, + "MATH Level 5": 0.3218, + "GPQA": 0.3523, + "MUSR": 0.4422, + "MMLU-PRO": 0.4075 + } + }, + { + "model_id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B", + "name": "ZYH-LLM-Qwen2.5-14B", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.5941, + "BBH": 0.6644, + "MATH Level 5": 0.4116, + "GPQA": 0.3859, + "MUSR": 0.4757, + "MMLU-PRO": 0.5351 + } + }, + { + "model_id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B-V2", + "name": "ZYH-LLM-Qwen2.5-14B-V2", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.5071, + "BBH": 0.6452, + "MATH Level 5": 0.3542, + "GPQA": 0.3792, + "MUSR": 0.4689, + "MMLU-PRO": 0.5372 + } + }, + { + "model_id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B-V3", + "name": "ZYH-LLM-Qwen2.5-14B-V3", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.8578, + "BBH": 0.6359, + "MATH Level 5": 0.5272, + "GPQA": 0.3322, + "MUSR": 0.4022, + "MMLU-PRO": 0.4881 + } + }, + { + "model_id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B-V4", + "name": "ZYH-LLM-Qwen2.5-14B-V4", + "developer": "YOYO-AI", + "scores": { + "IFEval": 0.8365, + "BBH": 0.6515, + "MATH Level 5": 0.5393, + "GPQA": 0.3146, + "MUSR": 0.4434, + "MMLU-PRO": 0.5204 + } + }, + { + "model_id": "Yash21/TinyYi-7B-Test", + "name": "TinyYi-7B-Test", + "developer": "Yash21", + "scores": { + "IFEval": 0.1856, + "BBH": 0.291, + "MATH Level 5": 0.0, + "GPQA": 0.2643, + "MUSR": 0.3364, + "MMLU-PRO": 0.1091 + } + }, + { + "model_id": "Youlln/1PARAMMYL-8B-ModelStock", + "name": "1PARAMMYL-8B-ModelStock", + "developer": "Youlln", + "scores": { + "IFEval": 0.5371, + "BBH": 0.5216, + "MATH Level 5": 0.1488, + "GPQA": 0.3238, + "MUSR": 0.4409, + "MMLU-PRO": 0.4 + } + }, + { + "model_id": "Youlln/2PRYMMAL-Yi1.5-6B-SLERP", + "name": "2PRYMMAL-Yi1.5-6B-SLERP", + "developer": "Youlln", + "scores": { + "IFEval": 0.2826, + "BBH": 0.4665, + "MATH Level 5": 0.1133, + "GPQA": 0.307, + "MUSR": 0.4756, + "MMLU-PRO": 0.317 + } + }, + { + "model_id": "Youlln/3PRYMMAL-PHI3-3B-SLERP", + "name": "3PRYMMAL-PHI3-3B-SLERP", + "developer": "Youlln", + "scores": { + "IFEval": 0.3656, + "BBH": 0.5422, + "MATH Level 5": 0.1715, + "GPQA": 0.3263, + "MUSR": 0.4648, + "MMLU-PRO": 0.4002 + } + }, + { + "model_id": "Youlln/4PRYMMAL-GEMMA2-9B-SLERP", + "name": "4PRYMMAL-GEMMA2-9B-SLERP", + "developer": "Youlln", + "scores": { + "IFEval": 0.2714, + "BBH": 0.5923, + "MATH Level 5": 0.0906, + "GPQA": 0.3305, + "MUSR": 0.4672, + "MMLU-PRO": 0.421 + } + }, + { + "model_id": "Youlln/ECE-MIRAGE-1-12B", + "name": "ECE-MIRAGE-1-12B", + "developer": "Youlln", + "scores": { + "IFEval": 0.207, + "BBH": 0.3011, + "MATH Level 5": 0.0, + "GPQA": 0.2634, + "MUSR": 0.3219, + "MMLU-PRO": 0.111 + } + }, + { + "model_id": "Youlln/ECE-MIRAGE-1-15B", + "name": "ECE-MIRAGE-1-15B", + "developer": "Youlln", + "scores": { + "IFEval": 0.207, + "BBH": 0.3011, + "MATH Level 5": 0.0, + "GPQA": 0.2634, + "MUSR": 0.3219, + "MMLU-PRO": 0.111 + } + }, + { + "model_id": "Youlln/ECE-PRYMMAL-0.5B-FT-V3", + "name": "ECE-PRYMMAL-0.5B-FT-V3", + "developer": "Youlln", + "scores": { + "IFEval": 0.1642, + "BBH": 0.3093, + "MATH Level 5": 0.003, + "GPQA": 0.2576, + "MUSR": 0.3644, + "MMLU-PRO": 0.1161 + } + }, + { + "model_id": "Youlln/ECE-PRYMMAL-0.5B-FT-V3-MUSR", + "name": "ECE-PRYMMAL-0.5B-FT-V3-MUSR", + "developer": "Youlln", + "scores": { + "IFEval": 0.1533, + "BBH": 0.3041, + "MATH Level 5": 0.0242, + "GPQA": 0.2492, + "MUSR": 0.366, + "MMLU-PRO": 0.1645 + } + }, + { + "model_id": "Youlln/ECE-PRYMMAL-0.5B-FT-V4-MUSR", + "name": "ECE-PRYMMAL-0.5B-FT-V4-MUSR", + "developer": "Youlln", + "scores": { + "IFEval": 0.1138, + "BBH": 0.3038, + "MATH Level 5": 0.0121, + "GPQA": 0.2701, + "MUSR": 0.3529, + "MMLU-PRO": 0.1321 + } + }, + { + "model_id": "Youlln/ECE-PRYMMAL-0.5B-SLERP-V2", + "name": "ECE-PRYMMAL-0.5B-SLERP-V2", + "developer": "Youlln", + "scores": { + "IFEval": 0.1612, + "BBH": 0.2935, + "MATH Level 5": 0.0008, + "GPQA": 0.2743, + "MUSR": 0.3831, + "MMLU-PRO": 0.1095 + } + }, + { + "model_id": "Youlln/ECE-PRYMMAL-0.5B-SLERP-V3", + "name": "ECE-PRYMMAL-0.5B-SLERP-V3", + "developer": "Youlln", + "scores": { + "IFEval": 0.167, + "BBH": 0.2938, + "MATH Level 5": 0.0, + "GPQA": 0.2517, + "MUSR": 0.3541, + "MMLU-PRO": 0.1087 + } + }, + { + "model_id": "Youlln/ECE-PRYMMAL-YL-1B-SLERP-V1", + "name": "ECE-PRYMMAL-YL-1B-SLERP-V1", + "developer": "Youlln", + "scores": { + "IFEval": 0.3251, + "BBH": 0.4209, + "MATH Level 5": 0.1073, + "GPQA": 0.2911, + "MUSR": 0.4266, + "MMLU-PRO": 0.2936 + } + }, + { + "model_id": "Youlln/ECE-PRYMMAL-YL-1B-SLERP-V2", + "name": "ECE-PRYMMAL-YL-1B-SLERP-V2", + "developer": "Youlln", + "scores": { + "IFEval": 0.3251, + "BBH": 0.4209, + "MATH Level 5": 0.1073, + "GPQA": 0.2911, + "MUSR": 0.4266, + "MMLU-PRO": 0.2936 + } + }, + { + "model_id": "Youlln/ECE-PRYMMAL-YL-7B-SLERP-V4", + "name": "ECE-PRYMMAL-YL-7B-SLERP-V4", + "developer": "Youlln", + "scores": { + "IFEval": 0.251, + "BBH": 0.377, + "MATH Level 5": 0.0536, + "GPQA": 0.2651, + "MUSR": 0.3745, + "MMLU-PRO": 0.2132 + } + }, + { + "model_id": "Youlln/ECE-PRYMMAL0.5-FT", + "name": "ECE-PRYMMAL0.5-FT", + "developer": "Youlln", + "scores": { + "IFEval": 0.1851, + "BBH": 0.3132, + "MATH Level 5": 0.0234, + "GPQA": 0.2559, + "MUSR": 0.3301, + "MMLU-PRO": 0.1477 + } + }, + { + "model_id": "Youlln/ECE-PRYMMAL0.5B-Youri", + "name": "ECE-PRYMMAL0.5B-Youri", + "developer": "Youlln", + "scores": { + "IFEval": 0.1446, + "BBH": 0.2817, + "MATH Level 5": 0.0, + "GPQA": 0.2433, + "MUSR": 0.3697, + "MMLU-PRO": 0.1095 + } + }, + { + "model_id": "Youlln/ECE-PRYMMAL1B-FT-V1", + "name": "ECE-PRYMMAL1B-FT-V1", + "developer": "Youlln", + "scores": { + "IFEval": 0.2144, + "BBH": 0.4033, + "MATH Level 5": 0.0642, + "GPQA": 0.2785, + "MUSR": 0.3417, + "MMLU-PRO": 0.2743 + } + }, + { + "model_id": "Youlln/ECE-Qwen0.5B-FT-V2", + "name": "ECE-Qwen0.5B-FT-V2", + "developer": "Youlln", + "scores": { + "IFEval": 0.2526, + "BBH": 0.329, + "MATH Level 5": 0.0204, + "GPQA": 0.2668, + "MUSR": 0.3063, + "MMLU-PRO": 0.1666 + } + }, + { + "model_id": "Youlln/ECE.EIFFEIL.ia-0.5B-SLERP", + "name": "ECE.EIFFEIL.ia-0.5B-SLERP", + "developer": "Youlln", + "scores": { + "IFEval": 0.2561, + "BBH": 0.3306, + "MATH Level 5": 0.0597, + "GPQA": 0.2651, + "MUSR": 0.3102, + "MMLU-PRO": 0.1903 + } + }, + { + "model_id": "YoungPanda/qwenqwen", + "name": "qwenqwen", + "developer": "YoungPanda", + "scores": { + "IFEval": 0.1264, + "BBH": 0.3379, + "MATH Level 5": 0.0355, + "GPQA": 0.25, + "MUSR": 0.3434, + "MMLU-PRO": 0.1168 + } + }, + { + "model_id": "Yuma42/KangalKhan-RawRuby-7B", + "name": "KangalKhan-RawRuby-7B", + "developer": "Yuma42", + "scores": { + "IFEval": 0.5477, + "BBH": 0.4755, + "MATH Level 5": 0.0665, + "GPQA": 0.2878, + "MUSR": 0.395, + "MMLU-PRO": 0.3023 + } + }, + { + "model_id": "Yuma42/Llama3.1-IgneousIguana-8B", + "name": "Llama3.1-IgneousIguana-8B", + "developer": "Yuma42", + "scores": { + "IFEval": 0.8133, + "BBH": 0.5191, + "MATH Level 5": 0.2198, + "GPQA": 0.3104, + "MUSR": 0.4203, + "MMLU-PRO": 0.3974 + } + }, + { + "model_id": "Yuma42/Llama3.1-SuperHawk-8B", + "name": "Llama3.1-SuperHawk-8B", + "developer": "Yuma42", + "scores": { + "IFEval": 0.7986, + "BBH": 0.52, + "MATH Level 5": 0.2349, + "GPQA": 0.3129, + "MUSR": 0.4084, + "MMLU-PRO": 0.3945 + } + }, + { + "model_id": "Z1-Coder/Z1-Coder-7B", + "name": "Z1-Coder-7B", + "developer": "Z1-Coder", + "scores": { + "IFEval": 0.3215, + "BBH": 0.4842, + "MATH Level 5": 0.3248, + "GPQA": 0.2727, + "MUSR": 0.3622, + "MMLU-PRO": 0.3759 + } + }, + { + "model_id": "ZHLiu627/zephyr-7b-gemma-dpo-avg", + "name": "zephyr-7b-gemma-dpo-avg", + "developer": "ZHLiu627", + "scores": { + "IFEval": 0.309, + "BBH": 0.4149, + "MATH Level 5": 0.0453, + "GPQA": 0.2785, + "MUSR": 0.4107, + "MMLU-PRO": 0.2851 + } + }, + { + "model_id": "ZHLiu627/zephyr-7b-gemma-rpo-avg", + "name": "zephyr-7b-gemma-rpo-avg", + "developer": "ZHLiu627", + "scores": { + "IFEval": 0.3006, + "BBH": 0.4183, + "MATH Level 5": 0.0498, + "GPQA": 0.2768, + "MUSR": 0.4081, + "MMLU-PRO": 0.2831 + } + }, + { + "model_id": "ZeroXClem/L3-Aspire-Heart-Matrix-8B", + "name": "L3-Aspire-Heart-Matrix-8B", + "developer": "ZeroXClem", + "scores": { + "IFEval": 0.4834, + "BBH": 0.5384, + "MATH Level 5": 0.1828, + "GPQA": 0.3247, + "MUSR": 0.4187, + "MMLU-PRO": 0.3785 + } + }, + { + "model_id": "ZeroXClem/Llama-3.1-8B-AthenaSky-MegaMix", + "name": "Llama-3.1-8B-AthenaSky-MegaMix", + "developer": "ZeroXClem", + "scores": { + "IFEval": 0.6301, + "BBH": 0.5163, + "MATH Level 5": 0.2795, + "GPQA": 0.2777, + "MUSR": 0.3538, + "MMLU-PRO": 0.3504 + } + }, + { + "model_id": "ZeroXClem/Llama-3.1-8B-RainbowLight-EtherealMix", + "name": "Llama-3.1-8B-RainbowLight-EtherealMix", + "developer": "ZeroXClem", + "scores": { + "IFEval": 0.4973, + "BBH": 0.5155, + "MATH Level 5": 0.1216, + "GPQA": 0.2869, + "MUSR": 0.3947, + "MMLU-PRO": 0.363 + } + }, + { + "model_id": "ZeroXClem/Llama-3.1-8B-SpecialTitanFusion", + "name": "Llama-3.1-8B-SpecialTitanFusion", + "developer": "ZeroXClem", + "scores": { + "IFEval": 0.7402, + "BBH": 0.5439, + "MATH Level 5": 0.2334, + "GPQA": 0.2995, + "MUSR": 0.3874, + "MMLU-PRO": 0.3621 + } + }, + { + "model_id": "ZeroXClem/Llama-3.1-8B-SuperNova-EtherealHermes", + "name": "Llama-3.1-8B-SuperNova-EtherealHermes", + "developer": "ZeroXClem", + "scores": { + "IFEval": 0.7339, + "BBH": 0.5244, + "MATH Level 5": 0.1745, + "GPQA": 0.2928, + "MUSR": 0.4066, + "MMLU-PRO": 0.3745 + } + }, + { + "model_id": "ZeroXClem/Llama-3.1-8B-SuperTulu-LexiNova", + "name": "Llama-3.1-8B-SuperTulu-LexiNova", + "developer": "ZeroXClem", + "scores": { + "IFEval": 0.4165, + "BBH": 0.5079, + "MATH Level 5": 0.253, + "GPQA": 0.2861, + "MUSR": 0.3971, + "MMLU-PRO": 0.3368 + } + }, + { + "model_id": "ZeroXClem/Qwen-2.5-Aether-SlerpFusion-7B", + "name": "Qwen-2.5-Aether-SlerpFusion-7B", + "developer": "ZeroXClem", + "scores": { + "IFEval": 0.6262, + "BBH": 0.5462, + "MATH Level 5": 0.2734, + "GPQA": 0.2987, + "MUSR": 0.4178, + "MMLU-PRO": 0.4327 + } + }, + { + "model_id": "ZeroXClem/Qwen2.5-7B-CelestialHarmony-1M", + "name": "Qwen2.5-7B-CelestialHarmony-1M", + "developer": "ZeroXClem", + "scores": { + "IFEval": 0.5944, + "BBH": 0.5431, + "MATH Level 5": 0.3474, + "GPQA": 0.3188, + "MUSR": 0.4595, + "MMLU-PRO": 0.4387 + } + }, + { + "model_id": "ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix", + "name": "Qwen2.5-7B-HomerAnvita-NerdMix", + "developer": "ZeroXClem", + "scores": { + "IFEval": 0.7708, + "BBH": 0.5541, + "MATH Level 5": 0.3837, + "GPQA": 0.3196, + "MUSR": 0.4391, + "MMLU-PRO": 0.4432 + } + }, + { + "model_id": "ZeroXClem/Qwen2.5-7B-HomerCreative-Mix", + "name": "Qwen2.5-7B-HomerCreative-Mix", + "developer": "ZeroXClem", + "scores": { + "IFEval": 0.7835, + "BBH": 0.5548, + "MATH Level 5": 0.3565, + "GPQA": 0.2995, + "MUSR": 0.435, + "MMLU-PRO": 0.4447 + } + }, + { + "model_id": "ZeroXClem/Qwen2.5-7B-Qandora-CySec", + "name": "Qwen2.5-7B-Qandora-CySec", + "developer": "ZeroXClem", + "scores": { + "IFEval": 0.6773, + "BBH": 0.549, + "MATH Level 5": 0.2931, + "GPQA": 0.3003, + "MUSR": 0.4286, + "MMLU-PRO": 0.4485 + } + }, + { + "model_id": "ZeusLabs/L3-Aethora-15B-V2", + "name": "L3-Aethora-15B-V2", + "developer": "ZeusLabs", + "scores": { + "IFEval": 0.7208, + "BBH": 0.5011, + "MATH Level 5": 0.0808, + "GPQA": 0.2878, + "MUSR": 0.3871, + "MMLU-PRO": 0.35 + } + }, + { + "model_id": "ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3", + "name": "SELM-Llama-3-8B-Instruct-iter-3", + "developer": "ZhangShenao", + "scores": { + "IFEval": 0.6903, + "BBH": 0.5046, + "MATH Level 5": 0.0861, + "GPQA": 0.2584, + "MUSR": 0.3845, + "MMLU-PRO": 0.3783 + } + }, + { + "model_id": "aaditya/Llama3-OpenBioLLM-70B", + "name": "Llama3-OpenBioLLM-70B", + "developer": "aaditya", + "scores": { + "IFEval": 0.7597, + "BBH": 0.6399, + "MATH Level 5": 0.1971, + "GPQA": 0.323, + "MUSR": 0.4417, + "MMLU-PRO": 0.4867 + } + }, + { + "model_id": "abacusai/Dracarys-72B-Instruct", + "name": "Dracarys-72B-Instruct", + "developer": "abacusai", + "scores": { + "IFEval": 0.7856, + "BBH": 0.6944, + "MATH Level 5": 0.3965, + "GPQA": 0.3909, + "MUSR": 0.4558, + "MMLU-PRO": 0.5456 + } + }, + { + "model_id": "abacusai/Liberated-Qwen1.5-14B", + "name": "Liberated-Qwen1.5-14B", + "developer": "abacusai", + "scores": { + "IFEval": 0.3631, + "BBH": 0.4948, + "MATH Level 5": 0.1601, + "GPQA": 0.2836, + "MUSR": 0.4175, + "MMLU-PRO": 0.3512 + } + }, + { + "model_id": "abacusai/Llama-3-Smaug-8B", + "name": "Llama-3-Smaug-8B", + "developer": "abacusai", + "scores": { + "IFEval": 0.4867, + "BBH": 0.4931, + "MATH Level 5": 0.0853, + "GPQA": 0.2483, + "MUSR": 0.3622, + "MMLU-PRO": 0.3185 + } + }, + { + "model_id": "abacusai/Smaug-34B-v0.1", + "name": "Smaug-34B-v0.1", + "developer": "abacusai", + "scores": { + "IFEval": 0.5016, + "BBH": 0.5358, + "MATH Level 5": 0.0718, + "GPQA": 0.3297, + "MUSR": 0.3979, + "MMLU-PRO": 0.4543 + } + }, + { + "model_id": "abacusai/Smaug-72B-v0.1", + "name": "Smaug-72B-v0.1", + "developer": "abacusai", + "scores": { + "IFEval": 0.5167, + "BBH": 0.5996, + "MATH Level 5": 0.1911, + "GPQA": 0.3238, + "MUSR": 0.4473, + "MMLU-PRO": 0.4624 + } + }, + { + "model_id": "abacusai/Smaug-Llama-3-70B-Instruct-32K", + "name": "Smaug-Llama-3-70B-Instruct-32K", + "developer": "abacusai", + "scores": { + "IFEval": 0.7761, + "BBH": 0.6493, + "MATH Level 5": 0.2749, + "GPQA": 0.2961, + "MUSR": 0.4208, + "MMLU-PRO": 0.4765 + } + }, + { + "model_id": "abacusai/Smaug-Mixtral-v0.1", + "name": "Smaug-Mixtral-v0.1", + "developer": "abacusai", + "scores": { + "IFEval": 0.5554, + "BBH": 0.5162, + "MATH Level 5": 0.0952, + "GPQA": 0.3012, + "MUSR": 0.4298, + "MMLU-PRO": 0.3352 + } + }, + { + "model_id": "abacusai/Smaug-Qwen2-72B-Instruct", + "name": "Smaug-Qwen2-72B-Instruct", + "developer": "abacusai", + "scores": { + "IFEval": 0.7825, + "BBH": 0.691, + "MATH Level 5": 0.4131, + "GPQA": 0.3616, + "MUSR": 0.4401, + "MMLU-PRO": 0.519 + } + }, + { + "model_id": "abacusai/bigstral-12b-32k", + "name": "bigstral-12b-32k", + "developer": "abacusai", + "scores": { + "IFEval": 0.4194, + "BBH": 0.47, + "MATH Level 5": 0.0151, + "GPQA": 0.2928, + "MUSR": 0.456, + "MMLU-PRO": 0.2641 + } + }, + { + "model_id": "abacusai/bigyi-15b", + "name": "bigyi-15b", + "developer": "abacusai", + "scores": { + "IFEval": 0.2094, + "BBH": 0.4345, + "MATH Level 5": 0.0295, + "GPQA": 0.3096, + "MUSR": 0.3538, + "MMLU-PRO": 0.3003 + } + }, + { + "model_id": "abhishek/autotrain-0tmgq-5tpbg", + "name": "autotrain-0tmgq-5tpbg", + "developer": "abhishek", + "scores": { + "IFEval": 0.1957, + "BBH": 0.3135, + "MATH Level 5": 0.0, + "GPQA": 0.2517, + "MUSR": 0.365, + "MMLU-PRO": 0.1151 + } + }, + { + "model_id": "abhishek/autotrain-llama3-70b-orpo-v1", + "name": "autotrain-llama3-70b-orpo-v1", + "developer": "abhishek", + "scores": { + "IFEval": 0.4233, + "BBH": 0.5998, + "MATH Level 5": 0.0106, + "GPQA": 0.2441, + "MUSR": 0.3579, + "MMLU-PRO": 0.1122 + } + }, + { + "model_id": "abhishek/autotrain-llama3-70b-orpo-v2", + "name": "autotrain-llama3-70b-orpo-v2", + "developer": "abhishek", + "scores": { + "IFEval": 0.5406, + "BBH": 0.5899, + "MATH Level 5": 0.2107, + "GPQA": 0.2936, + "MUSR": 0.4113, + "MMLU-PRO": 0.4818 + } + }, + { + "model_id": "abhishek/autotrain-llama3-orpo-v2", + "name": "autotrain-llama3-orpo-v2", + "developer": "abhishek", + "scores": { + "IFEval": 0.4372, + "BBH": 0.3159, + "MATH Level 5": 0.0468, + "GPQA": 0.2668, + "MUSR": 0.3792, + "MMLU-PRO": 0.2218 + } + }, + { + "model_id": "abhishek/autotrain-vr4a1-e5mms", + "name": "autotrain-vr4a1-e5mms", + "developer": "abhishek", + "scores": { + "IFEval": 0.2142, + "BBH": 0.5001, + "MATH Level 5": 0.1412, + "GPQA": 0.3196, + "MUSR": 0.3891, + "MMLU-PRO": 0.3667 + } + }, + { + "model_id": "abideen/MedPhi-4-14B-v1", + "name": "MedPhi-4-14B-v1", + "developer": "abideen", + "scores": { + "IFEval": 0.6277, + "BBH": 0.6897, + "MATH Level 5": 0.2931, + "GPQA": 0.344, + "MUSR": 0.4155, + "MMLU-PRO": 0.5338 + } + }, + { + "model_id": "adamo1139/Yi-34B-200K-AEZAKMI-v2", + "name": "Yi-34B-200K-AEZAKMI-v2", + "developer": "adamo1139", + "scores": { + "IFEval": 0.4555, + "BBH": 0.5384, + "MATH Level 5": 0.0566, + "GPQA": 0.3322, + "MUSR": 0.3886, + "MMLU-PRO": 0.4513 + } + }, + { + "model_id": "adriszmar/QAIMath-Qwen2.5-7B-TIES", + "name": "QAIMath-Qwen2.5-7B-TIES", + "developer": "adriszmar", + "scores": { + "IFEval": 0.1685, + "BBH": 0.3124, + "MATH Level 5": 0.0015, + "GPQA": 0.2492, + "MUSR": 0.3963, + "MMLU-PRO": 0.1066 + } + }, + { + "model_id": "aevalone/distill_qw_test", + "name": "distill_qw_test", + "developer": "aevalone", + "scores": { + "IFEval": 0.7409, + "BBH": 0.5246, + "MATH Level 5": 0.4781, + "GPQA": 0.3003, + "MUSR": 0.386, + "MMLU-PRO": 0.4092 + } + }, + { + "model_id": "agentlans/Gemma2-9B-AdvancedFuse", + "name": "Gemma2-9B-AdvancedFuse", + "developer": "agentlans", + "scores": { + "IFEval": 0.1543, + "BBH": 0.5859, + "MATH Level 5": 0.1005, + "GPQA": 0.3347, + "MUSR": 0.4231, + "MMLU-PRO": 0.4 + } + }, + { + "model_id": "agentlans/Llama-3.2-1B-Instruct-CrashCourse12K", + "name": "Llama-3.2-1B-Instruct-CrashCourse12K", + "developer": "agentlans", + "scores": { + "IFEval": 0.5395, + "BBH": 0.3548, + "MATH Level 5": 0.071, + "GPQA": 0.2408, + "MUSR": 0.321, + "MMLU-PRO": 0.1809 + } + }, + { + "model_id": "agentlans/Llama3.1-8B-drill", + "name": "Llama3.1-8B-drill", + "developer": "agentlans", + "scores": { + "IFEval": 0.7652, + "BBH": 0.5016, + "MATH Level 5": 0.1715, + "GPQA": 0.2676, + "MUSR": 0.3672, + "MMLU-PRO": 0.3776 + } + }, + { + "model_id": "agentlans/Llama3.1-Daredevilish", + "name": "Llama3.1-Daredevilish", + "developer": "agentlans", + "scores": { + "IFEval": 0.6292, + "BBH": 0.5013, + "MATH Level 5": 0.1292, + "GPQA": 0.3012, + "MUSR": 0.4091, + "MMLU-PRO": 0.3697 + } + }, + { + "model_id": "agentlans/Llama3.1-Daredevilish-Instruct", + "name": "Llama3.1-Daredevilish-Instruct", + "developer": "agentlans", + "scores": { + "IFEval": 0.7926, + "BBH": 0.5235, + "MATH Level 5": 0.1722, + "GPQA": 0.307, + "MUSR": 0.3911, + "MMLU-PRO": 0.3877 + } + }, + { + "model_id": "agentlans/Llama3.1-LexiHermes-SuperStorm", + "name": "Llama3.1-LexiHermes-SuperStorm", + "developer": "agentlans", + "scores": { + "IFEval": 0.7835, + "BBH": 0.5266, + "MATH Level 5": 0.1616, + "GPQA": 0.323, + "MUSR": 0.3963, + "MMLU-PRO": 0.3844 + } + }, + { + "model_id": "agentlans/Llama3.1-SuperDeepFuse", + "name": "Llama3.1-SuperDeepFuse", + "developer": "agentlans", + "scores": { + "IFEval": 0.7762, + "BBH": 0.5049, + "MATH Level 5": 0.1828, + "GPQA": 0.2743, + "MUSR": 0.3699, + "MMLU-PRO": 0.3775 + } + }, + { + "model_id": "agentlans/Llama3.1-SuperDeepFuse-CrashCourse12K", + "name": "Llama3.1-SuperDeepFuse-CrashCourse12K", + "developer": "agentlans", + "scores": { + "IFEval": 0.7187, + "BBH": 0.5216, + "MATH Level 5": 0.1805, + "GPQA": 0.3129, + "MUSR": 0.4026, + "MMLU-PRO": 0.3631 + } + }, + { + "model_id": "agentlans/Qwen2.5-0.5B-Instruct-CrashCourse-dropout", + "name": "Qwen2.5-0.5B-Instruct-CrashCourse-dropout", + "developer": "agentlans", + "scores": { + "IFEval": 0.2949, + "BBH": 0.3312, + "MATH Level 5": 0.0423, + "GPQA": 0.2634, + "MUSR": 0.3342, + "MMLU-PRO": 0.1608 + } + }, + { + "model_id": "ahmeda335/13_outOf_32_pruned_layers_llama3.1-8b", + "name": "13_outOf_32_pruned_layers_llama3.1-8b", + "developer": "ahmeda335", + "scores": { + "IFEval": 0.1748, + "BBH": 0.2883, + "MATH Level 5": 0.0, + "GPQA": 0.2592, + "MUSR": 0.3803, + "MMLU-PRO": 0.1129 + } + }, + { + "model_id": "ai21labs/Jamba-v0.1", + "name": "Jamba-v0.1", + "developer": "ai21labs", + "scores": { + "IFEval": 0.2026, + "BBH": 0.3602, + "MATH Level 5": 0.0159, + "GPQA": 0.2685, + "MUSR": 0.359, + "MMLU-PRO": 0.2492 + } + }, + { + "model_id": "ai4bharat/Airavata", + "name": "Airavata", + "developer": "ai4bharat", + "scores": { + "IFEval": 0.0559, + "BBH": 0.3628, + "MATH Level 5": 0.0181, + "GPQA": 0.2743, + "MUSR": 0.3763, + "MMLU-PRO": 0.1635 + } + }, + { + "model_id": "aixonlab/Aether-12b", + "name": "Aether-12b", + "developer": "aixonlab", + "scores": { + "IFEval": 0.2347, + "BBH": 0.5179, + "MATH Level 5": 0.1065, + "GPQA": 0.3163, + "MUSR": 0.3829, + "MMLU-PRO": 0.341 + } + }, + { + "model_id": "aixonlab/Grey-12b", + "name": "Grey-12b", + "developer": "aixonlab", + "scores": { + "IFEval": 0.3968, + "BBH": 0.5699, + "MATH Level 5": 0.0982, + "GPQA": 0.3003, + "MUSR": 0.4516, + "MMLU-PRO": 0.3779 + } + }, + { + "model_id": "aixonlab/Zara-14b-v1.2", + "name": "Zara-14b-v1.2", + "developer": "aixonlab", + "scores": { + "IFEval": 0.6197, + "BBH": 0.6405, + "MATH Level 5": 0.3535, + "GPQA": 0.3817, + "MUSR": 0.4675, + "MMLU-PRO": 0.5263 + } + }, + { + "model_id": "akhadangi/Llama3.2.1B.0.01-First", + "name": "Llama3.2.1B.0.01-First", + "developer": "akhadangi", + "scores": { + "IFEval": 0.0814, + "BBH": 0.3189, + "MATH Level 5": 0.0181, + "GPQA": 0.2483, + "MUSR": 0.3194, + "MMLU-PRO": 0.1197 + } + }, + { + "model_id": "akhadangi/Llama3.2.1B.0.01-Last", + "name": "Llama3.2.1B.0.01-Last", + "developer": "akhadangi", + "scores": { + "IFEval": 0.0917, + "BBH": 0.3159, + "MATH Level 5": 0.0136, + "GPQA": 0.2433, + "MUSR": 0.3206, + "MMLU-PRO": 0.1227 + } + }, + { + "model_id": "akhadangi/Llama3.2.1B.0.1-First", + "name": "Llama3.2.1B.0.1-First", + "developer": "akhadangi", + "scores": { + "IFEval": 0.1001, + "BBH": 0.312, + "MATH Level 5": 0.0211, + "GPQA": 0.245, + "MUSR": 0.3301, + "MMLU-PRO": 0.1169 + } + }, + { + "model_id": "akhadangi/Llama3.2.1B.0.1-Last", + "name": "Llama3.2.1B.0.1-Last", + "developer": "akhadangi", + "scores": { + "IFEval": 0.095, + "BBH": 0.3164, + "MATH Level 5": 0.0211, + "GPQA": 0.2383, + "MUSR": 0.3341, + "MMLU-PRO": 0.1178 + } + }, + { + "model_id": "akhadangi/Llama3.2.1B.BaseFiT", + "name": "Llama3.2.1B.BaseFiT", + "developer": "akhadangi", + "scores": { + "IFEval": 0.0883, + "BBH": 0.3175, + "MATH Level 5": 0.0242, + "GPQA": 0.2534, + "MUSR": 0.3221, + "MMLU-PRO": 0.1172 + } + }, + { + "model_id": "akjindal53244/Llama-3.1-Storm-8B", + "name": "Llama-3.1-Storm-8B", + "developer": "akjindal53244", + "scores": { + "IFEval": 0.8033, + "BBH": 0.5196, + "MATH Level 5": 0.1624, + "GPQA": 0.3096, + "MUSR": 0.4028, + "MMLU-PRO": 0.3812 + } + }, + { + "model_id": "alcholjung/llama3_medical_tuned", + "name": "llama3_medical_tuned", + "developer": "alcholjung", + "scores": { + "IFEval": 0.0106, + "BBH": 0.4513, + "MATH Level 5": 0.0468, + "GPQA": 0.2861, + "MUSR": 0.466, + "MMLU-PRO": 0.2946 + } + }, + { + "model_id": "allenai/Llama-3.1-Tulu-3-70B", + "name": "Llama-3.1-Tulu-3-70B", + "developer": "allenai", + "scores": { + "IFEval": 0.8379, + "BBH": 0.6157, + "MATH Level 5": 0.3829, + "GPQA": 0.3733, + "MUSR": 0.4988, + "MMLU-PRO": 0.4656 + } + }, + { + "model_id": "allenai/Llama-3.1-Tulu-3-70B-DPO", + "name": "Llama-3.1-Tulu-3-70B-DPO", + "developer": "allenai", + "scores": { + "IFEval": 0.8282, + "BBH": 0.6146, + "MATH Level 5": 0.4494, + "GPQA": 0.3758, + "MUSR": 0.4923, + "MMLU-PRO": 0.4633 + } + }, + { + "model_id": "allenai/Llama-3.1-Tulu-3-70B-SFT", + "name": "Llama-3.1-Tulu-3-70B-SFT", + "developer": "allenai", + "scores": { + "IFEval": 0.8051, + "BBH": 0.5951, + "MATH Level 5": 0.3316, + "GPQA": 0.3448, + "MUSR": 0.5026, + "MMLU-PRO": 0.4624 + } + }, + { + "model_id": "allenai/Llama-3.1-Tulu-3-8B", + "name": "Llama-3.1-Tulu-3-8B", + "developer": "allenai", + "scores": { + "IFEval": 0.8267, + "BBH": 0.405, + "MATH Level 5": 0.1964, + "GPQA": 0.2987, + "MUSR": 0.4175, + "MMLU-PRO": 0.2827 + } + }, + { + "model_id": "allenai/Llama-3.1-Tulu-3-8B-DPO", + "name": "Llama-3.1-Tulu-3-8B-DPO", + "developer": "allenai", + "scores": { + "IFEval": 0.8029, + "BBH": 0.4079, + "MATH Level 5": 0.2364, + "GPQA": 0.2936, + "MUSR": 0.4161, + "MMLU-PRO": 0.2898 + } + }, + { + "model_id": "allenai/Llama-3.1-Tulu-3-8B-RM", + "name": "Llama-3.1-Tulu-3-8B-RM", + "developer": "allenai", + "scores": { + "IFEval": 0.167, + "BBH": 0.295, + "MATH Level 5": 0.0, + "GPQA": 0.2567, + "MUSR": 0.3764, + "MMLU-PRO": 0.1082 + } + }, + { + "model_id": "allenai/Llama-3.1-Tulu-3-8B-SFT", + "name": "Llama-3.1-Tulu-3-8B-SFT", + "developer": "allenai", + "scores": { + "IFEval": 0.7403, + "BBH": 0.3872, + "MATH Level 5": 0.1178, + "GPQA": 0.2777, + "MUSR": 0.4268, + "MMLU-PRO": 0.2812 + } + }, + { + "model_id": "allenai/OLMo-1.7-7B-hf", + "name": "OLMo-1.7-7B-hf", + "developer": "allenai", + "scores": { + "IFEval": 0.1569, + "BBH": 0.3014, + "MATH Level 5": 0.0023, + "GPQA": 0.255, + "MUSR": 0.3475, + "MMLU-PRO": 0.1124 + } + }, + { + "model_id": "allenai/OLMo-1B-hf", + "name": "OLMo-1B-hf", + "developer": "allenai", + "scores": { + "IFEval": 0.2182, + "BBH": 0.3052, + "MATH Level 5": 0.0174, + "GPQA": 0.2617, + "MUSR": 0.4098, + "MMLU-PRO": 0.1174 + } + }, + { + "model_id": "allenai/OLMo-2-1124-7B-Instruct", + "name": "OLMo-2-1124-7B-Instruct", + "developer": "allenai", + "scores": { + "IFEval": 0.7244, + "BBH": 0.4022, + "MATH Level 5": 0.1488, + "GPQA": 0.2785, + "MUSR": 0.3508, + "MMLU-PRO": 0.2672 + } + }, + { + "model_id": "allenai/OLMo-7B-Instruct-hf", + "name": "OLMo-7B-Instruct-hf", + "developer": "allenai", + "scores": { + "IFEval": 0.3473, + "BBH": 0.3706, + "MATH Level 5": 0.0136, + "GPQA": 0.271, + "MUSR": 0.3765, + "MMLU-PRO": 0.1785 + } + }, + { + "model_id": "allenai/OLMo-7B-hf", + "name": "OLMo-7B-hf", + "developer": "allenai", + "scores": { + "IFEval": 0.2719, + "BBH": 0.3279, + "MATH Level 5": 0.0121, + "GPQA": 0.2727, + "MUSR": 0.3487, + "MMLU-PRO": 0.1173 + } + }, + { + "model_id": "allenai/OLMoE-1B-7B-0125-Instruct", + "name": "OLMoE-1B-7B-0125-Instruct", + "developer": "allenai", + "scores": { + "IFEval": 0.6757, + "BBH": 0.3825, + "MATH Level 5": 0.0899, + "GPQA": 0.2601, + "MUSR": 0.3636, + "MMLU-PRO": 0.1915 + } + }, + { + "model_id": "allenai/OLMoE-1B-7B-0924", + "name": "OLMoE-1B-7B-0924", + "developer": "allenai", + "scores": { + "IFEval": 0.2185, + "BBH": 0.3393, + "MATH Level 5": 0.0166, + "GPQA": 0.2475, + "MUSR": 0.3488, + "MMLU-PRO": 0.174 + } + }, + { + "model_id": "allenai/OLMoE-1B-7B-0924-Instruct", + "name": "OLMoE-1B-7B-0924-Instruct", + "developer": "allenai", + "scores": { + "IFEval": 0.4667, + "BBH": 0.3902, + "MATH Level 5": 0.0279, + "GPQA": 0.2676, + "MUSR": 0.3848, + "MMLU-PRO": 0.1876 + } + }, + { + "model_id": "allknowingroger/Chocolatine-24B", + "name": "Chocolatine-24B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1958, + "BBH": 0.6191, + "MATH Level 5": 0.0008, + "GPQA": 0.3255, + "MUSR": 0.4323, + "MMLU-PRO": 0.4566 + } + }, + { + "model_id": "allknowingroger/Gemma2Slerp1-2.6B", + "name": "Gemma2Slerp1-2.6B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.5354, + "BBH": 0.4343, + "MATH Level 5": 0.1065, + "GPQA": 0.2836, + "MUSR": 0.4562, + "MMLU-PRO": 0.2689 + } + }, + { + "model_id": "allknowingroger/Gemma2Slerp1-27B", + "name": "Gemma2Slerp1-27B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.7186, + "BBH": 0.6399, + "MATH Level 5": 0.2583, + "GPQA": 0.3641, + "MUSR": 0.4767, + "MMLU-PRO": 0.4456 + } + }, + { + "model_id": "allknowingroger/Gemma2Slerp2-2.6B", + "name": "Gemma2Slerp2-2.6B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.5747, + "BBH": 0.4308, + "MATH Level 5": 0.0906, + "GPQA": 0.3054, + "MUSR": 0.4468, + "MMLU-PRO": 0.2696 + } + }, + { + "model_id": "allknowingroger/Gemma2Slerp2-27B", + "name": "Gemma2Slerp2-27B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.7546, + "BBH": 0.6557, + "MATH Level 5": 0.2787, + "GPQA": 0.37, + "MUSR": 0.4621, + "MMLU-PRO": 0.4623 + } + }, + { + "model_id": "allknowingroger/Gemma2Slerp3-27B", + "name": "Gemma2Slerp3-27B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.7426, + "BBH": 0.65, + "MATH Level 5": 0.2742, + "GPQA": 0.3549, + "MUSR": 0.474, + "MMLU-PRO": 0.4641 + } + }, + { + "model_id": "allknowingroger/Gemma2Slerp4-27B", + "name": "Gemma2Slerp4-27B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.7497, + "BBH": 0.653, + "MATH Level 5": 0.2719, + "GPQA": 0.3666, + "MUSR": 0.4502, + "MMLU-PRO": 0.4649 + } + }, + { + "model_id": "allknowingroger/GemmaSlerp-9B", + "name": "GemmaSlerp-9B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.7043, + "BBH": 0.5921, + "MATH Level 5": 0.216, + "GPQA": 0.344, + "MUSR": 0.4673, + "MMLU-PRO": 0.4161 + } + }, + { + "model_id": "allknowingroger/GemmaSlerp2-9B", + "name": "GemmaSlerp2-9B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.7281, + "BBH": 0.5983, + "MATH Level 5": 0.2107, + "GPQA": 0.3523, + "MUSR": 0.4767, + "MMLU-PRO": 0.4239 + } + }, + { + "model_id": "allknowingroger/GemmaSlerp4-10B", + "name": "GemmaSlerp4-10B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.7326, + "BBH": 0.6028, + "MATH Level 5": 0.2243, + "GPQA": 0.3532, + "MUSR": 0.454, + "MMLU-PRO": 0.425 + } + }, + { + "model_id": "allknowingroger/GemmaSlerp5-10B", + "name": "GemmaSlerp5-10B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.7353, + "BBH": 0.6054, + "MATH Level 5": 0.2183, + "GPQA": 0.3523, + "MUSR": 0.4608, + "MMLU-PRO": 0.4328 + } + }, + { + "model_id": "allknowingroger/GemmaStock1-27B", + "name": "GemmaStock1-27B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.7509, + "BBH": 0.6566, + "MATH Level 5": 0.2636, + "GPQA": 0.3641, + "MUSR": 0.4527, + "MMLU-PRO": 0.473 + } + }, + { + "model_id": "allknowingroger/HomerSlerp1-7B", + "name": "HomerSlerp1-7B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4621, + "BBH": 0.5518, + "MATH Level 5": 0.2719, + "GPQA": 0.318, + "MUSR": 0.4359, + "MMLU-PRO": 0.4504 + } + }, + { + "model_id": "allknowingroger/HomerSlerp2-7B", + "name": "HomerSlerp2-7B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4487, + "BBH": 0.5649, + "MATH Level 5": 0.2968, + "GPQA": 0.3196, + "MUSR": 0.4356, + "MMLU-PRO": 0.4515 + } + }, + { + "model_id": "allknowingroger/HomerSlerp3-7B", + "name": "HomerSlerp3-7B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4363, + "BBH": 0.5598, + "MATH Level 5": 0.3021, + "GPQA": 0.3171, + "MUSR": 0.4462, + "MMLU-PRO": 0.4535 + } + }, + { + "model_id": "allknowingroger/HomerSlerp4-7B", + "name": "HomerSlerp4-7B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4374, + "BBH": 0.5571, + "MATH Level 5": 0.327, + "GPQA": 0.3196, + "MUSR": 0.4408, + "MMLU-PRO": 0.4472 + } + }, + { + "model_id": "allknowingroger/LimyQstar-7B-slerp", + "name": "LimyQstar-7B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.3491, + "BBH": 0.5024, + "MATH Level 5": 0.0687, + "GPQA": 0.2987, + "MUSR": 0.4146, + "MMLU-PRO": 0.3103 + } + }, + { + "model_id": "allknowingroger/Llama3.1-60B", + "name": "Llama3.1-60B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1815, + "BBH": 0.3242, + "MATH Level 5": 0.0, + "GPQA": 0.2945, + "MUSR": 0.3596, + "MMLU-PRO": 0.331 + } + }, + { + "model_id": "allknowingroger/Marco-01-slerp1-7B", + "name": "Marco-01-slerp1-7B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4681, + "BBH": 0.5541, + "MATH Level 5": 0.3157, + "GPQA": 0.3171, + "MUSR": 0.4452, + "MMLU-PRO": 0.4483 + } + }, + { + "model_id": "allknowingroger/Meme-7B-slerp", + "name": "Meme-7B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.5164, + "BBH": 0.4661, + "MATH Level 5": 0.0438, + "GPQA": 0.2861, + "MUSR": 0.4223, + "MMLU-PRO": 0.281 + } + }, + { + "model_id": "allknowingroger/Ministral-8B-slerp", + "name": "Ministral-8B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1961, + "BBH": 0.4686, + "MATH Level 5": 0.0038, + "GPQA": 0.3121, + "MUSR": 0.4285, + "MMLU-PRO": 0.3119 + } + }, + { + "model_id": "allknowingroger/MistralPhi3-11B", + "name": "MistralPhi3-11B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1943, + "BBH": 0.6234, + "MATH Level 5": 0.0, + "GPQA": 0.3322, + "MUSR": 0.4267, + "MMLU-PRO": 0.4688 + } + }, + { + "model_id": "allknowingroger/Mistralmash1-7B-s", + "name": "Mistralmash1-7B-s", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.3961, + "BBH": 0.5277, + "MATH Level 5": 0.0921, + "GPQA": 0.2945, + "MUSR": 0.4267, + "MMLU-PRO": 0.3293 + } + }, + { + "model_id": "allknowingroger/Mistralmash2-7B-s", + "name": "Mistralmash2-7B-s", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4102, + "BBH": 0.5305, + "MATH Level 5": 0.0793, + "GPQA": 0.2978, + "MUSR": 0.4372, + "MMLU-PRO": 0.3345 + } + }, + { + "model_id": "allknowingroger/MixTAO-19B-pass", + "name": "MixTAO-19B-pass", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.3814, + "BBH": 0.5128, + "MATH Level 5": 0.0612, + "GPQA": 0.2844, + "MUSR": 0.4783, + "MMLU-PRO": 0.3105 + } + }, + { + "model_id": "allknowingroger/MixTaoTruthful-13B-slerp", + "name": "MixTaoTruthful-13B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4139, + "BBH": 0.5207, + "MATH Level 5": 0.0665, + "GPQA": 0.2844, + "MUSR": 0.4292, + "MMLU-PRO": 0.31 + } + }, + { + "model_id": "allknowingroger/MultiCalm-7B-slerp", + "name": "MultiCalm-7B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.3927, + "BBH": 0.5122, + "MATH Level 5": 0.0619, + "GPQA": 0.2827, + "MUSR": 0.4319, + "MMLU-PRO": 0.3033 + } + }, + { + "model_id": "allknowingroger/MultiMash-12B-slerp", + "name": "MultiMash-12B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.3974, + "BBH": 0.5142, + "MATH Level 5": 0.0808, + "GPQA": 0.2768, + "MUSR": 0.4438, + "MMLU-PRO": 0.3068 + } + }, + { + "model_id": "allknowingroger/MultiMash10-13B-slerp", + "name": "MultiMash10-13B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4163, + "BBH": 0.5186, + "MATH Level 5": 0.0718, + "GPQA": 0.2861, + "MUSR": 0.4318, + "MMLU-PRO": 0.3117 + } + }, + { + "model_id": "allknowingroger/MultiMash11-13B-slerp", + "name": "MultiMash11-13B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4251, + "BBH": 0.5194, + "MATH Level 5": 0.0702, + "GPQA": 0.2827, + "MUSR": 0.4373, + "MMLU-PRO": 0.3085 + } + }, + { + "model_id": "allknowingroger/MultiMash2-12B-slerp", + "name": "MultiMash2-12B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4261, + "BBH": 0.5134, + "MATH Level 5": 0.0642, + "GPQA": 0.2794, + "MUSR": 0.4228, + "MMLU-PRO": 0.3043 + } + }, + { + "model_id": "allknowingroger/MultiMash5-12B-slerp", + "name": "MultiMash5-12B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4142, + "BBH": 0.5145, + "MATH Level 5": 0.0634, + "GPQA": 0.2777, + "MUSR": 0.4203, + "MMLU-PRO": 0.3028 + } + }, + { + "model_id": "allknowingroger/MultiMash6-12B-slerp", + "name": "MultiMash6-12B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.43, + "BBH": 0.5196, + "MATH Level 5": 0.0725, + "GPQA": 0.2743, + "MUSR": 0.4306, + "MMLU-PRO": 0.3091 + } + }, + { + "model_id": "allknowingroger/MultiMash7-12B-slerp", + "name": "MultiMash7-12B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4213, + "BBH": 0.5111, + "MATH Level 5": 0.0695, + "GPQA": 0.2785, + "MUSR": 0.4279, + "MMLU-PRO": 0.3029 + } + }, + { + "model_id": "allknowingroger/MultiMash8-13B-slerp", + "name": "MultiMash8-13B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4321, + "BBH": 0.5178, + "MATH Level 5": 0.077, + "GPQA": 0.2886, + "MUSR": 0.4424, + "MMLU-PRO": 0.3126 + } + }, + { + "model_id": "allknowingroger/MultiMash9-13B-slerp", + "name": "MultiMash9-13B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4188, + "BBH": 0.5194, + "MATH Level 5": 0.0785, + "GPQA": 0.2802, + "MUSR": 0.4398, + "MMLU-PRO": 0.31 + } + }, + { + "model_id": "allknowingroger/MultiMerge-7B-slerp", + "name": "MultiMerge-7B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.3948, + "BBH": 0.514, + "MATH Level 5": 0.0665, + "GPQA": 0.2827, + "MUSR": 0.428, + "MMLU-PRO": 0.3037 + } + }, + { + "model_id": "allknowingroger/Multimash3-12B-slerp", + "name": "Multimash3-12B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4437, + "BBH": 0.5177, + "MATH Level 5": 0.0627, + "GPQA": 0.2802, + "MUSR": 0.4344, + "MMLU-PRO": 0.3068 + } + }, + { + "model_id": "allknowingroger/Multimerge-19B-pass", + "name": "Multimerge-19B-pass", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1773, + "BBH": 0.2892, + "MATH Level 5": 0.0, + "GPQA": 0.2592, + "MUSR": 0.343, + "MMLU-PRO": 0.1169 + } + }, + { + "model_id": "allknowingroger/MultiverseEx26-7B-slerp", + "name": "MultiverseEx26-7B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.3939, + "BBH": 0.5134, + "MATH Level 5": 0.0755, + "GPQA": 0.2827, + "MUSR": 0.4293, + "MMLU-PRO": 0.3035 + } + }, + { + "model_id": "allknowingroger/NeuralWestSeverus-7B-slerp", + "name": "NeuralWestSeverus-7B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4136, + "BBH": 0.5244, + "MATH Level 5": 0.0733, + "GPQA": 0.271, + "MUSR": 0.4529, + "MMLU-PRO": 0.3137 + } + }, + { + "model_id": "allknowingroger/Neuralcoven-7B-slerp", + "name": "Neuralcoven-7B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.3859, + "BBH": 0.5303, + "MATH Level 5": 0.0785, + "GPQA": 0.2852, + "MUSR": 0.429, + "MMLU-PRO": 0.3294 + } + }, + { + "model_id": "allknowingroger/Neuralmultiverse-7B-slerp", + "name": "Neuralmultiverse-7B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.3769, + "BBH": 0.5166, + "MATH Level 5": 0.065, + "GPQA": 0.2844, + "MUSR": 0.428, + "MMLU-PRO": 0.3042 + } + }, + { + "model_id": "allknowingroger/Ph3della5-14B", + "name": "Ph3della5-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4799, + "BBH": 0.6332, + "MATH Level 5": 0.1767, + "GPQA": 0.3423, + "MUSR": 0.4386, + "MMLU-PRO": 0.4787 + } + }, + { + "model_id": "allknowingroger/Ph3merge-14B", + "name": "Ph3merge-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.2701, + "BBH": 0.6381, + "MATH Level 5": 0.0106, + "GPQA": 0.3381, + "MUSR": 0.4334, + "MMLU-PRO": 0.4611 + } + }, + { + "model_id": "allknowingroger/Ph3merge2-14B", + "name": "Ph3merge2-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1706, + "BBH": 0.3607, + "MATH Level 5": 0.0, + "GPQA": 0.2911, + "MUSR": 0.3911, + "MMLU-PRO": 0.1723 + } + }, + { + "model_id": "allknowingroger/Ph3merge3-14B", + "name": "Ph3merge3-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1645, + "BBH": 0.3597, + "MATH Level 5": 0.0, + "GPQA": 0.2852, + "MUSR": 0.4082, + "MMLU-PRO": 0.1647 + } + }, + { + "model_id": "allknowingroger/Ph3task1-14B", + "name": "Ph3task1-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4695, + "BBH": 0.6318, + "MATH Level 5": 0.1669, + "GPQA": 0.3507, + "MUSR": 0.4508, + "MMLU-PRO": 0.4734 + } + }, + { + "model_id": "allknowingroger/Ph3task2-14B", + "name": "Ph3task2-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4713, + "BBH": 0.6098, + "MATH Level 5": 0.1465, + "GPQA": 0.3305, + "MUSR": 0.4535, + "MMLU-PRO": 0.446 + } + }, + { + "model_id": "allknowingroger/Ph3task3-14B", + "name": "Ph3task3-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4962, + "BBH": 0.6298, + "MATH Level 5": 0.176, + "GPQA": 0.3414, + "MUSR": 0.4426, + "MMLU-PRO": 0.4771 + } + }, + { + "model_id": "allknowingroger/Ph3unsloth-3B-slerp", + "name": "Ph3unsloth-3B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1894, + "BBH": 0.5468, + "MATH Level 5": 0.1012, + "GPQA": 0.3247, + "MUSR": 0.4528, + "MMLU-PRO": 0.3701 + } + }, + { + "model_id": "allknowingroger/Phi3mash1-17B-pass", + "name": "Phi3mash1-17B-pass", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1884, + "BBH": 0.6129, + "MATH Level 5": 0.0, + "GPQA": 0.3196, + "MUSR": 0.4451, + "MMLU-PRO": 0.4589 + } + }, + { + "model_id": "allknowingroger/Quen2-65B", + "name": "Quen2-65B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1758, + "BBH": 0.2757, + "MATH Level 5": 0.0, + "GPQA": 0.2357, + "MUSR": 0.3209, + "MMLU-PRO": 0.1114 + } + }, + { + "model_id": "allknowingroger/Qwen2.5-42B-AGI", + "name": "Qwen2.5-42B-AGI", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1913, + "BBH": 0.2942, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.362, + "MMLU-PRO": 0.1168 + } + }, + { + "model_id": "allknowingroger/Qwen2.5-7B-task2", + "name": "Qwen2.5-7B-task2", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4527, + "BBH": 0.5626, + "MATH Level 5": 0.355, + "GPQA": 0.3163, + "MUSR": 0.437, + "MMLU-PRO": 0.4517 + } + }, + { + "model_id": "allknowingroger/Qwen2.5-7B-task3", + "name": "Qwen2.5-7B-task3", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.5129, + "BBH": 0.5398, + "MATH Level 5": 0.2606, + "GPQA": 0.3171, + "MUSR": 0.4356, + "MMLU-PRO": 0.4501 + } + }, + { + "model_id": "allknowingroger/Qwen2.5-7B-task4", + "name": "Qwen2.5-7B-task4", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.5005, + "BBH": 0.5583, + "MATH Level 5": 0.3112, + "GPQA": 0.3205, + "MUSR": 0.4395, + "MMLU-PRO": 0.4561 + } + }, + { + "model_id": "allknowingroger/Qwen2.5-7B-task7", + "name": "Qwen2.5-7B-task7", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4284, + "BBH": 0.5552, + "MATH Level 5": 0.065, + "GPQA": 0.3205, + "MUSR": 0.4326, + "MMLU-PRO": 0.4133 + } + }, + { + "model_id": "allknowingroger/Qwen2.5-7B-task8", + "name": "Qwen2.5-7B-task8", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4645, + "BBH": 0.5525, + "MATH Level 5": 0.3527, + "GPQA": 0.3205, + "MUSR": 0.4514, + "MMLU-PRO": 0.4433 + } + }, + { + "model_id": "allknowingroger/Qwen2.5-slerp-14B", + "name": "Qwen2.5-slerp-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4928, + "BBH": 0.6512, + "MATH Level 5": 0.4622, + "GPQA": 0.3674, + "MUSR": 0.4744, + "MMLU-PRO": 0.5379 + } + }, + { + "model_id": "allknowingroger/QwenSlerp12-7B", + "name": "QwenSlerp12-7B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.5076, + "BBH": 0.5556, + "MATH Level 5": 0.2946, + "GPQA": 0.3154, + "MUSR": 0.4595, + "MMLU-PRO": 0.4461 + } + }, + { + "model_id": "allknowingroger/QwenSlerp4-14B", + "name": "QwenSlerp4-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.6328, + "BBH": 0.6483, + "MATH Level 5": 0.3693, + "GPQA": 0.3725, + "MUSR": 0.465, + "MMLU-PRO": 0.5436 + } + }, + { + "model_id": "allknowingroger/QwenSlerp5-14B", + "name": "QwenSlerp5-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.7119, + "BBH": 0.6357, + "MATH Level 5": 0.3565, + "GPQA": 0.3649, + "MUSR": 0.4675, + "MMLU-PRO": 0.5391 + } + }, + { + "model_id": "allknowingroger/QwenSlerp6-14B", + "name": "QwenSlerp6-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.6867, + "BBH": 0.6384, + "MATH Level 5": 0.3724, + "GPQA": 0.3733, + "MUSR": 0.469, + "MMLU-PRO": 0.5406 + } + }, + { + "model_id": "allknowingroger/QwenStock1-14B", + "name": "QwenStock1-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.5634, + "BBH": 0.6528, + "MATH Level 5": 0.3769, + "GPQA": 0.3767, + "MUSR": 0.473, + "MMLU-PRO": 0.5418 + } + }, + { + "model_id": "allknowingroger/QwenStock2-14B", + "name": "QwenStock2-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.5563, + "BBH": 0.6569, + "MATH Level 5": 0.3882, + "GPQA": 0.3792, + "MUSR": 0.4756, + "MMLU-PRO": 0.5406 + } + }, + { + "model_id": "allknowingroger/QwenStock3-14B", + "name": "QwenStock3-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.5615, + "BBH": 0.6565, + "MATH Level 5": 0.3776, + "GPQA": 0.3784, + "MUSR": 0.4756, + "MMLU-PRO": 0.5428 + } + }, + { + "model_id": "allknowingroger/Qwenslerp2-14B", + "name": "Qwenslerp2-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.5007, + "BBH": 0.6555, + "MATH Level 5": 0.4456, + "GPQA": 0.3683, + "MUSR": 0.4729, + "MMLU-PRO": 0.5403 + } + }, + { + "model_id": "allknowingroger/Qwenslerp2-7B", + "name": "Qwenslerp2-7B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.5294, + "BBH": 0.5609, + "MATH Level 5": 0.3421, + "GPQA": 0.3129, + "MUSR": 0.4356, + "MMLU-PRO": 0.4515 + } + }, + { + "model_id": "allknowingroger/Qwenslerp3-14B", + "name": "Qwenslerp3-14B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.5052, + "BBH": 0.6521, + "MATH Level 5": 0.4464, + "GPQA": 0.375, + "MUSR": 0.4676, + "MMLU-PRO": 0.5395 + } + }, + { + "model_id": "allknowingroger/Qwenslerp3-7B", + "name": "Qwenslerp3-7B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.5018, + "BBH": 0.558, + "MATH Level 5": 0.3218, + "GPQA": 0.3247, + "MUSR": 0.4515, + "MMLU-PRO": 0.4542 + } + }, + { + "model_id": "allknowingroger/ROGERphi-7B-slerp", + "name": "ROGERphi-7B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.3861, + "BBH": 0.5196, + "MATH Level 5": 0.0733, + "GPQA": 0.2886, + "MUSR": 0.4685, + "MMLU-PRO": 0.3053 + } + }, + { + "model_id": "allknowingroger/RogerMerge-7B-slerp", + "name": "RogerMerge-7B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.3933, + "BBH": 0.516, + "MATH Level 5": 0.0687, + "GPQA": 0.2802, + "MUSR": 0.432, + "MMLU-PRO": 0.303 + } + }, + { + "model_id": "allknowingroger/Rombos-LLM-V2.5-Qwen-42b", + "name": "Rombos-LLM-V2.5-Qwen-42b", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1879, + "BBH": 0.2969, + "MATH Level 5": 0.0, + "GPQA": 0.2626, + "MUSR": 0.3633, + "MMLU-PRO": 0.1168 + } + }, + { + "model_id": "allknowingroger/Strangecoven-7B-slerp", + "name": "Strangecoven-7B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.3746, + "BBH": 0.5368, + "MATH Level 5": 0.0763, + "GPQA": 0.2894, + "MUSR": 0.4199, + "MMLU-PRO": 0.3364 + } + }, + { + "model_id": "allknowingroger/Weirdslerp2-25B", + "name": "Weirdslerp2-25B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1754, + "BBH": 0.2874, + "MATH Level 5": 0.0, + "GPQA": 0.2492, + "MUSR": 0.3524, + "MMLU-PRO": 0.1128 + } + }, + { + "model_id": "allknowingroger/WestlakeMaziyar-7B-slerp", + "name": "WestlakeMaziyar-7B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4838, + "BBH": 0.5245, + "MATH Level 5": 0.0665, + "GPQA": 0.3037, + "MUSR": 0.4474, + "MMLU-PRO": 0.3078 + } + }, + { + "model_id": "allknowingroger/YamMaths-7B-slerp", + "name": "YamMaths-7B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4148, + "BBH": 0.5156, + "MATH Level 5": 0.0853, + "GPQA": 0.2802, + "MUSR": 0.4384, + "MMLU-PRO": 0.3131 + } + }, + { + "model_id": "allknowingroger/Yi-1.5-34B", + "name": "Yi-1.5-34B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1639, + "BBH": 0.2827, + "MATH Level 5": 0.0, + "GPQA": 0.2584, + "MUSR": 0.3857, + "MMLU-PRO": 0.1095 + } + }, + { + "model_id": "allknowingroger/Yi-blossom-40B", + "name": "Yi-blossom-40B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.2009, + "BBH": 0.3215, + "MATH Level 5": 0.0, + "GPQA": 0.2743, + "MUSR": 0.3843, + "MMLU-PRO": 0.108 + } + }, + { + "model_id": "allknowingroger/Yibuddy-35B", + "name": "Yibuddy-35B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4235, + "BBH": 0.5916, + "MATH Level 5": 0.1571, + "GPQA": 0.3557, + "MUSR": 0.4505, + "MMLU-PRO": 0.4489 + } + }, + { + "model_id": "allknowingroger/Yillama-40B", + "name": "Yillama-40B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1697, + "BBH": 0.4063, + "MATH Level 5": 0.0, + "GPQA": 0.2827, + "MUSR": 0.3501, + "MMLU-PRO": 0.1981 + } + }, + { + "model_id": "allknowingroger/Yislerp-34B", + "name": "Yislerp-34B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.3692, + "BBH": 0.6159, + "MATH Level 5": 0.216, + "GPQA": 0.3582, + "MUSR": 0.4566, + "MMLU-PRO": 0.4751 + } + }, + { + "model_id": "allknowingroger/Yislerp2-34B", + "name": "Yislerp2-34B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.3999, + "BBH": 0.6246, + "MATH Level 5": 0.2296, + "GPQA": 0.3641, + "MUSR": 0.453, + "MMLU-PRO": 0.4724 + } + }, + { + "model_id": "allknowingroger/Yunconglong-13B-slerp", + "name": "Yunconglong-13B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4242, + "BBH": 0.5166, + "MATH Level 5": 0.0544, + "GPQA": 0.281, + "MUSR": 0.4161, + "MMLU-PRO": 0.3036 + } + }, + { + "model_id": "allknowingroger/limyClown-7B-slerp", + "name": "limyClown-7B-slerp", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.4017, + "BBH": 0.5148, + "MATH Level 5": 0.0687, + "GPQA": 0.281, + "MUSR": 0.4293, + "MMLU-PRO": 0.3038 + } + }, + { + "model_id": "allknowingroger/llama3-Jallabi-40B-s", + "name": "llama3-Jallabi-40B-s", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1921, + "BBH": 0.3252, + "MATH Level 5": 0.0, + "GPQA": 0.2374, + "MUSR": 0.375, + "MMLU-PRO": 0.1088 + } + }, + { + "model_id": "allknowingroger/llama3AnFeng-40B", + "name": "llama3AnFeng-40B", + "developer": "allknowingroger", + "scores": { + "IFEval": 0.1742, + "BBH": 0.3794, + "MATH Level 5": 0.0, + "GPQA": 0.3062, + "MUSR": 0.394, + "MMLU-PRO": 0.198 + } + }, + { + "model_id": "allura-org/L3.1-8b-RP-Ink", + "name": "L3.1-8b-RP-Ink", + "developer": "allura-org", + "scores": { + "IFEval": 0.7811, + "BBH": 0.4828, + "MATH Level 5": 0.148, + "GPQA": 0.2643, + "MUSR": 0.3608, + "MMLU-PRO": 0.3428 + } + }, + { + "model_id": "allura-org/MN-12b-RP-Ink", + "name": "MN-12b-RP-Ink", + "developer": "allura-org", + "scores": { + "IFEval": 0.7186, + "BBH": 0.4834, + "MATH Level 5": 0.1186, + "GPQA": 0.2852, + "MUSR": 0.3818, + "MMLU-PRO": 0.3514 + } + }, + { + "model_id": "allura-org/MS-Meadowlark-22B", + "name": "MS-Meadowlark-22B", + "developer": "allura-org", + "scores": { + "IFEval": 0.6697, + "BBH": 0.5163, + "MATH Level 5": 0.1835, + "GPQA": 0.3255, + "MUSR": 0.3843, + "MMLU-PRO": 0.3823 + } + }, + { + "model_id": "allura-org/Mistral-Small-24b-Sertraline-0304", + "name": "Mistral-Small-24b-Sertraline-0304", + "developer": "allura-org", + "scores": { + "IFEval": 0.68, + "BBH": 0.6525, + "MATH Level 5": 0.2228, + "GPQA": 0.3515, + "MUSR": 0.4395, + "MMLU-PRO": 0.5106 + } + }, + { + "model_id": "allura-org/Mistral-Small-Sisyphus-24b-2503", + "name": "Mistral-Small-Sisyphus-24b-2503", + "developer": "allura-org", + "scores": { + "IFEval": 0.6848, + "BBH": 0.627, + "MATH Level 5": 0.25, + "GPQA": 0.2626, + "MUSR": 0.3977, + "MMLU-PRO": 0.5127 + } + }, + { + "model_id": "allura-org/MoE-Girl-1BA-7BT", + "name": "MoE-Girl-1BA-7BT", + "developer": "allura-org", + "scores": { + "IFEval": 0.2705, + "BBH": 0.3139, + "MATH Level 5": 0.0151, + "GPQA": 0.2584, + "MUSR": 0.3436, + "MMLU-PRO": 0.1218 + } + }, + { + "model_id": "allura-org/TQ2.5-14B-Aletheia-v1", + "name": "TQ2.5-14B-Aletheia-v1", + "developer": "allura-org", + "scores": { + "IFEval": 0.753, + "BBH": 0.6585, + "MATH Level 5": 0.3399, + "GPQA": 0.3624, + "MUSR": 0.4452, + "MMLU-PRO": 0.5241 + } + }, + { + "model_id": "allura-org/TQ2.5-14B-Neon-v1", + "name": "TQ2.5-14B-Neon-v1", + "developer": "allura-org", + "scores": { + "IFEval": 0.6754, + "BBH": 0.6553, + "MATH Level 5": 0.3603, + "GPQA": 0.3716, + "MUSR": 0.461, + "MMLU-PRO": 0.5253 + } + }, + { + "model_id": "allura-org/Teleut-7b", + "name": "Teleut-7b", + "developer": "allura-org", + "scores": { + "IFEval": 0.6379, + "BBH": 0.5141, + "MATH Level 5": 0.2409, + "GPQA": 0.3263, + "MUSR": 0.464, + "MMLU-PRO": 0.4131 + } + }, + { + "model_id": "aloobun/Meta-Llama-3-7B-28Layers", + "name": "Meta-Llama-3-7B-28Layers", + "developer": "aloobun", + "scores": { + "IFEval": 0.1964, + "BBH": 0.4437, + "MATH Level 5": 0.0279, + "GPQA": 0.2945, + "MUSR": 0.3589, + "MMLU-PRO": 0.316 + } + }, + { + "model_id": "aloobun/d-SmolLM2-360M", + "name": "d-SmolLM2-360M", + "developer": "aloobun", + "scores": { + "IFEval": 0.2097, + "BBH": 0.3196, + "MATH Level 5": 0.0128, + "GPQA": 0.2534, + "MUSR": 0.3981, + "MMLU-PRO": 0.1169 + } + }, + { + "model_id": "alpindale/WizardLM-2-8x22B", + "name": "WizardLM-2-8x22B", + "developer": "alpindale", + "scores": { + "IFEval": 0.5272, + "BBH": 0.6377, + "MATH Level 5": 0.25, + "GPQA": 0.3817, + "MUSR": 0.4387, + "MMLU-PRO": 0.4596 + } + }, + { + "model_id": "alpindale/magnum-72b-v1", + "name": "magnum-72b-v1", + "developer": "alpindale", + "scores": { + "IFEval": 0.7606, + "BBH": 0.6982, + "MATH Level 5": 0.398, + "GPQA": 0.3909, + "MUSR": 0.4489, + "MMLU-PRO": 0.5468 + } + }, + { + "model_id": "altomek/YiSM-34B-0rn", + "name": "YiSM-34B-0rn", + "developer": "altomek", + "scores": { + "IFEval": 0.4284, + "BBH": 0.614, + "MATH Level 5": 0.2281, + "GPQA": 0.3716, + "MUSR": 0.445, + "MMLU-PRO": 0.4696 + } + }, + { + "model_id": "amazon/MegaBeam-Mistral-7B-300k", + "name": "MegaBeam-Mistral-7B-300k", + "developer": "amazon", + "scores": { + "IFEval": 0.5203, + "BBH": 0.4228, + "MATH Level 5": 0.0211, + "GPQA": 0.2735, + "MUSR": 0.398, + "MMLU-PRO": 0.2549 + } + }, + { + "model_id": "amd/AMD-Llama-135m", + "name": "AMD-Llama-135m", + "developer": "amd", + "scores": { + "IFEval": 0.1918, + "BBH": 0.2969, + "MATH Level 5": 0.0076, + "GPQA": 0.2584, + "MUSR": 0.3846, + "MMLU-PRO": 0.1169 + } + }, + { + "model_id": "anakin87/gemma-2b-orpo", + "name": "gemma-2b-orpo", + "developer": "anakin87", + "scores": { + "IFEval": 0.2478, + "BBH": 0.3426, + "MATH Level 5": 0.0189, + "GPQA": 0.2617, + "MUSR": 0.3728, + "MMLU-PRO": 0.1306 + } + }, + { + "model_id": "anthracite-org/magnum-v1-72b", + "name": "magnum-v1-72b", + "developer": "anthracite-org", + "scores": { + "IFEval": 0.7606, + "BBH": 0.6982, + "MATH Level 5": 0.398, + "GPQA": 0.3909, + "MUSR": 0.4489, + "MMLU-PRO": 0.5486 + } + }, + { + "model_id": "anthracite-org/magnum-v2-12b", + "name": "magnum-v2-12b", + "developer": "anthracite-org", + "scores": { + "IFEval": 0.3762, + "BBH": 0.5021, + "MATH Level 5": 0.0544, + "GPQA": 0.2911, + "MUSR": 0.4179, + "MMLU-PRO": 0.3167 + } + }, + { + "model_id": "anthracite-org/magnum-v2-72b", + "name": "magnum-v2-72b", + "developer": "anthracite-org", + "scores": { + "IFEval": 0.756, + "BBH": 0.7005, + "MATH Level 5": 0.3542, + "GPQA": 0.3859, + "MUSR": 0.4372, + "MMLU-PRO": 0.5456 + } + }, + { + "model_id": "anthracite-org/magnum-v2.5-12b-kto", + "name": "magnum-v2.5-12b-kto", + "developer": "anthracite-org", + "scores": { + "IFEval": 0.3866, + "BBH": 0.5077, + "MATH Level 5": 0.0521, + "GPQA": 0.2936, + "MUSR": 0.4086, + "MMLU-PRO": 0.3215 + } + }, + { + "model_id": "anthracite-org/magnum-v3-27b-kto", + "name": "magnum-v3-27b-kto", + "developer": "anthracite-org", + "scores": { + "IFEval": 0.5675, + "BBH": 0.586, + "MATH Level 5": 0.1813, + "GPQA": 0.3557, + "MUSR": 0.3855, + "MMLU-PRO": 0.4238 + } + }, + { + "model_id": "anthracite-org/magnum-v3-34b", + "name": "magnum-v3-34b", + "developer": "anthracite-org", + "scores": { + "IFEval": 0.5115, + "BBH": 0.6088, + "MATH Level 5": 0.1949, + "GPQA": 0.3607, + "MUSR": 0.3872, + "MMLU-PRO": 0.4752 + } + }, + { + "model_id": "anthracite-org/magnum-v3-9b-chatml", + "name": "magnum-v3-9b-chatml", + "developer": "anthracite-org", + "scores": { + "IFEval": 0.1275, + "BBH": 0.5428, + "MATH Level 5": 0.0695, + "GPQA": 0.3456, + "MUSR": 0.4432, + "MMLU-PRO": 0.4242 + } + }, + { + "model_id": "anthracite-org/magnum-v3-9b-customgemma2", + "name": "magnum-v3-9b-customgemma2", + "developer": "anthracite-org", + "scores": { + "IFEval": 0.1273, + "BBH": 0.534, + "MATH Level 5": 0.0718, + "GPQA": 0.3289, + "MUSR": 0.4565, + "MMLU-PRO": 0.4205 + } + }, + { + "model_id": "anthracite-org/magnum-v4-12b", + "name": "magnum-v4-12b", + "developer": "anthracite-org", + "scores": { + "IFEval": 0.3393, + "BBH": 0.5177, + "MATH Level 5": 0.1178, + "GPQA": 0.2961, + "MUSR": 0.4093, + "MMLU-PRO": 0.3604 + } + }, + { + "model_id": "anthracite-org/magnum-v4-22b", + "name": "magnum-v4-22b", + "developer": "anthracite-org", + "scores": { + "IFEval": 0.5629, + "BBH": 0.5486, + "MATH Level 5": 0.2002, + "GPQA": 0.328, + "MUSR": 0.4408, + "MMLU-PRO": 0.383 + } + }, + { + "model_id": "anthracite-org/magnum-v4-27b", + "name": "magnum-v4-27b", + "developer": "anthracite-org", + "scores": { + "IFEval": 0.3454, + "BBH": 0.5867, + "MATH Level 5": 0.1798, + "GPQA": 0.37, + "MUSR": 0.438, + "MMLU-PRO": 0.4376 + } + }, + { + "model_id": "anthracite-org/magnum-v4-9b", + "name": "magnum-v4-9b", + "developer": "anthracite-org", + "scores": { + "IFEval": 0.3503, + "BBH": 0.5336, + "MATH Level 5": 0.1307, + "GPQA": 0.3473, + "MUSR": 0.4516, + "MMLU-PRO": 0.3953 + } + }, + { + "model_id": "apple/DCLM-7B", + "name": "DCLM-7B", + "developer": "apple", + "scores": { + "IFEval": 0.2173, + "BBH": 0.4232, + "MATH Level 5": 0.037, + "GPQA": 0.3154, + "MUSR": 0.3921, + "MMLU-PRO": 0.3111 + } + }, + { + "model_id": "appvoid/arco-2", + "name": "arco-2", + "developer": "appvoid", + "scores": { + "IFEval": 0.1991, + "BBH": 0.3146, + "MATH Level 5": 0.0136, + "GPQA": 0.2391, + "MUSR": 0.3536, + "MMLU-PRO": 0.1116 + } + }, + { + "model_id": "appvoid/arco-2-instruct", + "name": "arco-2-instruct", + "developer": "appvoid", + "scores": { + "IFEval": 0.2164, + "BBH": 0.3133, + "MATH Level 5": 0.0128, + "GPQA": 0.2383, + "MUSR": 0.3496, + "MMLU-PRO": 0.1113 + } + }, + { + "model_id": "arcee-ai/Arcee-Blitz", + "name": "Arcee-Blitz", + "developer": "arcee-ai", + "scores": { + "IFEval": 0.5543, + "BBH": 0.6607, + "MATH Level 5": 0.3482, + "GPQA": 0.3851, + "MUSR": 0.5047, + "MMLU-PRO": 0.6154 + } + }, + { + "model_id": "arcee-ai/Arcee-Maestro-7B-Preview", + "name": "Arcee-Maestro-7B-Preview", + "developer": "arcee-ai", + "scores": { + "IFEval": 0.275, + "BBH": 0.4648, + "MATH Level 5": 0.4992, + "GPQA": 0.3322, + "MUSR": 0.3885, + "MMLU-PRO": 0.3039 + } + }, + { + "model_id": "arcee-ai/Arcee-Nova", + "name": "Arcee-Nova", + "developer": "arcee-ai", + "scores": { + "IFEval": 0.7907, + "BBH": 0.6942, + "MATH Level 5": 0.4381, + "GPQA": 0.3851, + "MUSR": 0.4562, + "MMLU-PRO": 0.5452 + } + }, + { + "model_id": "arcee-ai/Arcee-Spark", + "name": "Arcee-Spark", + "developer": "arcee-ai", + "scores": { + "IFEval": 0.5621, + "BBH": 0.5489, + "MATH Level 5": 0.2953, + "GPQA": 0.307, + "MUSR": 0.4021, + "MMLU-PRO": 0.3822 + } + }, + { + "model_id": "arcee-ai/Llama-3.1-SuperNova-Lite", + "name": "Llama-3.1-SuperNova-Lite", + "developer": "arcee-ai", + "scores": { + "IFEval": 0.8017, + "BBH": 0.5152, + "MATH Level 5": 0.1828, + "GPQA": 0.3062, + "MUSR": 0.4163, + "MMLU-PRO": 0.3877 + } + }, + { + "model_id": "arcee-ai/Llama-Spark", + "name": "Llama-Spark", + "developer": "arcee-ai", + "scores": { + "IFEval": 0.7911, + "BBH": 0.5054, + "MATH Level 5": 0.139, + "GPQA": 0.2995, + "MUSR": 0.3593, + "MMLU-PRO": 0.3721 + } + }, + { + "model_id": "arcee-ai/SuperNova-Medius", + "name": "SuperNova-Medius", + "developer": "arcee-ai", + "scores": { + "IFEval": 0.7184, + "BBH": 0.6377, + "MATH Level 5": 0.469, + "GPQA": 0.3331, + "MUSR": 0.4233, + "MMLU-PRO": 0.5035 + } + }, + { + "model_id": "arcee-ai/Virtuoso-Lite", + "name": "Virtuoso-Lite", + "developer": "arcee-ai", + "scores": { + "IFEval": 0.81, + "BBH": 0.6099, + "MATH Level 5": 0.253, + "GPQA": 0.344, + "MUSR": 0.4595, + "MMLU-PRO": 0.4441 + } + }, + { + "model_id": "arcee-ai/Virtuoso-Small", + "name": "Virtuoso-Small", + "developer": "arcee-ai", + "scores": { + "IFEval": 0.7935, + "BBH": 0.6518, + "MATH Level 5": 0.4094, + "GPQA": 0.3364, + "MUSR": 0.4339, + "MMLU-PRO": 0.5191 + } + }, + { + "model_id": "arcee-ai/Virtuoso-Small-v2", + "name": "Virtuoso-Small-v2", + "developer": "arcee-ai", + "scores": { + "IFEval": 0.8273, + "BBH": 0.6554, + "MATH Level 5": 0.466, + "GPQA": 0.3532, + "MUSR": 0.4313, + "MMLU-PRO": 0.5188 + } + }, + { + "model_id": "arcee-ai/raspberry-3B", + "name": "raspberry-3B", + "developer": "arcee-ai", + "scores": { + "IFEval": 0.3154, + "BBH": 0.4269, + "MATH Level 5": 0.1035, + "GPQA": 0.2777, + "MUSR": 0.4123, + "MMLU-PRO": 0.2854 + } + }, + { + "model_id": "argilla-warehouse/Llama-3.1-8B-MagPie-Ultra", + "name": "Llama-3.1-8B-MagPie-Ultra", + "developer": "argilla-warehouse", + "scores": { + "IFEval": 0.5757, + "BBH": 0.462, + "MATH Level 5": 0.077, + "GPQA": 0.2668, + "MUSR": 0.3543, + "MMLU-PRO": 0.3144 + } + }, + { + "model_id": "argilla/notus-7b-v1", + "name": "notus-7b-v1", + "developer": "argilla", + "scores": { + "IFEval": 0.5082, + "BBH": 0.4512, + "MATH Level 5": 0.0317, + "GPQA": 0.2894, + "MUSR": 0.3364, + "MMLU-PRO": 0.3004 + } + }, + { + "model_id": "argilla/notux-8x7b-v1", + "name": "notux-8x7b-v1", + "developer": "argilla", + "scores": { + "IFEval": 0.5422, + "BBH": 0.5363, + "MATH Level 5": 0.0997, + "GPQA": 0.3087, + "MUSR": 0.4176, + "MMLU-PRO": 0.366 + } + }, + { + "model_id": "arisin/orca-platypus-13B-slerp", + "name": "orca-platypus-13B-slerp", + "developer": "arisin", + "scores": { + "IFEval": 0.2672, + "BBH": 0.4631, + "MATH Level 5": 0.0159, + "GPQA": 0.2987, + "MUSR": 0.4253, + "MMLU-PRO": 0.2592 + } + }, + { + "model_id": "arshiaafshani/Arsh-V1", + "name": "Arsh-V1", + "developer": "arshiaafshani", + "scores": { + "IFEval": 0.6043, + "BBH": 0.674, + "MATH Level 5": 0.2621, + "GPQA": 0.3733, + "MUSR": 0.4899, + "MMLU-PRO": 0.5257 + } + }, + { + "model_id": "asharsha30/LLAMA_Harsha_8_B_ORDP_10k", + "name": "LLAMA_Harsha_8_B_ORDP_10k", + "developer": "asharsha30", + "scores": { + "IFEval": 0.3464, + "BBH": 0.4669, + "MATH Level 5": 0.0665, + "GPQA": 0.2735, + "MUSR": 0.3697, + "MMLU-PRO": 0.281 + } + }, + { + "model_id": "ashercn97/a1-v0.0.1", + "name": "a1-v0.0.1", + "developer": "ashercn97", + "scores": { + "IFEval": 0.2198, + "BBH": 0.5188, + "MATH Level 5": 0.2145, + "GPQA": 0.3112, + "MUSR": 0.412, + "MMLU-PRO": 0.4165 + } + }, + { + "model_id": "ashercn97/a1-v002", + "name": "a1-v002", + "developer": "ashercn97", + "scores": { + "IFEval": 0.2585, + "BBH": 0.5261, + "MATH Level 5": 0.2341, + "GPQA": 0.3188, + "MUSR": 0.4159, + "MMLU-PRO": 0.4175 + } + }, + { + "model_id": "assskelad/smollm2-360M-sft_SmallThoughts", + "name": "smollm2-360M-sft_SmallThoughts", + "developer": "assskelad", + "scores": { + "IFEval": 0.2007, + "BBH": 0.315, + "MATH Level 5": 0.0166, + "GPQA": 0.2592, + "MUSR": 0.3395, + "MMLU-PRO": 0.1182 + } + }, + { + "model_id": "athirdpath/Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit", + "name": "Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit", + "developer": "athirdpath", + "scores": { + "IFEval": 0.4521, + "BBH": 0.4939, + "MATH Level 5": 0.102, + "GPQA": 0.2919, + "MUSR": 0.3864, + "MMLU-PRO": 0.3565 + } + }, + { + "model_id": "automerger/YamshadowExperiment28-7B", + "name": "YamshadowExperiment28-7B", + "developer": "automerger", + "scores": { + "IFEval": 0.407, + "BBH": 0.515, + "MATH Level 5": 0.0612, + "GPQA": 0.2869, + "MUSR": 0.4306, + "MMLU-PRO": 0.306 + } + }, + { + "model_id": "avemio/GRAG-NEMO-12B-ORPO-HESSIAN-AI", + "name": "GRAG-NEMO-12B-ORPO-HESSIAN-AI", + "developer": "avemio", + "scores": { + "IFEval": 0.0, + "BBH": 0.2607, + "MATH Level 5": 0.0, + "GPQA": 0.2592, + "MUSR": 0.3447, + "MMLU-PRO": 0.1061 + } + }, + { + "model_id": "awnr/Mistral-7B-v0.1-signtensors-1-over-2", + "name": "Mistral-7B-v0.1-signtensors-1-over-2", + "developer": "awnr", + "scores": { + "IFEval": 0.2179, + "BBH": 0.4423, + "MATH Level 5": 0.034, + "GPQA": 0.307, + "MUSR": 0.4006, + "MMLU-PRO": 0.3 + } + }, + { + "model_id": "awnr/Mistral-7B-v0.1-signtensors-1-over-4", + "name": "Mistral-7B-v0.1-signtensors-1-over-4", + "developer": "awnr", + "scores": { + "IFEval": 0.2133, + "BBH": 0.3507, + "MATH Level 5": 0.0249, + "GPQA": 0.2701, + "MUSR": 0.346, + "MMLU-PRO": 0.2311 + } + }, + { + "model_id": "awnr/Mistral-7B-v0.1-signtensors-3-over-8", + "name": "Mistral-7B-v0.1-signtensors-3-over-8", + "developer": "awnr", + "scores": { + "IFEval": 0.2394, + "BBH": 0.43, + "MATH Level 5": 0.0332, + "GPQA": 0.3037, + "MUSR": 0.3818, + "MMLU-PRO": 0.3001 + } + }, + { + "model_id": "awnr/Mistral-7B-v0.1-signtensors-5-over-16", + "name": "Mistral-7B-v0.1-signtensors-5-over-16", + "developer": "awnr", + "scores": { + "IFEval": 0.2118, + "BBH": 0.4124, + "MATH Level 5": 0.0295, + "GPQA": 0.281, + "MUSR": 0.3686, + "MMLU-PRO": 0.2958 + } + }, + { + "model_id": "awnr/Mistral-7B-v0.1-signtensors-7-over-16", + "name": "Mistral-7B-v0.1-signtensors-7-over-16", + "developer": "awnr", + "scores": { + "IFEval": 0.2294, + "BBH": 0.4316, + "MATH Level 5": 0.0385, + "GPQA": 0.3037, + "MUSR": 0.3952, + "MMLU-PRO": 0.303 + } + }, + { + "model_id": "aws-prototyping/MegaBeam-Mistral-7B-512k", + "name": "MegaBeam-Mistral-7B-512k", + "developer": "aws-prototyping", + "scores": { + "IFEval": 0.5973, + "BBH": 0.3662, + "MATH Level 5": 0.0287, + "GPQA": 0.2827, + "MUSR": 0.3994, + "MMLU-PRO": 0.2589 + } + }, + { + "model_id": "axolotl-ai-co/romulus-mistral-nemo-12b-simpo", + "name": "romulus-mistral-nemo-12b-simpo", + "developer": "axolotl-ai-co", + "scores": { + "IFEval": 0.6079, + "BBH": 0.5395, + "MATH Level 5": 0.114, + "GPQA": 0.2785, + "MUSR": 0.4233, + "MMLU-PRO": 0.3469 + } + }, + { + "model_id": "baconnier/Napoleon_24B_V0.0", + "name": "Napoleon_24B_V0.0", + "developer": "baconnier", + "scores": { + "IFEval": 0.1801, + "BBH": 0.6367, + "MATH Level 5": 0.2273, + "GPQA": 0.3792, + "MUSR": 0.442, + "MMLU-PRO": 0.504 + } + }, + { + "model_id": "baconnier/Napoleon_24B_V0.2", + "name": "Napoleon_24B_V0.2", + "developer": "baconnier", + "scores": { + "IFEval": 0.2527, + "BBH": 0.5911, + "MATH Level 5": 0.1435, + "GPQA": 0.3381, + "MUSR": 0.446, + "MMLU-PRO": 0.4357 + } + }, + { + "model_id": "baebee/7B-Cetacea", + "name": "7B-Cetacea", + "developer": "baebee", + "scores": { + "IFEval": 0.5279, + "BBH": 0.4757, + "MATH Level 5": 0.0468, + "GPQA": 0.2861, + "MUSR": 0.4136, + "MMLU-PRO": 0.2955 + } + }, + { + "model_id": "baebee/mergekit-model_stock-nzjnheg", + "name": "mergekit-model_stock-nzjnheg", + "developer": "baebee", + "scores": { + "IFEval": 0.4844, + "BBH": 0.5287, + "MATH Level 5": 0.1677, + "GPQA": 0.2802, + "MUSR": 0.3847, + "MMLU-PRO": 0.3699 + } + }, + { + "model_id": "baebee/mergekit-ties-fnjenli", + "name": "mergekit-ties-fnjenli", + "developer": "baebee", + "scores": { + "IFEval": 0.1988, + "BBH": 0.3024, + "MATH Level 5": 0.0023, + "GPQA": 0.245, + "MUSR": 0.4019, + "MMLU-PRO": 0.1129 + } + }, + { + "model_id": "bamec66557/MISCHIEVOUS-12B", + "name": "MISCHIEVOUS-12B", + "developer": "bamec66557", + "scores": { + "IFEval": 0.3852, + "BBH": 0.5405, + "MATH Level 5": 0.1276, + "GPQA": 0.3205, + "MUSR": 0.4145, + "MMLU-PRO": 0.3672 + } + }, + { + "model_id": "bamec66557/MISCHIEVOUS-12B-Mix_0.1v", + "name": "MISCHIEVOUS-12B-Mix_0.1v", + "developer": "bamec66557", + "scores": { + "IFEval": 0.3636, + "BBH": 0.5436, + "MATH Level 5": 0.1329, + "GPQA": 0.328, + "MUSR": 0.4132, + "MMLU-PRO": 0.3674 + } + }, + { + "model_id": "bamec66557/MISCHIEVOUS-12B-Mix_0.2v", + "name": "MISCHIEVOUS-12B-Mix_0.2v", + "developer": "bamec66557", + "scores": { + "IFEval": 0.3624, + "BBH": 0.5434, + "MATH Level 5": 0.1261, + "GPQA": 0.3255, + "MUSR": 0.4158, + "MMLU-PRO": 0.3663 + } + }, + { + "model_id": "bamec66557/MISCHIEVOUS-12B-Mix_0.3v", + "name": "MISCHIEVOUS-12B-Mix_0.3v", + "developer": "bamec66557", + "scores": { + "IFEval": 0.387, + "BBH": 0.5431, + "MATH Level 5": 0.1337, + "GPQA": 0.3196, + "MUSR": 0.4131, + "MMLU-PRO": 0.3664 + } + }, + { + "model_id": "bamec66557/MISCHIEVOUS-12B-Mix_0.4v", + "name": "MISCHIEVOUS-12B-Mix_0.4v", + "developer": "bamec66557", + "scores": { + "IFEval": 0.6508, + "BBH": 0.5094, + "MATH Level 5": 0.1352, + "GPQA": 0.3171, + "MUSR": 0.4176, + "MMLU-PRO": 0.3683 + } + }, + { + "model_id": "bamec66557/MISCHIEVOUS-12B-Mix_0.5v", + "name": "MISCHIEVOUS-12B-Mix_0.5v", + "developer": "bamec66557", + "scores": { + "IFEval": 0.3746, + "BBH": 0.5422, + "MATH Level 5": 0.1367, + "GPQA": 0.3205, + "MUSR": 0.4132, + "MMLU-PRO": 0.3661 + } + }, + { + "model_id": "bamec66557/MISCHIEVOUS-12B-Mix_0.6v", + "name": "MISCHIEVOUS-12B-Mix_0.6v", + "developer": "bamec66557", + "scores": { + "IFEval": 0.4366, + "BBH": 0.5449, + "MATH Level 5": 0.1254, + "GPQA": 0.328, + "MUSR": 0.4185, + "MMLU-PRO": 0.3662 + } + }, + { + "model_id": "bamec66557/MISCHIEVOUS-12B-Mix_III_IV_V", + "name": "MISCHIEVOUS-12B-Mix_III_IV_V", + "developer": "bamec66557", + "scores": { + "IFEval": 0.4031, + "BBH": 0.5465, + "MATH Level 5": 0.1292, + "GPQA": 0.3205, + "MUSR": 0.4198, + "MMLU-PRO": 0.3664 + } + }, + { + "model_id": "bamec66557/MISCHIEVOUS-12B-Mix_III_ex_V", + "name": "MISCHIEVOUS-12B-Mix_III_ex_V", + "developer": "bamec66557", + "scores": { + "IFEval": 0.4316, + "BBH": 0.5449, + "MATH Level 5": 0.1322, + "GPQA": 0.3205, + "MUSR": 0.4198, + "MMLU-PRO": 0.3649 + } + }, + { + "model_id": "bamec66557/MISCHIEVOUS-12B-Mix_Neo", + "name": "MISCHIEVOUS-12B-Mix_Neo", + "developer": "bamec66557", + "scores": { + "IFEval": 0.625, + "BBH": 0.5078, + "MATH Level 5": 0.1329, + "GPQA": 0.3163, + "MUSR": 0.415, + "MMLU-PRO": 0.3685 + } + }, + { + "model_id": "bamec66557/Mistral-Nemo-VICIOUS_MESH-12B-2407", + "name": "Mistral-Nemo-VICIOUS_MESH-12B-2407", + "developer": "bamec66557", + "scores": { + "IFEval": 0.6706, + "BBH": 0.5156, + "MATH Level 5": 0.1367, + "GPQA": 0.3154, + "MUSR": 0.431, + "MMLU-PRO": 0.3677 + } + }, + { + "model_id": "bamec66557/NameLess-12B-prob", + "name": "NameLess-12B-prob", + "developer": "bamec66557", + "scores": { + "IFEval": 0.6602, + "BBH": 0.5158, + "MATH Level 5": 0.1261, + "GPQA": 0.3146, + "MUSR": 0.4336, + "MMLU-PRO": 0.3684 + } + }, + { + "model_id": "bamec66557/VICIOUS_MESH-12B", + "name": "VICIOUS_MESH-12B", + "developer": "bamec66557", + "scores": { + "IFEval": 0.3716, + "BBH": 0.5436, + "MATH Level 5": 0.1344, + "GPQA": 0.328, + "MUSR": 0.4105, + "MMLU-PRO": 0.3679 + } + }, + { + "model_id": "bamec66557/VICIOUS_MESH-12B-0.1v", + "name": "VICIOUS_MESH-12B-0.1v", + "developer": "bamec66557", + "scores": { + "IFEval": 0.3657, + "BBH": 0.5412, + "MATH Level 5": 0.1322, + "GPQA": 0.3247, + "MUSR": 0.4158, + "MMLU-PRO": 0.3683 + } + }, + { + "model_id": "bamec66557/VICIOUS_MESH-12B-0.X.ver", + "name": "VICIOUS_MESH-12B-0.X.ver", + "developer": "bamec66557", + "scores": { + "IFEval": 0.3776, + "BBH": 0.5416, + "MATH Level 5": 0.1201, + "GPQA": 0.3213, + "MUSR": 0.4198, + "MMLU-PRO": 0.3671 + } + }, + { + "model_id": "bamec66557/VICIOUS_MESH-12B-ALPHA", + "name": "VICIOUS_MESH-12B-ALPHA", + "developer": "bamec66557", + "scores": { + "IFEval": 0.6365, + "BBH": 0.5094, + "MATH Level 5": 0.1367, + "GPQA": 0.3138, + "MUSR": 0.4203, + "MMLU-PRO": 0.3697 + } + }, + { + "model_id": "bamec66557/VICIOUS_MESH-12B-BETA", + "name": "VICIOUS_MESH-12B-BETA", + "developer": "bamec66557", + "scores": { + "IFEval": 0.6721, + "BBH": 0.5156, + "MATH Level 5": 0.1329, + "GPQA": 0.3163, + "MUSR": 0.431, + "MMLU-PRO": 0.3679 + } + }, + { + "model_id": "bamec66557/VICIOUS_MESH-12B-DELTA", + "name": "VICIOUS_MESH-12B-DELTA", + "developer": "bamec66557", + "scores": { + "IFEval": 0.6469, + "BBH": 0.5055, + "MATH Level 5": 0.1375, + "GPQA": 0.3121, + "MUSR": 0.4057, + "MMLU-PRO": 0.3651 + } + }, + { + "model_id": "bamec66557/VICIOUS_MESH-12B-DIGAMMA", + "name": "VICIOUS_MESH-12B-DIGAMMA", + "developer": "bamec66557", + "scores": { + "IFEval": 0.6429, + "BBH": 0.5061, + "MATH Level 5": 0.1337, + "GPQA": 0.3129, + "MUSR": 0.4097, + "MMLU-PRO": 0.3659 + } + }, + { + "model_id": "bamec66557/VICIOUS_MESH-12B-EPSILON", + "name": "VICIOUS_MESH-12B-EPSILON", + "developer": "bamec66557", + "scores": { + "IFEval": 0.6305, + "BBH": 0.5038, + "MATH Level 5": 0.1261, + "GPQA": 0.3146, + "MUSR": 0.407, + "MMLU-PRO": 0.3648 + } + }, + { + "model_id": "bamec66557/VICIOUS_MESH-12B-GAMMA", + "name": "VICIOUS_MESH-12B-GAMMA", + "developer": "bamec66557", + "scores": { + "IFEval": 0.6362, + "BBH": 0.5182, + "MATH Level 5": 0.1307, + "GPQA": 0.3138, + "MUSR": 0.4363, + "MMLU-PRO": 0.3666 + } + }, + { + "model_id": "bamec66557/VICIOUS_MESH-12B-NEMO", + "name": "VICIOUS_MESH-12B-NEMO", + "developer": "bamec66557", + "scores": { + "IFEval": 0.4022, + "BBH": 0.5442, + "MATH Level 5": 0.1269, + "GPQA": 0.3238, + "MUSR": 0.4251, + "MMLU-PRO": 0.3716 + } + }, + { + "model_id": "bamec66557/VICIOUS_MESH-12B-OMEGA", + "name": "VICIOUS_MESH-12B-OMEGA", + "developer": "bamec66557", + "scores": { + "IFEval": 0.67, + "BBH": 0.5166, + "MATH Level 5": 0.1344, + "GPQA": 0.3154, + "MUSR": 0.4323, + "MMLU-PRO": 0.3677 + } + }, + { + "model_id": "bamec66557/VICIOUS_MESH-12B-UNION", + "name": "VICIOUS_MESH-12B-UNION", + "developer": "bamec66557", + "scores": { + "IFEval": 0.6429, + "BBH": 0.5107, + "MATH Level 5": 0.139, + "GPQA": 0.3121, + "MUSR": 0.4257, + "MMLU-PRO": 0.3672 + } + }, + { + "model_id": "bamec66557/VICIOUS_MESH-12B_Razor", + "name": "VICIOUS_MESH-12B_Razor", + "developer": "bamec66557", + "scores": { + "IFEval": 0.3736, + "BBH": 0.5447, + "MATH Level 5": 0.1299, + "GPQA": 0.323, + "MUSR": 0.4092, + "MMLU-PRO": 0.3669 + } + }, + { + "model_id": "bamec66557/mergekit-model_stock-zdaysvi", + "name": "mergekit-model_stock-zdaysvi", + "developer": "bamec66557", + "scores": { + "IFEval": 0.6426, + "BBH": 0.5063, + "MATH Level 5": 0.1352, + "GPQA": 0.3138, + "MUSR": 0.4124, + "MMLU-PRO": 0.3688 + } + }, + { + "model_id": "bamec66557/mergekit-ties-sinbkow", + "name": "mergekit-ties-sinbkow", + "developer": "bamec66557", + "scores": { + "IFEval": 0.6432, + "BBH": 0.5092, + "MATH Level 5": 0.145, + "GPQA": 0.3196, + "MUSR": 0.4045, + "MMLU-PRO": 0.3603 + } + }, + { + "model_id": "belztjti/dffghgjh", + "name": "dffghgjh", + "developer": "belztjti", + "scores": { + "IFEval": 0.5784, + "BBH": 0.3582, + "MATH Level 5": 0.0234, + "GPQA": 0.2634, + "MUSR": 0.3475, + "MMLU-PRO": 0.3422 + } + }, + { + "model_id": "belztjti/dtfgv", + "name": "dtfgv", + "developer": "belztjti", + "scores": { + "IFEval": 0.3345, + "BBH": 0.3282, + "MATH Level 5": 0.0181, + "GPQA": 0.2693, + "MUSR": 0.3794, + "MMLU-PRO": 0.1504 + } + }, + { + "model_id": "benhaotang/phi4-qwq-sky-t1", + "name": "phi4-qwq-sky-t1", + "developer": "benhaotang", + "scores": { + "IFEval": 0.046, + "BBH": 0.6711, + "MATH Level 5": 0.4101, + "GPQA": 0.3951, + "MUSR": 0.49, + "MMLU-PRO": 0.5244 + } + }, + { + "model_id": "beomi/gemma-mling-7b", + "name": "gemma-mling-7b", + "developer": "beomi", + "scores": { + "IFEval": 0.2029, + "BBH": 0.4068, + "MATH Level 5": 0.0544, + "GPQA": 0.25, + "MUSR": 0.3759, + "MMLU-PRO": 0.2633 + } + }, + { + "model_id": "beowolx/CodeNinja-1.0-OpenChat-7B", + "name": "CodeNinja-1.0-OpenChat-7B", + "developer": "beowolx", + "scores": { + "IFEval": 0.5447, + "BBH": 0.4441, + "MATH Level 5": 0.0672, + "GPQA": 0.2945, + "MUSR": 0.4243, + "MMLU-PRO": 0.3015 + } + }, + { + "model_id": "berkeley-nest/Starling-LM-7B-alpha", + "name": "Starling-LM-7B-alpha", + "developer": "berkeley-nest", + "scores": { + "IFEval": 0.548, + "BBH": 0.444, + "MATH Level 5": 0.0838, + "GPQA": 0.297, + "MUSR": 0.412, + "MMLU-PRO": 0.3172 + } + }, + { + "model_id": "bfuzzy1/Gunny", + "name": "Gunny", + "developer": "bfuzzy1", + "scores": { + "IFEval": 0.7129, + "BBH": 0.4546, + "MATH Level 5": 0.173, + "GPQA": 0.2785, + "MUSR": 0.3583, + "MMLU-PRO": 0.3039 + } + }, + { + "model_id": "bfuzzy1/acheron", + "name": "acheron", + "developer": "bfuzzy1", + "scores": { + "IFEval": 0.1983, + "BBH": 0.3108, + "MATH Level 5": 0.0166, + "GPQA": 0.2391, + "MUSR": 0.3511, + "MMLU-PRO": 0.1096 + } + }, + { + "model_id": "bfuzzy1/acheron-c", + "name": "acheron-c", + "developer": "bfuzzy1", + "scores": { + "IFEval": 0.1929, + "BBH": 0.3026, + "MATH Level 5": 0.003, + "GPQA": 0.2475, + "MUSR": 0.3382, + "MMLU-PRO": 0.1172 + } + }, + { + "model_id": "bfuzzy1/acheron-d", + "name": "acheron-d", + "developer": "bfuzzy1", + "scores": { + "IFEval": 0.1925, + "BBH": 0.314, + "MATH Level 5": 0.0151, + "GPQA": 0.2366, + "MUSR": 0.3497, + "MMLU-PRO": 0.1134 + } + }, + { + "model_id": "bfuzzy1/acheron-m", + "name": "acheron-m", + "developer": "bfuzzy1", + "scores": { + "IFEval": 0.1758, + "BBH": 0.2928, + "MATH Level 5": 0.0091, + "GPQA": 0.2601, + "MUSR": 0.3487, + "MMLU-PRO": 0.1113 + } + }, + { + "model_id": "bfuzzy1/acheron-m1a-llama", + "name": "acheron-m1a-llama", + "developer": "bfuzzy1", + "scores": { + "IFEval": 0.1125, + "BBH": 0.2956, + "MATH Level 5": 0.0076, + "GPQA": 0.2601, + "MUSR": 0.3633, + "MMLU-PRO": 0.1146 + } + }, + { + "model_id": "bfuzzy1/llambses-1", + "name": "llambses-1", + "developer": "bfuzzy1", + "scores": { + "IFEval": 0.3554, + "BBH": 0.5047, + "MATH Level 5": 0.0687, + "GPQA": 0.2978, + "MUSR": 0.4529, + "MMLU-PRO": 0.314 + } + }, + { + "model_id": "bhuvneshsaini/merged_model", + "name": "merged_model", + "developer": "bhuvneshsaini", + "scores": { + "IFEval": 0.1813, + "BBH": 0.336, + "MATH Level 5": 0.0, + "GPQA": 0.25, + "MUSR": 0.3497, + "MMLU-PRO": 0.1445 + } + }, + { + "model_id": "bigcode/starcoder2-15b", + "name": "starcoder2-15b", + "developer": "bigcode", + "scores": { + "IFEval": 0.278, + "BBH": 0.4448, + "MATH Level 5": 0.0597, + "GPQA": 0.2735, + "MUSR": 0.3501, + "MMLU-PRO": 0.2353 + } + }, + { + "model_id": "bigcode/starcoder2-3b", + "name": "starcoder2-3b", + "developer": "bigcode", + "scores": { + "IFEval": 0.2037, + "BBH": 0.3509, + "MATH Level 5": 0.0151, + "GPQA": 0.2441, + "MUSR": 0.3435, + "MMLU-PRO": 0.1636 + } + }, + { + "model_id": "bigcode/starcoder2-7b", + "name": "starcoder2-7b", + "developer": "bigcode", + "scores": { + "IFEval": 0.2209, + "BBH": 0.3661, + "MATH Level 5": 0.031, + "GPQA": 0.2517, + "MUSR": 0.3793, + "MMLU-PRO": 0.1642 + } + }, + { + "model_id": "bigscience/bloom-1b1", + "name": "bloom-1b1", + "developer": "bigscience", + "scores": { + "IFEval": 0.1373, + "BBH": 0.3107, + "MATH Level 5": 0.0053, + "GPQA": 0.2592, + "MUSR": 0.37, + "MMLU-PRO": 0.1108 + } + }, + { + "model_id": "bigscience/bloom-1b7", + "name": "bloom-1b7", + "developer": "bigscience", + "scores": { + "IFEval": 0.1044, + "BBH": 0.3141, + "MATH Level 5": 0.0053, + "GPQA": 0.2584, + "MUSR": 0.3886, + "MMLU-PRO": 0.1086 + } + }, + { + "model_id": "bigscience/bloom-3b", + "name": "bloom-3b", + "developer": "bigscience", + "scores": { + "IFEval": 0.1271, + "BBH": 0.3063, + "MATH Level 5": 0.0083, + "GPQA": 0.2399, + "MUSR": 0.3981, + "MMLU-PRO": 0.1133 + } + }, + { + "model_id": "bigscience/bloom-560m", + "name": "bloom-560m", + "developer": "bigscience", + "scores": { + "IFEval": 0.062, + "BBH": 0.3026, + "MATH Level 5": 0.0038, + "GPQA": 0.2617, + "MUSR": 0.4031, + "MMLU-PRO": 0.1164 + } + }, + { + "model_id": "bigscience/bloom-7b1", + "name": "bloom-7b1", + "developer": "bigscience", + "scores": { + "IFEval": 0.1322, + "BBH": 0.3114, + "MATH Level 5": 0.0053, + "GPQA": 0.2643, + "MUSR": 0.3487, + "MMLU-PRO": 0.1105 + } + }, + { + "model_id": "bluuwhale/L3-SthenoMaid-8B-V1", + "name": "L3-SthenoMaid-8B-V1", + "developer": "bluuwhale", + "scores": { + "IFEval": 0.7345, + "BBH": 0.5219, + "MATH Level 5": 0.108, + "GPQA": 0.2802, + "MUSR": 0.3687, + "MMLU-PRO": 0.3656 + } + }, + { + "model_id": "bond005/meno-tiny-0.1", + "name": "meno-tiny-0.1", + "developer": "bond005", + "scores": { + "IFEval": 0.455, + "BBH": 0.4263, + "MATH Level 5": 0.139, + "GPQA": 0.2819, + "MUSR": 0.4185, + "MMLU-PRO": 0.2786 + } + }, + { + "model_id": "bosonai/Higgs-Llama-3-70B", + "name": "Higgs-Llama-3-70B", + "developer": "bosonai", + "scores": { + "IFEval": 0.5561, + "BBH": 0.6258, + "MATH Level 5": 0.2523, + "GPQA": 0.3666, + "MUSR": 0.4471, + "MMLU-PRO": 0.4902 + } + }, + { + "model_id": "braindao/DeepSeek-R1-Distill-Qwen-1.5B-Blunt", + "name": "DeepSeek-R1-Distill-Qwen-1.5B-Blunt", + "developer": "braindao", + "scores": { + "IFEval": 0.2611, + "BBH": 0.2774, + "MATH Level 5": 0.1382, + "GPQA": 0.2475, + "MUSR": 0.3595, + "MMLU-PRO": 0.1184 + } + }, + { + "model_id": "braindao/DeepSeek-R1-Distill-Qwen-1.5B-Reflective", + "name": "DeepSeek-R1-Distill-Qwen-1.5B-Reflective", + "developer": "braindao", + "scores": { + "IFEval": 0.3033, + "BBH": 0.2908, + "MATH Level 5": 0.1631, + "GPQA": 0.2609, + "MUSR": 0.3356, + "MMLU-PRO": 0.113 + } + }, + { + "model_id": "braindao/DeepSeek-R1-Distill-Qwen-14B", + "name": "DeepSeek-R1-Distill-Qwen-14B", + "developer": "braindao", + "scores": { + "IFEval": 0.4172, + "BBH": 0.3033, + "MATH Level 5": 0.176, + "GPQA": 0.2802, + "MUSR": 0.4488, + "MMLU-PRO": 0.1127 + } + }, + { + "model_id": "braindao/DeepSeek-R1-Distill-Qwen-14B-ABUB-ST", + "name": "DeepSeek-R1-Distill-Qwen-14B-ABUB-ST", + "developer": "braindao", + "scores": { + "IFEval": 0.3752, + "BBH": 0.4927, + "MATH Level 5": 0.5015, + "GPQA": 0.3448, + "MUSR": 0.4221, + "MMLU-PRO": 0.4243 + } + }, + { + "model_id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt", + "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt", + "developer": "braindao", + "scores": { + "IFEval": 0.5612, + "BBH": 0.3283, + "MATH Level 5": 0.1639, + "GPQA": 0.3029, + "MUSR": 0.4554, + "MMLU-PRO": 0.1447 + } + }, + { + "model_id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored", + "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored", + "developer": "braindao", + "scores": { + "IFEval": 0.5422, + "BBH": 0.317, + "MATH Level 5": 0.1631, + "GPQA": 0.2827, + "MUSR": 0.4487, + "MMLU-PRO": 0.1431 + } + }, + { + "model_id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt", + "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt", + "developer": "braindao", + "scores": { + "IFEval": 0.5221, + "BBH": 0.3199, + "MATH Level 5": 0.2508, + "GPQA": 0.2785, + "MUSR": 0.4527, + "MMLU-PRO": 0.1484 + } + }, + { + "model_id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective", + "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective", + "developer": "braindao", + "scores": { + "IFEval": 0.554, + "BBH": 0.3371, + "MATH Level 5": 0.2372, + "GPQA": 0.2777, + "MUSR": 0.4248, + "MMLU-PRO": 0.1504 + } + }, + { + "model_id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective", + "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective", + "developer": "braindao", + "scores": { + "IFEval": 0.5139, + "BBH": 0.3013, + "MATH Level 5": 0.1473, + "GPQA": 0.2878, + "MUSR": 0.4433, + "MMLU-PRO": 0.1289 + } + }, + { + "model_id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Reflective", + "name": "DeepSeek-R1-Distill-Qwen-14B-Reflective", + "developer": "braindao", + "scores": { + "IFEval": 0.429, + "BBH": 0.3012, + "MATH Level 5": 0.1918, + "GPQA": 0.2727, + "MUSR": 0.4554, + "MMLU-PRO": 0.1129 + } + }, + { + "model_id": "braindao/DeepSeek-R1-Distill-Qwen-7B", + "name": "DeepSeek-R1-Distill-Qwen-7B", + "developer": "braindao", + "scores": { + "IFEval": 0.3968, + "BBH": 0.2887, + "MATH Level 5": 0.1918, + "GPQA": 0.2617, + "MUSR": 0.3767, + "MMLU-PRO": 0.1141 + } + }, + { + "model_id": "braindao/DeepSeek-R1-Distill-Qwen-7B-Blunt", + "name": "DeepSeek-R1-Distill-Qwen-7B-Blunt", + "developer": "braindao", + "scores": { + "IFEval": 0.4266, + "BBH": 0.2902, + "MATH Level 5": 0.2145, + "GPQA": 0.271, + "MUSR": 0.3885, + "MMLU-PRO": 0.1169 + } + }, + { + "model_id": "braindao/DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored", + "name": "DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored", + "developer": "braindao", + "scores": { + "IFEval": 0.3655, + "BBH": 0.2958, + "MATH Level 5": 0.1737, + "GPQA": 0.2534, + "MUSR": 0.3846, + "MMLU-PRO": 0.1133 + } + }, + { + "model_id": "braindao/DeepSeek-R1-Distill-Qwen-7B-Reflective", + "name": "DeepSeek-R1-Distill-Qwen-7B-Reflective", + "developer": "braindao", + "scores": { + "IFEval": 0.3922, + "BBH": 0.2907, + "MATH Level 5": 0.2024, + "GPQA": 0.2542, + "MUSR": 0.39, + "MMLU-PRO": 0.1155 + } + }, + { + "model_id": "braindao/Qwen2.5-14B", + "name": "Qwen2.5-14B", + "developer": "braindao", + "scores": { + "IFEval": 0.5409, + "BBH": 0.5853, + "MATH Level 5": 0.2923, + "GPQA": 0.3733, + "MUSR": 0.4124, + "MMLU-PRO": 0.4884 + } + }, + { + "model_id": "braindao/Qwen2.5-14B-Instruct", + "name": "Qwen2.5-14B-Instruct", + "developer": "braindao", + "scores": { + "IFEval": 0.8143, + "BBH": 0.6404, + "MATH Level 5": 0.5529, + "GPQA": 0.3289, + "MUSR": 0.414, + "MMLU-PRO": 0.4889 + } + }, + { + "model_id": "braindao/iq-code-evmind-0.5b", + "name": "iq-code-evmind-0.5b", + "developer": "braindao", + "scores": { + "IFEval": 0.3216, + "BBH": 0.3164, + "MATH Level 5": 0.0242, + "GPQA": 0.2416, + "MUSR": 0.3304, + "MMLU-PRO": 0.1189 + } + }, + { + "model_id": "brgx53/3Bgeneral-ECE-PRYMMAL-Martial", + "name": "3Bgeneral-ECE-PRYMMAL-Martial", + "developer": "brgx53", + "scores": { + "IFEval": 0.3289, + "BBH": 0.5458, + "MATH Level 5": 0.1314, + "GPQA": 0.3247, + "MUSR": 0.4373, + "MMLU-PRO": 0.3934 + } + }, + { + "model_id": "brgx53/3Bgeneralv2-ECE-PRYMMAL-Martial", + "name": "3Bgeneralv2-ECE-PRYMMAL-Martial", + "developer": "brgx53", + "scores": { + "IFEval": 0.5677, + "BBH": 0.5607, + "MATH Level 5": 0.3497, + "GPQA": 0.3112, + "MUSR": 0.4356, + "MMLU-PRO": 0.4505 + } + }, + { + "model_id": "brgx53/3Blareneg-ECE-PRYMMAL-Martial", + "name": "3Blareneg-ECE-PRYMMAL-Martial", + "developer": "brgx53", + "scores": { + "IFEval": 0.2876, + "BBH": 0.5358, + "MATH Level 5": 0.1208, + "GPQA": 0.3347, + "MUSR": 0.4429, + "MMLU-PRO": 0.4016 + } + }, + { + "model_id": "brgx53/3Blarenegv2-ECE-PRYMMAL-Martial", + "name": "3Blarenegv2-ECE-PRYMMAL-Martial", + "developer": "brgx53", + "scores": { + "IFEval": 0.5662, + "BBH": 0.5607, + "MATH Level 5": 0.3497, + "GPQA": 0.3112, + "MUSR": 0.4356, + "MMLU-PRO": 0.4505 + } + }, + { + "model_id": "brgx53/Barracuda-PRYMMAL-ECE-TW3", + "name": "Barracuda-PRYMMAL-ECE-TW3", + "developer": "brgx53", + "scores": { + "IFEval": 0.164, + "BBH": 0.3002, + "MATH Level 5": 0.0023, + "GPQA": 0.2534, + "MUSR": 0.3609, + "MMLU-PRO": 0.1093 + } + }, + { + "model_id": "brgx53/LaConfiance-PRYMMAL-ECE-TW3", + "name": "LaConfiance-PRYMMAL-ECE-TW3", + "developer": "brgx53", + "scores": { + "IFEval": 0.1579, + "BBH": 0.2962, + "MATH Level 5": 0.0, + "GPQA": 0.2517, + "MUSR": 0.3846, + "MMLU-PRO": 0.1146 + } + }, + { + "model_id": "bunnycore/Best-Mix-Llama-3.1-8B", + "name": "Best-Mix-Llama-3.1-8B", + "developer": "bunnycore", + "scores": { + "IFEval": 0.2067, + "BBH": 0.3432, + "MATH Level 5": 0.2054, + "GPQA": 0.2651, + "MUSR": 0.2929, + "MMLU-PRO": 0.1565 + } + }, + { + "model_id": "bunnycore/Blabbertron-1.0", + "name": "Blabbertron-1.0", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7433, + "BBH": 0.5497, + "MATH Level 5": 0.4924, + "GPQA": 0.302, + "MUSR": 0.4337, + "MMLU-PRO": 0.4354 + } + }, + { + "model_id": "bunnycore/Blabbertron-1.1", + "name": "Blabbertron-1.1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7265, + "BBH": 0.5534, + "MATH Level 5": 0.4804, + "GPQA": 0.3029, + "MUSR": 0.4416, + "MMLU-PRO": 0.4431 + } + }, + { + "model_id": "bunnycore/CyberCore-Qwen-2.1-7B", + "name": "CyberCore-Qwen-2.1-7B", + "developer": "bunnycore", + "scores": { + "IFEval": 0.5766, + "BBH": 0.5572, + "MATH Level 5": 0.3588, + "GPQA": 0.3079, + "MUSR": 0.4145, + "MMLU-PRO": 0.4445 + } + }, + { + "model_id": "bunnycore/DeepQwen-3B-LCoT-SCE", + "name": "DeepQwen-3B-LCoT-SCE", + "developer": "bunnycore", + "scores": { + "IFEval": 0.449, + "BBH": 0.4512, + "MATH Level 5": 0.247, + "GPQA": 0.2626, + "MUSR": 0.3514, + "MMLU-PRO": 0.329 + } + }, + { + "model_id": "bunnycore/DeepSeek-R1-Distill-Qwen-7B-RRP-Ex", + "name": "DeepSeek-R1-Distill-Qwen-7B-RRP-Ex", + "developer": "bunnycore", + "scores": { + "IFEval": 0.3901, + "BBH": 0.3494, + "MATH Level 5": 0.1654, + "GPQA": 0.2785, + "MUSR": 0.3663, + "MMLU-PRO": 0.2508 + } + }, + { + "model_id": "bunnycore/DeepThinker-7B-Sce-v1", + "name": "DeepThinker-7B-Sce-v1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.1218, + "BBH": 0.3018, + "MATH Level 5": 0.0098, + "GPQA": 0.2517, + "MUSR": 0.4194, + "MMLU-PRO": 0.1123 + } + }, + { + "model_id": "bunnycore/DeepThinker-7B-Sce-v2", + "name": "DeepThinker-7B-Sce-v2", + "developer": "bunnycore", + "scores": { + "IFEval": 0.1631, + "BBH": 0.3057, + "MATH Level 5": 0.0113, + "GPQA": 0.2584, + "MUSR": 0.4101, + "MMLU-PRO": 0.1146 + } + }, + { + "model_id": "bunnycore/FuseCyberMix-Qwen-2.5-7B-Instruct", + "name": "FuseCyberMix-Qwen-2.5-7B-Instruct", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7019, + "BBH": 0.5518, + "MATH Level 5": 0.4841, + "GPQA": 0.297, + "MUSR": 0.402, + "MMLU-PRO": 0.4337 + } + }, + { + "model_id": "bunnycore/FuseQwQen-7B", + "name": "FuseQwQen-7B", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7275, + "BBH": 0.5504, + "MATH Level 5": 0.4366, + "GPQA": 0.2945, + "MUSR": 0.4217, + "MMLU-PRO": 0.4407 + } + }, + { + "model_id": "bunnycore/FwF-Qwen-7B-0.1", + "name": "FwF-Qwen-7B-0.1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.3005, + "BBH": 0.5019, + "MATH Level 5": 0.2764, + "GPQA": 0.271, + "MUSR": 0.3952, + "MMLU-PRO": 0.4061 + } + }, + { + "model_id": "bunnycore/FwF-Qwen-7B-0.2", + "name": "FwF-Qwen-7B-0.2", + "developer": "bunnycore", + "scores": { + "IFEval": 0.4479, + "BBH": 0.5596, + "MATH Level 5": 0.426, + "GPQA": 0.2903, + "MUSR": 0.4218, + "MMLU-PRO": 0.4382 + } + }, + { + "model_id": "bunnycore/Gemma-2-2B-Smart", + "name": "Gemma-2-2B-Smart", + "developer": "bunnycore", + "scores": { + "IFEval": 0.1321, + "BBH": 0.3974, + "MATH Level 5": 0.0332, + "GPQA": 0.2827, + "MUSR": 0.4249, + "MMLU-PRO": 0.2426 + } + }, + { + "model_id": "bunnycore/Gemma2-9B-TitanFusion", + "name": "Gemma2-9B-TitanFusion", + "developer": "bunnycore", + "scores": { + "IFEval": 0.1618, + "BBH": 0.5712, + "MATH Level 5": 0.077, + "GPQA": 0.3322, + "MUSR": 0.4136, + "MMLU-PRO": 0.396 + } + }, + { + "model_id": "bunnycore/HyperLlama-3.1-8B", + "name": "HyperLlama-3.1-8B", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7883, + "BBH": 0.5103, + "MATH Level 5": 0.1828, + "GPQA": 0.2869, + "MUSR": 0.3829, + "MMLU-PRO": 0.3783 + } + }, + { + "model_id": "bunnycore/Llama-3.1-8B-TitanFusion-Mix", + "name": "Llama-3.1-8B-TitanFusion-Mix", + "developer": "bunnycore", + "scores": { + "IFEval": 0.4925, + "BBH": 0.5756, + "MATH Level 5": 0.1284, + "GPQA": 0.2953, + "MUSR": 0.4317, + "MMLU-PRO": 0.3695 + } + }, + { + "model_id": "bunnycore/Llama-3.1-8B-TitanFusion-v3", + "name": "Llama-3.1-8B-TitanFusion-v3", + "developer": "bunnycore", + "scores": { + "IFEval": 0.481, + "BBH": 0.5262, + "MATH Level 5": 0.142, + "GPQA": 0.3087, + "MUSR": 0.4302, + "MMLU-PRO": 0.3806 + } + }, + { + "model_id": "bunnycore/Llama-3.2-3B-All-Mix", + "name": "Llama-3.2-3B-All-Mix", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7226, + "BBH": 0.4508, + "MATH Level 5": 0.1503, + "GPQA": 0.2626, + "MUSR": 0.3287, + "MMLU-PRO": 0.316 + } + }, + { + "model_id": "bunnycore/Llama-3.2-3B-Bespoke-Thought", + "name": "Llama-3.2-3B-Bespoke-Thought", + "developer": "bunnycore", + "scores": { + "IFEval": 0.4113, + "BBH": 0.4522, + "MATH Level 5": 0.1647, + "GPQA": 0.2659, + "MUSR": 0.3302, + "MMLU-PRO": 0.311 + } + }, + { + "model_id": "bunnycore/Llama-3.2-3B-Booval", + "name": "Llama-3.2-3B-Booval", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6669, + "BBH": 0.4514, + "MATH Level 5": 0.1269, + "GPQA": 0.2668, + "MUSR": 0.3394, + "MMLU-PRO": 0.3058 + } + }, + { + "model_id": "bunnycore/Llama-3.2-3B-Deep-Test", + "name": "Llama-3.2-3B-Deep-Test", + "developer": "bunnycore", + "scores": { + "IFEval": 0.1775, + "BBH": 0.295, + "MATH Level 5": 0.0, + "GPQA": 0.2517, + "MUSR": 0.3647, + "MMLU-PRO": 0.1049 + } + }, + { + "model_id": "bunnycore/Llama-3.2-3B-Della", + "name": "Llama-3.2-3B-Della", + "developer": "bunnycore", + "scores": { + "IFEval": 0.3561, + "BBH": 0.3683, + "MATH Level 5": 0.0302, + "GPQA": 0.276, + "MUSR": 0.3902, + "MMLU-PRO": 0.2128 + } + }, + { + "model_id": "bunnycore/Llama-3.2-3B-Long-Think", + "name": "Llama-3.2-3B-Long-Think", + "developer": "bunnycore", + "scores": { + "IFEval": 0.5473, + "BBH": 0.461, + "MATH Level 5": 0.1458, + "GPQA": 0.2609, + "MUSR": 0.3396, + "MMLU-PRO": 0.3048 + } + }, + { + "model_id": "bunnycore/Llama-3.2-3B-Mix-Skill", + "name": "Llama-3.2-3B-Mix-Skill", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6404, + "BBH": 0.4582, + "MATH Level 5": 0.1473, + "GPQA": 0.2617, + "MUSR": 0.3396, + "MMLU-PRO": 0.3121 + } + }, + { + "model_id": "bunnycore/Llama-3.2-3B-ProdigyPlus", + "name": "Llama-3.2-3B-ProdigyPlus", + "developer": "bunnycore", + "scores": { + "IFEval": 0.4015, + "BBH": 0.4392, + "MATH Level 5": 0.1156, + "GPQA": 0.2685, + "MUSR": 0.358, + "MMLU-PRO": 0.2817 + } + }, + { + "model_id": "bunnycore/Llama-3.2-3B-ProdigyPlusPlus", + "name": "Llama-3.2-3B-ProdigyPlusPlus", + "developer": "bunnycore", + "scores": { + "IFEval": 0.1645, + "BBH": 0.369, + "MATH Level 5": 0.0453, + "GPQA": 0.2534, + "MUSR": 0.3541, + "MMLU-PRO": 0.15 + } + }, + { + "model_id": "bunnycore/Llama-3.2-3B-RP-DeepThink", + "name": "Llama-3.2-3B-RP-DeepThink", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7144, + "BBH": 0.4563, + "MATH Level 5": 0.1609, + "GPQA": 0.2659, + "MUSR": 0.3302, + "MMLU-PRO": 0.3242 + } + }, + { + "model_id": "bunnycore/Llama-3.2-3B-RRStock", + "name": "Llama-3.2-3B-RRStock", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6657, + "BBH": 0.4568, + "MATH Level 5": 0.1699, + "GPQA": 0.2659, + "MUSR": 0.3314, + "MMLU-PRO": 0.3236 + } + }, + { + "model_id": "bunnycore/Llama-3.2-3B-ToxicKod", + "name": "Llama-3.2-3B-ToxicKod", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6319, + "BBH": 0.4525, + "MATH Level 5": 0.1699, + "GPQA": 0.2659, + "MUSR": 0.3475, + "MMLU-PRO": 0.288 + } + }, + { + "model_id": "bunnycore/Llama-3.2-3b-RP-Toxic-Fuse", + "name": "Llama-3.2-3b-RP-Toxic-Fuse", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6834, + "BBH": 0.465, + "MATH Level 5": 0.2402, + "GPQA": 0.2777, + "MUSR": 0.3954, + "MMLU-PRO": 0.3106 + } + }, + { + "model_id": "bunnycore/Maestro-S1k-7B-Sce", + "name": "Maestro-S1k-7B-Sce", + "developer": "bunnycore", + "scores": { + "IFEval": 0.2523, + "BBH": 0.3104, + "MATH Level 5": 0.0279, + "GPQA": 0.2609, + "MUSR": 0.3768, + "MMLU-PRO": 0.117 + } + }, + { + "model_id": "bunnycore/Phi-3.5-mini-TitanFusion-0.1", + "name": "Phi-3.5-mini-TitanFusion-0.1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.5228, + "BBH": 0.5374, + "MATH Level 5": 0.1186, + "GPQA": 0.3314, + "MUSR": 0.4453, + "MMLU-PRO": 0.3807 + } + }, + { + "model_id": "bunnycore/Phi-4-Model-Stock", + "name": "Phi-4-Model-Stock", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6879, + "BBH": 0.689, + "MATH Level 5": 0.4298, + "GPQA": 0.3549, + "MUSR": 0.4441, + "MMLU-PRO": 0.5368 + } + }, + { + "model_id": "bunnycore/Phi-4-Model-Stock-v2", + "name": "Phi-4-Model-Stock-v2", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6375, + "BBH": 0.6825, + "MATH Level 5": 0.3754, + "GPQA": 0.349, + "MUSR": 0.4662, + "MMLU-PRO": 0.5331 + } + }, + { + "model_id": "bunnycore/Phi-4-Model-Stock-v3", + "name": "Phi-4-Model-Stock-v3", + "developer": "bunnycore", + "scores": { + "IFEval": 0.5912, + "BBH": 0.6726, + "MATH Level 5": 0.4902, + "GPQA": 0.2894, + "MUSR": 0.4166, + "MMLU-PRO": 0.5381 + } + }, + { + "model_id": "bunnycore/Phi-4-Model-Stock-v4", + "name": "Phi-4-Model-Stock-v4", + "developer": "bunnycore", + "scores": { + "IFEval": 0.711, + "BBH": 0.6924, + "MATH Level 5": 0.3829, + "GPQA": 0.3691, + "MUSR": 0.4611, + "MMLU-PRO": 0.5394 + } + }, + { + "model_id": "bunnycore/Phi-4-RP-v0", + "name": "Phi-4-RP-v0", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6827, + "BBH": 0.6856, + "MATH Level 5": 0.3316, + "GPQA": 0.3523, + "MUSR": 0.4141, + "MMLU-PRO": 0.5364 + } + }, + { + "model_id": "bunnycore/Phi-4-RR-Shoup", + "name": "Phi-4-RR-Shoup", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6587, + "BBH": 0.6947, + "MATH Level 5": 0.4992, + "GPQA": 0.3372, + "MUSR": 0.444, + "MMLU-PRO": 0.5429 + } + }, + { + "model_id": "bunnycore/Phi-4-RStock-v0.1", + "name": "Phi-4-RStock-v0.1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7019, + "BBH": 0.6928, + "MATH Level 5": 0.395, + "GPQA": 0.3649, + "MUSR": 0.4584, + "MMLU-PRO": 0.5401 + } + }, + { + "model_id": "bunnycore/Phi-4-ReasoningRP", + "name": "Phi-4-ReasoningRP", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6736, + "BBH": 0.6922, + "MATH Level 5": 0.4569, + "GPQA": 0.344, + "MUSR": 0.4491, + "MMLU-PRO": 0.5421 + } + }, + { + "model_id": "bunnycore/Phi-4-Sce-exp-v0.1", + "name": "Phi-4-Sce-exp-v0.1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6595, + "BBH": 0.6943, + "MATH Level 5": 0.503, + "GPQA": 0.3356, + "MUSR": 0.4441, + "MMLU-PRO": 0.5423 + } + }, + { + "model_id": "bunnycore/Phi-4-Stock-Ex", + "name": "Phi-4-Stock-Ex", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6575, + "BBH": 0.6864, + "MATH Level 5": 0.4086, + "GPQA": 0.3507, + "MUSR": 0.4624, + "MMLU-PRO": 0.5375 + } + }, + { + "model_id": "bunnycore/Phi-4-Stock-RP", + "name": "Phi-4-Stock-RP", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6399, + "BBH": 0.686, + "MATH Level 5": 0.3414, + "GPQA": 0.3582, + "MUSR": 0.4715, + "MMLU-PRO": 0.5317 + } + }, + { + "model_id": "bunnycore/Phi-4-Trim-Exp1", + "name": "Phi-4-Trim-Exp1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.1219, + "BBH": 0.2852, + "MATH Level 5": 0.0053, + "GPQA": 0.255, + "MUSR": 0.4177, + "MMLU-PRO": 0.1147 + } + }, + { + "model_id": "bunnycore/Phi-Seek-4-Sce-V1", + "name": "Phi-Seek-4-Sce-V1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.2935, + "BBH": 0.6459, + "MATH Level 5": 0.2145, + "GPQA": 0.276, + "MUSR": 0.3982, + "MMLU-PRO": 0.5123 + } + }, + { + "model_id": "bunnycore/Qandora-2.5-7B-Creative", + "name": "Qandora-2.5-7B-Creative", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6803, + "BBH": 0.5542, + "MATH Level 5": 0.3059, + "GPQA": 0.3104, + "MUSR": 0.4212, + "MMLU-PRO": 0.448 + } + }, + { + "model_id": "bunnycore/QandoraExp-7B", + "name": "QandoraExp-7B", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7509, + "BBH": 0.5478, + "MATH Level 5": 0.4743, + "GPQA": 0.3104, + "MUSR": 0.4312, + "MMLU-PRO": 0.441 + } + }, + { + "model_id": "bunnycore/QandoraExp-7B-Persona", + "name": "QandoraExp-7B-Persona", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6247, + "BBH": 0.5558, + "MATH Level 5": 0.3104, + "GPQA": 0.3146, + "MUSR": 0.4372, + "MMLU-PRO": 0.4407 + } + }, + { + "model_id": "bunnycore/QandoraExp-7B-v2", + "name": "QandoraExp-7B-v2", + "developer": "bunnycore", + "scores": { + "IFEval": 0.5607, + "BBH": 0.5445, + "MATH Level 5": 0.4713, + "GPQA": 0.3029, + "MUSR": 0.4045, + "MMLU-PRO": 0.3909 + } + }, + { + "model_id": "bunnycore/QwQen-3B-LCoT", + "name": "QwQen-3B-LCoT", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6025, + "BBH": 0.4899, + "MATH Level 5": 0.3618, + "GPQA": 0.2668, + "MUSR": 0.4178, + "MMLU-PRO": 0.3699 + } + }, + { + "model_id": "bunnycore/QwQen-3B-LCoT-R1", + "name": "QwQen-3B-LCoT-R1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.5342, + "BBH": 0.4799, + "MATH Level 5": 0.3353, + "GPQA": 0.2617, + "MUSR": 0.4138, + "MMLU-PRO": 0.3723 + } + }, + { + "model_id": "bunnycore/Qwen-2.5-7B-Deep-Sky-T1", + "name": "Qwen-2.5-7B-Deep-Sky-T1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.4208, + "BBH": 0.414, + "MATH Level 5": 0.0551, + "GPQA": 0.281, + "MUSR": 0.4018, + "MMLU-PRO": 0.2104 + } + }, + { + "model_id": "bunnycore/Qwen-2.5-7B-Deep-Stock-v1", + "name": "Qwen-2.5-7B-Deep-Stock-v1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.5695, + "BBH": 0.5361, + "MATH Level 5": 0.2644, + "GPQA": 0.2777, + "MUSR": 0.4109, + "MMLU-PRO": 0.4066 + } + }, + { + "model_id": "bunnycore/Qwen-2.5-7B-Deep-Stock-v4", + "name": "Qwen-2.5-7B-Deep-Stock-v4", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7753, + "BBH": 0.5453, + "MATH Level 5": 0.4894, + "GPQA": 0.3003, + "MUSR": 0.4127, + "MMLU-PRO": 0.4342 + } + }, + { + "model_id": "bunnycore/Qwen-2.5-7B-Deep-Stock-v5", + "name": "Qwen-2.5-7B-Deep-Stock-v5", + "developer": "bunnycore", + "scores": { + "IFEval": 0.4509, + "BBH": 0.4672, + "MATH Level 5": 0.1473, + "GPQA": 0.2701, + "MUSR": 0.3648, + "MMLU-PRO": 0.2832 + } + }, + { + "model_id": "bunnycore/Qwen-2.5-7B-Exp-Sce", + "name": "Qwen-2.5-7B-Exp-Sce", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7652, + "BBH": 0.5506, + "MATH Level 5": 0.3255, + "GPQA": 0.2987, + "MUSR": 0.443, + "MMLU-PRO": 0.4259 + } + }, + { + "model_id": "bunnycore/Qwen-2.5-7B-R1-Stock", + "name": "Qwen-2.5-7B-R1-Stock", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7573, + "BBH": 0.5393, + "MATH Level 5": 0.5008, + "GPQA": 0.2995, + "MUSR": 0.3994, + "MMLU-PRO": 0.4294 + } + }, + { + "model_id": "bunnycore/Qwen-2.5-7B-Stock-Deep-Bespoke", + "name": "Qwen-2.5-7B-Stock-Deep-Bespoke", + "developer": "bunnycore", + "scores": { + "IFEval": 0.5206, + "BBH": 0.492, + "MATH Level 5": 0.1888, + "GPQA": 0.281, + "MUSR": 0.4068, + "MMLU-PRO": 0.358 + } + }, + { + "model_id": "bunnycore/Qwen-2.5-7b-S1k", + "name": "Qwen-2.5-7b-S1k", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7162, + "BBH": 0.5563, + "MATH Level 5": 0.4781, + "GPQA": 0.2844, + "MUSR": 0.4071, + "MMLU-PRO": 0.4382 + } + }, + { + "model_id": "bunnycore/Qwen2.5-1.5B-Model-Stock", + "name": "Qwen2.5-1.5B-Model-Stock", + "developer": "bunnycore", + "scores": { + "IFEval": 0.1829, + "BBH": 0.2874, + "MATH Level 5": 0.0, + "GPQA": 0.2592, + "MUSR": 0.3674, + "MMLU-PRO": 0.11 + } + }, + { + "model_id": "bunnycore/Qwen2.5-3B-Model-Stock", + "name": "Qwen2.5-3B-Model-Stock", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6381, + "BBH": 0.4712, + "MATH Level 5": 0.3799, + "GPQA": 0.2886, + "MUSR": 0.3942, + "MMLU-PRO": 0.325 + } + }, + { + "model_id": "bunnycore/Qwen2.5-3B-Model-Stock-v2", + "name": "Qwen2.5-3B-Model-Stock-v2", + "developer": "bunnycore", + "scores": { + "IFEval": 0.649, + "BBH": 0.4677, + "MATH Level 5": 0.3867, + "GPQA": 0.2869, + "MUSR": 0.3915, + "MMLU-PRO": 0.327 + } + }, + { + "model_id": "bunnycore/Qwen2.5-3B-Model-Stock-v3.1", + "name": "Qwen2.5-3B-Model-Stock-v3.1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6481, + "BBH": 0.4737, + "MATH Level 5": 0.3897, + "GPQA": 0.2844, + "MUSR": 0.3968, + "MMLU-PRO": 0.329 + } + }, + { + "model_id": "bunnycore/Qwen2.5-3B-Model-Stock-v3.2", + "name": "Qwen2.5-3B-Model-Stock-v3.2", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6353, + "BBH": 0.4727, + "MATH Level 5": 0.3754, + "GPQA": 0.2836, + "MUSR": 0.3928, + "MMLU-PRO": 0.3294 + } + }, + { + "model_id": "bunnycore/Qwen2.5-3B-Model-Stock-v4.1", + "name": "Qwen2.5-3B-Model-Stock-v4.1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6381, + "BBH": 0.482, + "MATH Level 5": 0.3769, + "GPQA": 0.2794, + "MUSR": 0.3941, + "MMLU-PRO": 0.3387 + } + }, + { + "model_id": "bunnycore/Qwen2.5-3B-RP-Mix", + "name": "Qwen2.5-3B-RP-Mix", + "developer": "bunnycore", + "scores": { + "IFEval": 0.5721, + "BBH": 0.4894, + "MATH Level 5": 0.2153, + "GPQA": 0.2735, + "MUSR": 0.4284, + "MMLU-PRO": 0.3728 + } + }, + { + "model_id": "bunnycore/Qwen2.5-3B-RP-Thinker", + "name": "Qwen2.5-3B-RP-Thinker", + "developer": "bunnycore", + "scores": { + "IFEval": 0.5894, + "BBH": 0.4164, + "MATH Level 5": 0.3353, + "GPQA": 0.2643, + "MUSR": 0.3287, + "MMLU-PRO": 0.315 + } + }, + { + "model_id": "bunnycore/Qwen2.5-3B-RP-Thinker-V2", + "name": "Qwen2.5-3B-RP-Thinker-V2", + "developer": "bunnycore", + "scores": { + "IFEval": 0.642, + "BBH": 0.4678, + "MATH Level 5": 0.3829, + "GPQA": 0.2852, + "MUSR": 0.3981, + "MMLU-PRO": 0.3271 + } + }, + { + "model_id": "bunnycore/Qwen2.5-7B-CyberRombos", + "name": "Qwen2.5-7B-CyberRombos", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7518, + "BBH": 0.5465, + "MATH Level 5": 0.4962, + "GPQA": 0.3045, + "MUSR": 0.4125, + "MMLU-PRO": 0.4391 + } + }, + { + "model_id": "bunnycore/Qwen2.5-7B-Fuse-Exp", + "name": "Qwen2.5-7B-Fuse-Exp", + "developer": "bunnycore", + "scores": { + "IFEval": 0.5469, + "BBH": 0.5109, + "MATH Level 5": 0.3142, + "GPQA": 0.276, + "MUSR": 0.4573, + "MMLU-PRO": 0.3309 + } + }, + { + "model_id": "bunnycore/Qwen2.5-7B-Instruct-Fusion", + "name": "Qwen2.5-7B-Instruct-Fusion", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6962, + "BBH": 0.5492, + "MATH Level 5": 0.3406, + "GPQA": 0.3045, + "MUSR": 0.4297, + "MMLU-PRO": 0.4467 + } + }, + { + "model_id": "bunnycore/Qwen2.5-7B-Instruct-Merge-Stock-v0.1", + "name": "Qwen2.5-7B-Instruct-Merge-Stock-v0.1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7509, + "BBH": 0.5529, + "MATH Level 5": 0.4894, + "GPQA": 0.3037, + "MUSR": 0.4231, + "MMLU-PRO": 0.4383 + } + }, + { + "model_id": "bunnycore/Qwen2.5-7B-MixStock-Sce-V0.3", + "name": "Qwen2.5-7B-MixStock-Sce-V0.3", + "developer": "bunnycore", + "scores": { + "IFEval": 0.212, + "BBH": 0.3479, + "MATH Level 5": 0.2576, + "GPQA": 0.2576, + "MUSR": 0.3714, + "MMLU-PRO": 0.1779 + } + }, + { + "model_id": "bunnycore/Qwen2.5-7B-MixStock-V0.1", + "name": "Qwen2.5-7B-MixStock-V0.1", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7673, + "BBH": 0.5479, + "MATH Level 5": 0.3172, + "GPQA": 0.3003, + "MUSR": 0.4416, + "MMLU-PRO": 0.4256 + } + }, + { + "model_id": "bunnycore/Qwen2.5-7B-R1-Bespoke-Stock", + "name": "Qwen2.5-7B-R1-Bespoke-Stock", + "developer": "bunnycore", + "scores": { + "IFEval": 0.3726, + "BBH": 0.4822, + "MATH Level 5": 0.2047, + "GPQA": 0.2785, + "MUSR": 0.3926, + "MMLU-PRO": 0.3472 + } + }, + { + "model_id": "bunnycore/Qwen2.5-7B-R1-Bespoke-Task", + "name": "Qwen2.5-7B-R1-Bespoke-Task", + "developer": "bunnycore", + "scores": { + "IFEval": 0.3787, + "BBH": 0.415, + "MATH Level 5": 0.1782, + "GPQA": 0.2534, + "MUSR": 0.3569, + "MMLU-PRO": 0.2688 + } + }, + { + "model_id": "bunnycore/Qwen2.5-7B-RRP-1M", + "name": "Qwen2.5-7B-RRP-1M", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7481, + "BBH": 0.5452, + "MATH Level 5": 0.3248, + "GPQA": 0.3029, + "MUSR": 0.4483, + "MMLU-PRO": 0.4266 + } + }, + { + "model_id": "bunnycore/Qwen2.5-7B-RRP-1M-Thinker", + "name": "Qwen2.5-7B-RRP-1M-Thinker", + "developer": "bunnycore", + "scores": { + "IFEval": 0.2308, + "BBH": 0.3482, + "MATH Level 5": 0.2719, + "GPQA": 0.2576, + "MUSR": 0.3767, + "MMLU-PRO": 0.1769 + } + }, + { + "model_id": "bunnycore/Qwen2.5-7B-RRP-ID", + "name": "Qwen2.5-7B-RRP-ID", + "developer": "bunnycore", + "scores": { + "IFEval": 0.7473, + "BBH": 0.548, + "MATH Level 5": 0.4864, + "GPQA": 0.2827, + "MUSR": 0.418, + "MMLU-PRO": 0.4387 + } + }, + { + "model_id": "bunnycore/Qwen2.5-7B-Sky-R1-Mini", + "name": "Qwen2.5-7B-Sky-R1-Mini", + "developer": "bunnycore", + "scores": { + "IFEval": 0.2305, + "BBH": 0.3503, + "MATH Level 5": 0.0295, + "GPQA": 0.2894, + "MUSR": 0.3448, + "MMLU-PRO": 0.1253 + } + }, + { + "model_id": "bunnycore/QwenMosaic-7B", + "name": "QwenMosaic-7B", + "developer": "bunnycore", + "scores": { + "IFEval": 0.5819, + "BBH": 0.5564, + "MATH Level 5": 0.4441, + "GPQA": 0.2609, + "MUSR": 0.4164, + "MMLU-PRO": 0.431 + } + }, + { + "model_id": "bunnycore/Smol-Llama-3.2-3B", + "name": "Smol-Llama-3.2-3B", + "developer": "bunnycore", + "scores": { + "IFEval": 0.6679, + "BBH": 0.4539, + "MATH Level 5": 0.1382, + "GPQA": 0.2768, + "MUSR": 0.346, + "MMLU-PRO": 0.3228 + } + }, + { + "model_id": "bunnycore/SmolLM2-1.7-Persona", + "name": "SmolLM2-1.7-Persona", + "developer": "bunnycore", + "scores": { + "IFEval": 0.5465, + "BBH": 0.3623, + "MATH Level 5": 0.0566, + "GPQA": 0.2634, + "MUSR": 0.3341, + "MMLU-PRO": 0.1974 + } + }, + { + "model_id": "bunnycore/SmolLM2-1.7B-roleplay-lora", + "name": "SmolLM2-1.7B-roleplay-lora", + "developer": "bunnycore", + "scores": { + "IFEval": 0.5382, + "BBH": 0.361, + "MATH Level 5": 0.0529, + "GPQA": 0.2752, + "MUSR": 0.3395, + "MMLU-PRO": 0.1966 + } + }, + { + "model_id": "bunnycore/Tulu-3.1-8B-SuperNova", + "name": "Tulu-3.1-8B-SuperNova", + "developer": "bunnycore", + "scores": { + "IFEval": 0.8194, + "BBH": 0.5254, + "MATH Level 5": 0.2462, + "GPQA": 0.302, + "MUSR": 0.3935, + "MMLU-PRO": 0.3814 + } + }, + { + "model_id": "byroneverson/Mistral-Small-Instruct-2409-abliterated", + "name": "Mistral-Small-Instruct-2409-abliterated", + "developer": "byroneverson", + "scores": { + "IFEval": 0.6971, + "BBH": 0.5238, + "MATH Level 5": 0.2477, + "GPQA": 0.3331, + "MUSR": 0.3697, + "MMLU-PRO": 0.3923 + } + }, + { + "model_id": "byroneverson/Yi-1.5-9B-Chat-16K-abliterated", + "name": "Yi-1.5-9B-Chat-16K-abliterated", + "developer": "byroneverson", + "scores": { + "IFEval": 0.5528, + "BBH": 0.5282, + "MATH Level 5": 0.1412, + "GPQA": 0.3129, + "MUSR": 0.4734, + "MMLU-PRO": 0.3823 + } + }, + { + "model_id": "byroneverson/Yi-1.5-9B-Chat-abliterated", + "name": "Yi-1.5-9B-Chat-abliterated", + "developer": "byroneverson", + "scores": { + "IFEval": 0.5723, + "BBH": 0.5401, + "MATH Level 5": 0.1662, + "GPQA": 0.2919, + "MUSR": 0.4389, + "MMLU-PRO": 0.3715 + } + }, + { + "model_id": "c10x/Q-Pluse", + "name": "Q-Pluse", + "developer": "c10x", + "scores": { + "IFEval": 0.1123, + "BBH": 0.2875, + "MATH Level 5": 0.0, + "GPQA": 0.2466, + "MUSR": 0.3938, + "MMLU-PRO": 0.1135 + } + }, + { + "model_id": "c10x/longthinker", + "name": "longthinker", + "developer": "c10x", + "scores": { + "IFEval": 0.3609, + "BBH": 0.4927, + "MATH Level 5": 0.2319, + "GPQA": 0.2643, + "MUSR": 0.391, + "MMLU-PRO": 0.3527 + } + }, + { + "model_id": "carsenk/flippa-v6", + "name": "flippa-v6", + "developer": "carsenk", + "scores": { + "IFEval": 0.3439, + "BBH": 0.5047, + "MATH Level 5": 0.1405, + "GPQA": 0.2928, + "MUSR": 0.4089, + "MMLU-PRO": 0.3668 + } + }, + { + "model_id": "carsenk/phi3.5_mini_exp_825_uncensored", + "name": "phi3.5_mini_exp_825_uncensored", + "developer": "carsenk", + "scores": { + "IFEval": 0.1364, + "BBH": 0.2965, + "MATH Level 5": 0.0106, + "GPQA": 0.2492, + "MUSR": 0.3644, + "MMLU-PRO": 0.1175 + } + }, + { + "model_id": "cat-searcher/gemma-2-9b-it-sppo-iter-1", + "name": "gemma-2-9b-it-sppo-iter-1", + "developer": "cat-searcher", + "scores": { + "IFEval": 0.3015, + "BBH": 0.5972, + "MATH Level 5": 0.0831, + "GPQA": 0.3448, + "MUSR": 0.3927, + "MMLU-PRO": 0.3854 + } + }, + { + "model_id": "cat-searcher/gemma-2-9b-it-sppo-iter-1-evol-1", + "name": "gemma-2-9b-it-sppo-iter-1-evol-1", + "developer": "cat-searcher", + "scores": { + "IFEval": 0.2942, + "BBH": 0.5939, + "MATH Level 5": 0.0853, + "GPQA": 0.3406, + "MUSR": 0.3926, + "MMLU-PRO": 0.38 + } + }, + { + "model_id": "cckm/tinymistral_950m", + "name": "tinymistral_950m", + "developer": "cckm", + "scores": { + "IFEval": 0.2395, + "BBH": 0.2969, + "MATH Level 5": 0.0053, + "GPQA": 0.2601, + "MUSR": 0.3554, + "MMLU-PRO": 0.1096 + } + }, + { + "model_id": "cgato/TheSalt-L3-8b-v0.3.2", + "name": "TheSalt-L3-8b-v0.3.2", + "developer": "cgato", + "scores": { + "IFEval": 0.2705, + "BBH": 0.2968, + "MATH Level 5": 0.0476, + "GPQA": 0.2659, + "MUSR": 0.3896, + "MMLU-PRO": 0.1139 + } + }, + { + "model_id": "chargoddard/prometheus-2-llama-3-8b", + "name": "prometheus-2-llama-3-8b", + "developer": "chargoddard", + "scores": { + "IFEval": 0.5289, + "BBH": 0.4931, + "MATH Level 5": 0.0823, + "GPQA": 0.2727, + "MUSR": 0.3396, + "MMLU-PRO": 0.3087 + } + }, + { + "model_id": "chujiezheng/Llama-3-Instruct-8B-SimPO-ExPO", + "name": "Llama-3-Instruct-8B-SimPO-ExPO", + "developer": "chujiezheng", + "scores": { + "IFEval": 0.6434, + "BBH": 0.4765, + "MATH Level 5": 0.0702, + "GPQA": 0.2869, + "MUSR": 0.392, + "MMLU-PRO": 0.3401 + } + }, + { + "model_id": "chujiezheng/Mistral7B-PairRM-SPPO-ExPO", + "name": "Mistral7B-PairRM-SPPO-ExPO", + "developer": "chujiezheng", + "scores": { + "IFEval": 0.3673, + "BBH": 0.3882, + "MATH Level 5": 0.0181, + "GPQA": 0.2768, + "MUSR": 0.4055, + "MMLU-PRO": 0.2552 + } + }, + { + "model_id": "cjvt/GaMS-1B", + "name": "GaMS-1B", + "developer": "cjvt", + "scores": { + "IFEval": 0.1635, + "BBH": 0.3075, + "MATH Level 5": 0.0136, + "GPQA": 0.2584, + "MUSR": 0.3684, + "MMLU-PRO": 0.1149 + } + }, + { + "model_id": "cloudyu/Llama-3-70Bx2-MOE", + "name": "Llama-3-70Bx2-MOE", + "developer": "cloudyu", + "scores": { + "IFEval": 0.5482, + "BBH": 0.6636, + "MATH Level 5": 0.2175, + "GPQA": 0.3935, + "MUSR": 0.4812, + "MMLU-PRO": 0.5142 + } + }, + { + "model_id": "cloudyu/Llama-3.2-3Bx4", + "name": "Llama-3.2-3Bx4", + "developer": "cloudyu", + "scores": { + "IFEval": 0.5069, + "BBH": 0.4332, + "MATH Level 5": 0.1073, + "GPQA": 0.2777, + "MUSR": 0.3496, + "MMLU-PRO": 0.2985 + } + }, + { + "model_id": "cloudyu/Mixtral_11Bx2_MoE_19B", + "name": "Mixtral_11Bx2_MoE_19B", + "developer": "cloudyu", + "scores": { + "IFEval": 0.3851, + "BBH": 0.5209, + "MATH Level 5": 0.0672, + "GPQA": 0.2903, + "MUSR": 0.4297, + "MMLU-PRO": 0.3311 + } + }, + { + "model_id": "cloudyu/Mixtral_34Bx2_MoE_60B", + "name": "Mixtral_34Bx2_MoE_60B", + "developer": "cloudyu", + "scores": { + "IFEval": 0.4538, + "BBH": 0.587, + "MATH Level 5": 0.077, + "GPQA": 0.3381, + "MUSR": 0.4625, + "MMLU-PRO": 0.4766 + } + }, + { + "model_id": "cloudyu/Mixtral_7Bx2_MoE", + "name": "Mixtral_7Bx2_MoE", + "developer": "cloudyu", + "scores": { + "IFEval": 0.448, + "BBH": 0.516, + "MATH Level 5": 0.0687, + "GPQA": 0.3054, + "MUSR": 0.4473, + "MMLU-PRO": 0.3044 + } + }, + { + "model_id": "cloudyu/S1-Llama-3.2-3Bx4-MoE", + "name": "S1-Llama-3.2-3Bx4-MoE", + "developer": "cloudyu", + "scores": { + "IFEval": 0.5302, + "BBH": 0.4358, + "MATH Level 5": 0.1201, + "GPQA": 0.2936, + "MUSR": 0.3456, + "MMLU-PRO": 0.3044 + } + }, + { + "model_id": "cloudyu/Yi-34Bx2-MoE-60B-DPO", + "name": "Yi-34Bx2-MoE-60B-DPO", + "developer": "cloudyu", + "scores": { + "IFEval": 0.5319, + "BBH": 0.5168, + "MATH Level 5": 0.0702, + "GPQA": 0.3221, + "MUSR": 0.4375, + "MMLU-PRO": 0.4677 + } + }, + { + "model_id": "cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-ipo", + "name": "Llama-3.1-8B-paraphrase-type-generation-apty-ipo", + "developer": "cluebbers", + "scores": { + "IFEval": 0.1327, + "BBH": 0.38, + "MATH Level 5": 0.0249, + "GPQA": 0.2634, + "MUSR": 0.4332, + "MMLU-PRO": 0.2591 + } + }, + { + "model_id": "cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid", + "name": "Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid", + "developer": "cluebbers", + "scores": { + "IFEval": 0.1318, + "BBH": 0.3789, + "MATH Level 5": 0.0264, + "GPQA": 0.2685, + "MUSR": 0.4306, + "MMLU-PRO": 0.2562 + } + }, + { + "model_id": "cluebbers/Llama-3.1-8B-paraphrase-type-generation-etpc", + "name": "Llama-3.1-8B-paraphrase-type-generation-etpc", + "developer": "cluebbers", + "scores": { + "IFEval": 0.1209, + "BBH": 0.3781, + "MATH Level 5": 0.0196, + "GPQA": 0.2651, + "MUSR": 0.4319, + "MMLU-PRO": 0.2556 + } + }, + { + "model_id": "cognitivecomputations/Dolphin3.0-Llama3.1-8B", + "name": "Dolphin3.0-Llama3.1-8B", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.7621, + "BBH": 0.4916, + "MATH Level 5": 0.1231, + "GPQA": 0.2827, + "MUSR": 0.3653, + "MMLU-PRO": 0.2992 + } + }, + { + "model_id": "cognitivecomputations/Dolphin3.0-Llama3.2-1B", + "name": "Dolphin3.0-Llama3.2-1B", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.5428, + "BBH": 0.3122, + "MATH Level 5": 0.0279, + "GPQA": 0.2299, + "MUSR": 0.3249, + "MMLU-PRO": 0.1375 + } + }, + { + "model_id": "cognitivecomputations/Dolphin3.0-Qwen2.5-0.5B", + "name": "Dolphin3.0-Qwen2.5-0.5B", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.4697, + "BBH": 0.3114, + "MATH Level 5": 0.0514, + "GPQA": 0.2349, + "MUSR": 0.3555, + "MMLU-PRO": 0.1413 + } + }, + { + "model_id": "cognitivecomputations/Dolphin3.0-R1-Mistral-24B", + "name": "Dolphin3.0-R1-Mistral-24B", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.4068, + "BBH": 0.536, + "MATH Level 5": 0.3119, + "GPQA": 0.2945, + "MUSR": 0.3952, + "MMLU-PRO": 0.3005 + } + }, + { + "model_id": "cognitivecomputations/dolphin-2.9-llama3-8b", + "name": "dolphin-2.9-llama3-8b", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.385, + "BBH": 0.495, + "MATH Level 5": 0.0574, + "GPQA": 0.2869, + "MUSR": 0.4375, + "MMLU-PRO": 0.2771 + } + }, + { + "model_id": "cognitivecomputations/dolphin-2.9.1-llama-3-70b", + "name": "dolphin-2.9.1-llama-3-70b", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.376, + "BBH": 0.5205, + "MATH Level 5": 0.182, + "GPQA": 0.3087, + "MUSR": 0.4976, + "MMLU-PRO": 0.413 + } + }, + { + "model_id": "cognitivecomputations/dolphin-2.9.1-yi-1.5-34b", + "name": "dolphin-2.9.1-yi-1.5-34b", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.3853, + "BBH": 0.6076, + "MATH Level 5": 0.1866, + "GPQA": 0.3431, + "MUSR": 0.4598, + "MMLU-PRO": 0.4519 + } + }, + { + "model_id": "cognitivecomputations/dolphin-2.9.1-yi-1.5-9b", + "name": "dolphin-2.9.1-yi-1.5-9b", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.4465, + "BBH": 0.5484, + "MATH Level 5": 0.1518, + "GPQA": 0.3381, + "MUSR": 0.4348, + "MMLU-PRO": 0.3967 + } + }, + { + "model_id": "cognitivecomputations/dolphin-2.9.2-Phi-3-Medium", + "name": "dolphin-2.9.2-Phi-3-Medium", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.4248, + "BBH": 0.6457, + "MATH Level 5": 0.1828, + "GPQA": 0.3272, + "MUSR": 0.4191, + "MMLU-PRO": 0.4555 + } + }, + { + "model_id": "cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated", + "name": "dolphin-2.9.2-Phi-3-Medium-abliterated", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.3613, + "BBH": 0.6123, + "MATH Level 5": 0.1239, + "GPQA": 0.328, + "MUSR": 0.4112, + "MMLU-PRO": 0.4494 + } + }, + { + "model_id": "cognitivecomputations/dolphin-2.9.2-qwen2-72b", + "name": "dolphin-2.9.2-qwen2-72b", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.6344, + "BBH": 0.6296, + "MATH Level 5": 0.2802, + "GPQA": 0.37, + "MUSR": 0.4521, + "MMLU-PRO": 0.5471 + } + }, + { + "model_id": "cognitivecomputations/dolphin-2.9.2-qwen2-7b", + "name": "dolphin-2.9.2-qwen2-7b", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.3535, + "BBH": 0.4894, + "MATH Level 5": 0.1344, + "GPQA": 0.2903, + "MUSR": 0.4191, + "MMLU-PRO": 0.4051 + } + }, + { + "model_id": "cognitivecomputations/dolphin-2.9.3-Yi-1.5-34B-32k", + "name": "dolphin-2.9.3-Yi-1.5-34B-32k", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.3639, + "BBH": 0.6047, + "MATH Level 5": 0.1669, + "GPQA": 0.3431, + "MUSR": 0.4311, + "MMLU-PRO": 0.463 + } + }, + { + "model_id": "cognitivecomputations/dolphin-2.9.3-mistral-7B-32k", + "name": "dolphin-2.9.3-mistral-7B-32k", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.4126, + "BBH": 0.4813, + "MATH Level 5": 0.0506, + "GPQA": 0.2852, + "MUSR": 0.4643, + "MMLU-PRO": 0.2821 + } + }, + { + "model_id": "cognitivecomputations/dolphin-2.9.3-mistral-nemo-12b", + "name": "dolphin-2.9.3-mistral-nemo-12b", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.5601, + "BBH": 0.548, + "MATH Level 5": 0.074, + "GPQA": 0.3154, + "MUSR": 0.443, + "MMLU-PRO": 0.3377 + } + }, + { + "model_id": "cognitivecomputations/dolphin-2.9.4-gemma2-2b", + "name": "dolphin-2.9.4-gemma2-2b", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.0896, + "BBH": 0.4081, + "MATH Level 5": 0.0491, + "GPQA": 0.2844, + "MUSR": 0.418, + "MMLU-PRO": 0.2105 + } + }, + { + "model_id": "cognitivecomputations/dolphin-2.9.4-llama3.1-8b", + "name": "dolphin-2.9.4-llama3.1-8b", + "developer": "cognitivecomputations", + "scores": { + "IFEval": 0.2757, + "BBH": 0.3524, + "MATH Level 5": 0.0121, + "GPQA": 0.2634, + "MUSR": 0.3236, + "MMLU-PRO": 0.1237 + } + }, + { + "model_id": "collaiborateorg/Collaiborator-MEDLLM-Llama-3-8B-v2", + "name": "Collaiborator-MEDLLM-Llama-3-8B-v2", + "developer": "collaiborateorg", + "scores": { + "IFEval": 0.3809, + "BBH": 0.4648, + "MATH Level 5": 0.0566, + "GPQA": 0.3331, + "MUSR": 0.3434, + "MMLU-PRO": 0.3481 + } + }, + { + "model_id": "cpayne1303/cp2024", + "name": "cp2024", + "developer": "cpayne1303", + "scores": { + "IFEval": 0.1658, + "BBH": 0.2985, + "MATH Level 5": 0.0053, + "GPQA": 0.2559, + "MUSR": 0.3383, + "MMLU-PRO": 0.1101 + } + }, + { + "model_id": "cpayne1303/cp2024-instruct", + "name": "cp2024-instruct", + "developer": "cpayne1303", + "scores": { + "IFEval": 0.1706, + "BBH": 0.2947, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.3686, + "MMLU-PRO": 0.1167 + } + }, + { + "model_id": "cpayne1303/llama-43m-beta", + "name": "llama-43m-beta", + "developer": "cpayne1303", + "scores": { + "IFEval": 0.1949, + "BBH": 0.2965, + "MATH Level 5": 0.0045, + "GPQA": 0.2685, + "MUSR": 0.3885, + "MMLU-PRO": 0.1111 + } + }, + { + "model_id": "cpayne1303/smallcp2024", + "name": "smallcp2024", + "developer": "cpayne1303", + "scores": { + "IFEval": 0.1582, + "BBH": 0.3027, + "MATH Level 5": 0.0053, + "GPQA": 0.2307, + "MUSR": 0.3425, + "MMLU-PRO": 0.1114 + } + }, + { + "model_id": "crestf411/MN-Slush", + "name": "MN-Slush", + "developer": "crestf411", + "scores": { + "IFEval": 0.4077, + "BBH": 0.534, + "MATH Level 5": 0.1269, + "GPQA": 0.3238, + "MUSR": 0.3933, + "MMLU-PRO": 0.3508 + } + }, + { + "model_id": "cstr/llama3.1-8b-spaetzle-v90", + "name": "llama3.1-8b-spaetzle-v90", + "developer": "cstr", + "scores": { + "IFEval": 0.7356, + "BBH": 0.5303, + "MATH Level 5": 0.1495, + "GPQA": 0.2827, + "MUSR": 0.4134, + "MMLU-PRO": 0.3731 + } + }, + { + "model_id": "cyberagent/calm3-22b-chat", + "name": "calm3-22b-chat", + "developer": "cyberagent", + "scores": { + "IFEval": 0.5091, + "BBH": 0.4992, + "MATH Level 5": 0.0695, + "GPQA": 0.2768, + "MUSR": 0.4553, + "MMLU-PRO": 0.295 + } + }, + { + "model_id": "darkc0de/BuddyGlassNeverSleeps", + "name": "BuddyGlassNeverSleeps", + "developer": "darkc0de", + "scores": { + "IFEval": 0.4239, + "BBH": 0.4977, + "MATH Level 5": 0.0627, + "GPQA": 0.2945, + "MUSR": 0.3993, + "MMLU-PRO": 0.3452 + } + }, + { + "model_id": "darkc0de/BuddyGlassUncensored2025.2", + "name": "BuddyGlassUncensored2025.2", + "developer": "darkc0de", + "scores": { + "IFEval": 0.7731, + "BBH": 0.6095, + "MATH Level 5": 0.2402, + "GPQA": 0.328, + "MUSR": 0.4071, + "MMLU-PRO": 0.4336 + } + }, + { + "model_id": "darkc0de/BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp", + "name": "BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp", + "developer": "darkc0de", + "scores": { + "IFEval": 0.4358, + "BBH": 0.5243, + "MATH Level 5": 0.1284, + "GPQA": 0.2987, + "MUSR": 0.4143, + "MMLU-PRO": 0.3673 + } + }, + { + "model_id": "databricks/dbrx-base", + "name": "dbrx-base", + "developer": "databricks", + "scores": { + "IFEval": 0.0821, + "BBH": 0.5196, + "MATH Level 5": 0.1, + "GPQA": 0.3267, + "MUSR": 0.4067, + "MMLU-PRO": 0.35 + } + }, + { + "model_id": "databricks/dbrx-instruct", + "name": "DBRX Instruct", + "developer": "databricks", + "scores": { + "IFEval": 0.5416, + "BBH": 0.5429, + "MATH Level 5": 0.0687, + "GPQA": 0.3414, + "MUSR": 0.4269, + "MMLU-PRO": 0.3683 + } + }, + { + "model_id": "databricks/dolly-v1-6b", + "name": "dolly-v1-6b", + "developer": "databricks", + "scores": { + "IFEval": 0.2224, + "BBH": 0.3172, + "MATH Level 5": 0.0189, + "GPQA": 0.2643, + "MUSR": 0.4004, + "MMLU-PRO": 0.1266 + } + }, + { + "model_id": "databricks/dolly-v2-12b", + "name": "dolly-v2-12b", + "developer": "databricks", + "scores": { + "IFEval": 0.2355, + "BBH": 0.332, + "MATH Level 5": 0.0136, + "GPQA": 0.2408, + "MUSR": 0.3739, + "MMLU-PRO": 0.1129 + } + }, + { + "model_id": "databricks/dolly-v2-3b", + "name": "dolly-v2-3b", + "developer": "databricks", + "scores": { + "IFEval": 0.2247, + "BBH": 0.3079, + "MATH Level 5": 0.0151, + "GPQA": 0.2609, + "MUSR": 0.3338, + "MMLU-PRO": 0.1145 + } + }, + { + "model_id": "databricks/dolly-v2-7b", + "name": "dolly-v2-7b", + "developer": "databricks", + "scores": { + "IFEval": 0.201, + "BBH": 0.3173, + "MATH Level 5": 0.0144, + "GPQA": 0.2685, + "MUSR": 0.3553, + "MMLU-PRO": 0.1149 + } + }, + { + "model_id": "davidkim205/Rhea-72b-v0.5", + "name": "Rhea-72b-v0.5", + "developer": "davidkim205", + "scores": { + "IFEval": 0.0145, + "BBH": 0.3078, + "MATH Level 5": 0.1737, + "GPQA": 0.2525, + "MUSR": 0.4241, + "MMLU-PRO": 0.1166 + } + }, + { + "model_id": "davidkim205/nox-solar-10.7b-v4", + "name": "nox-solar-10.7b-v4", + "developer": "davidkim205", + "scores": { + "IFEval": 0.3753, + "BBH": 0.4814, + "MATH Level 5": 0.0083, + "GPQA": 0.307, + "MUSR": 0.4298, + "MMLU-PRO": 0.3333 + } + }, + { + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", + "name": "DeepSeek-R1-Distill-Llama-70B", + "developer": "deepseek-ai", + "scores": { + "IFEval": 0.4336, + "BBH": 0.5635, + "MATH Level 5": 0.3074, + "GPQA": 0.2651, + "MUSR": 0.4342, + "MMLU-PRO": 0.4748 + } + }, + { + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "name": "DeepSeek-R1-Distill-Llama-8B", + "developer": "deepseek-ai", + "scores": { + "IFEval": 0.3782, + "BBH": 0.3239, + "MATH Level 5": 0.2198, + "GPQA": 0.255, + "MUSR": 0.325, + "MMLU-PRO": 0.2089 + } + }, + { + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "name": "DeepSeek-R1-Distill-Qwen-1.5B", + "developer": "deepseek-ai", + "scores": { + "IFEval": 0.3463, + "BBH": 0.3241, + "MATH Level 5": 0.1692, + "GPQA": 0.2559, + "MUSR": 0.3635, + "MMLU-PRO": 0.1187 + } + }, + { + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", + "name": "DeepSeek-R1-Distill-Qwen-14B", + "developer": "deepseek-ai", + "scores": { + "IFEval": 0.4382, + "BBH": 0.5906, + "MATH Level 5": 0.5702, + "GPQA": 0.3876, + "MUSR": 0.5366, + "MMLU-PRO": 0.4667 + } + }, + { + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "name": "DeepSeek-R1-Distill-Qwen-32B", + "developer": "deepseek-ai", + "scores": { + "IFEval": 0.4186, + "BBH": 0.4197, + "MATH Level 5": 0.1707, + "GPQA": 0.2844, + "MUSR": 0.4526, + "MMLU-PRO": 0.4687 + } + }, + { + "model_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "name": "DeepSeek-R1-Distill-Qwen-7B", + "developer": "deepseek-ai", + "scores": { + "IFEval": 0.4038, + "BBH": 0.3443, + "MATH Level 5": 0.1956, + "GPQA": 0.2794, + "MUSR": 0.3663, + "MMLU-PRO": 0.2321 + } + }, + { + "model_id": "deepseek-ai/deepseek-llm-67b-chat", + "name": "DeepSeek LLM Chat 67B", + "developer": "deepseek-ai", + "scores": { + "IFEval": 0.5587, + "BBH": 0.5243, + "MATH Level 5": 0.0929, + "GPQA": 0.3163, + "MUSR": 0.5059, + "MMLU-PRO": 0.3944 + } + }, + { + "model_id": "deepseek-ai/deepseek-llm-7b-base", + "name": "deepseek-llm-7b-base", + "developer": "deepseek-ai", + "scores": { + "IFEval": 0.2179, + "BBH": 0.3503, + "MATH Level 5": 0.0196, + "GPQA": 0.2735, + "MUSR": 0.3738, + "MMLU-PRO": 0.1806 + } + }, + { + "model_id": "deepseek-ai/deepseek-llm-7b-chat", + "name": "deepseek-llm-7b-chat", + "developer": "deepseek-ai", + "scores": { + "IFEval": 0.4171, + "BBH": 0.3632, + "MATH Level 5": 0.0204, + "GPQA": 0.2659, + "MUSR": 0.4668, + "MMLU-PRO": 0.2133 + } + }, + { + "model_id": "deepseek-ai/deepseek-moe-16b-base", + "name": "deepseek-moe-16b-base", + "developer": "deepseek-ai", + "scores": { + "IFEval": 0.245, + "BBH": 0.3409, + "MATH Level 5": 0.0242, + "GPQA": 0.2542, + "MUSR": 0.3658, + "MMLU-PRO": 0.1505 + } + }, + { + "model_id": "deepseek-ai/deepseek-moe-16b-chat", + "name": "deepseek-moe-16b-chat", + "developer": "deepseek-ai", + "scores": { + "IFEval": 0.3663, + "BBH": 0.3275, + "MATH Level 5": 0.0257, + "GPQA": 0.2248, + "MUSR": 0.3808, + "MMLU-PRO": 0.1964 + } + }, + { + "model_id": "dfurman/CalmeRys-78B-Orpo-v0.1", + "name": "CalmeRys-78B-Orpo-v0.1", + "developer": "dfurman", + "scores": { + "IFEval": 0.8163, + "BBH": 0.7262, + "MATH Level 5": 0.4063, + "GPQA": 0.4002, + "MUSR": 0.5902, + "MMLU-PRO": 0.7012 + } + }, + { + "model_id": "dfurman/Llama-3-70B-Orpo-v0.1", + "name": "Llama-3-70B-Orpo-v0.1", + "developer": "dfurman", + "scores": { + "IFEval": 0.2049, + "BBH": 0.4655, + "MATH Level 5": 0.1579, + "GPQA": 0.2576, + "MUSR": 0.4534, + "MMLU-PRO": 0.3893 + } + }, + { + "model_id": "dfurman/Llama-3-8B-Orpo-v0.1", + "name": "Llama-3-8B-Orpo-v0.1", + "developer": "dfurman", + "scores": { + "IFEval": 0.3, + "BBH": 0.3853, + "MATH Level 5": 0.0415, + "GPQA": 0.2617, + "MUSR": 0.3579, + "MMLU-PRO": 0.2281 + } + }, + { + "model_id": "dfurman/Qwen2-72B-Orpo-v0.1", + "name": "Qwen2-72B-Orpo-v0.1", + "developer": "dfurman", + "scores": { + "IFEval": 0.788, + "BBH": 0.6969, + "MATH Level 5": 0.4056, + "GPQA": 0.3842, + "MUSR": 0.4784, + "MMLU-PRO": 0.5455 + } + }, + { + "model_id": "dicta-il/dictalm2.0", + "name": "dictalm2.0", + "developer": "dicta-il", + "scores": { + "IFEval": 0.2413, + "BBH": 0.4018, + "MATH Level 5": 0.0181, + "GPQA": 0.2919, + "MUSR": 0.382, + "MMLU-PRO": 0.2605 + } + }, + { + "model_id": "dicta-il/dictalm2.0-instruct", + "name": "dictalm2.0-instruct", + "developer": "dicta-il", + "scores": { + "IFEval": 0.4412, + "BBH": 0.4256, + "MATH Level 5": 0.0227, + "GPQA": 0.3029, + "MUSR": 0.3946, + "MMLU-PRO": 0.2605 + } + }, + { + "model_id": "distilbert/distilgpt2", + "name": "distilgpt2", + "developer": "distilbert", + "scores": { + "IFEval": 0.0611, + "BBH": 0.3038, + "MATH Level 5": 0.006, + "GPQA": 0.2592, + "MUSR": 0.4207, + "MMLU-PRO": 0.1187 + } + }, + { + "model_id": "divyanshukunwar/SASTRI_1_9B", + "name": "SASTRI_1_9B", + "developer": "divyanshukunwar", + "scores": { + "IFEval": 0.4207, + "BBH": 0.468, + "MATH Level 5": 0.1156, + "GPQA": 0.3213, + "MUSR": 0.3831, + "MMLU-PRO": 0.3187 + } + }, + { + "model_id": "djuna-test-lab/TEST-L3.2-ReWish-3B", + "name": "TEST-L3.2-ReWish-3B", + "developer": "djuna-test-lab", + "scores": { + "IFEval": 0.6368, + "BBH": 0.4495, + "MATH Level 5": 0.1367, + "GPQA": 0.2836, + "MUSR": 0.3777, + "MMLU-PRO": 0.3126 + } + }, + { + "model_id": "djuna-test-lab/TEST-L3.2-ReWish-3B-ties-w-base", + "name": "TEST-L3.2-ReWish-3B-ties-w-base", + "developer": "djuna-test-lab", + "scores": { + "IFEval": 0.6353, + "BBH": 0.4495, + "MATH Level 5": 0.1367, + "GPQA": 0.2836, + "MUSR": 0.3777, + "MMLU-PRO": 0.3126 + } + }, + { + "model_id": "djuna/G2-BigGSHT-27B-2", + "name": "G2-BigGSHT-27B-2", + "developer": "djuna", + "scores": { + "IFEval": 0.7974, + "BBH": 0.6415, + "MATH Level 5": 0.2349, + "GPQA": 0.3633, + "MUSR": 0.4072, + "MMLU-PRO": 0.4528 + } + }, + { + "model_id": "djuna/G2-GSHT", + "name": "G2-GSHT", + "developer": "djuna", + "scores": { + "IFEval": 0.563, + "BBH": 0.527, + "MATH Level 5": 0.1926, + "GPQA": 0.3255, + "MUSR": 0.4006, + "MMLU-PRO": 0.307 + } + }, + { + "model_id": "djuna/Gemma-2-gemmama-9b", + "name": "Gemma-2-gemmama-9b", + "developer": "djuna", + "scores": { + "IFEval": 0.7703, + "BBH": 0.542, + "MATH Level 5": 0.1926, + "GPQA": 0.3356, + "MUSR": 0.4031, + "MMLU-PRO": 0.3109 + } + }, + { + "model_id": "djuna/L3.1-ForStHS", + "name": "L3.1-ForStHS", + "developer": "djuna", + "scores": { + "IFEval": 0.7813, + "BBH": 0.5203, + "MATH Level 5": 0.1503, + "GPQA": 0.2911, + "MUSR": 0.4026, + "MMLU-PRO": 0.3735 + } + }, + { + "model_id": "djuna/L3.1-Promissum_Mane-8B-Della-1.5-calc", + "name": "L3.1-Promissum_Mane-8B-Della-1.5-calc", + "developer": "djuna", + "scores": { + "IFEval": 0.7235, + "BBH": 0.5433, + "MATH Level 5": 0.1639, + "GPQA": 0.3146, + "MUSR": 0.4253, + "MMLU-PRO": 0.3904 + } + }, + { + "model_id": "djuna/L3.1-Promissum_Mane-8B-Della-calc", + "name": "L3.1-Promissum_Mane-8B-Della-calc", + "developer": "djuna", + "scores": { + "IFEval": 0.5442, + "BBH": 0.5486, + "MATH Level 5": 0.1843, + "GPQA": 0.2995, + "MUSR": 0.423, + "MMLU-PRO": 0.3802 + } + }, + { + "model_id": "djuna/L3.1-Purosani-2-8B", + "name": "L3.1-Purosani-2-8B", + "developer": "djuna", + "scores": { + "IFEval": 0.4988, + "BBH": 0.5182, + "MATH Level 5": 0.1171, + "GPQA": 0.3012, + "MUSR": 0.3816, + "MMLU-PRO": 0.3752 + } + }, + { + "model_id": "djuna/L3.1-Suze-Vume-calc", + "name": "L3.1-Suze-Vume-calc", + "developer": "djuna", + "scores": { + "IFEval": 0.7297, + "BBH": 0.5164, + "MATH Level 5": 0.114, + "GPQA": 0.2819, + "MUSR": 0.3843, + "MMLU-PRO": 0.3515 + } + }, + { + "model_id": "djuna/MN-Chinofun", + "name": "MN-Chinofun", + "developer": "djuna", + "scores": { + "IFEval": 0.611, + "BBH": 0.4953, + "MATH Level 5": 0.1307, + "GPQA": 0.2961, + "MUSR": 0.4084, + "MMLU-PRO": 0.3603 + } + }, + { + "model_id": "djuna/MN-Chinofun-12B-2", + "name": "MN-Chinofun-12B-2", + "developer": "djuna", + "scores": { + "IFEval": 0.6171, + "BBH": 0.5037, + "MATH Level 5": 0.1307, + "GPQA": 0.3054, + "MUSR": 0.4268, + "MMLU-PRO": 0.3615 + } + }, + { + "model_id": "djuna/MN-Chinofun-12B-3", + "name": "MN-Chinofun-12B-3", + "developer": "djuna", + "scores": { + "IFEval": 0.3053, + "BBH": 0.5348, + "MATH Level 5": 0.1005, + "GPQA": 0.2659, + "MUSR": 0.4198, + "MMLU-PRO": 0.3026 + } + }, + { + "model_id": "djuna/MN-Chinofun-12B-4", + "name": "MN-Chinofun-12B-4", + "developer": "djuna", + "scores": { + "IFEval": 0.5404, + "BBH": 0.5348, + "MATH Level 5": 0.1118, + "GPQA": 0.2953, + "MUSR": 0.4307, + "MMLU-PRO": 0.3497 + } + }, + { + "model_id": "djuna/Q2.5-Partron-7B", + "name": "Q2.5-Partron-7B", + "developer": "djuna", + "scores": { + "IFEval": 0.7321, + "BBH": 0.5418, + "MATH Level 5": 0.4826, + "GPQA": 0.2978, + "MUSR": 0.4165, + "MMLU-PRO": 0.4283 + } + }, + { + "model_id": "djuna/Q2.5-Veltha-14B", + "name": "Q2.5-Veltha-14B", + "developer": "djuna", + "scores": { + "IFEval": 0.8292, + "BBH": 0.6484, + "MATH Level 5": 0.4789, + "GPQA": 0.3591, + "MUSR": 0.4194, + "MMLU-PRO": 0.5298 + } + }, + { + "model_id": "djuna/Q2.5-Veltha-14B-0.5", + "name": "Q2.5-Veltha-14B-0.5", + "developer": "djuna", + "scores": { + "IFEval": 0.7796, + "BBH": 0.6523, + "MATH Level 5": 0.4373, + "GPQA": 0.3683, + "MUSR": 0.4339, + "MMLU-PRO": 0.5295 + } + }, + { + "model_id": "dnhkng/RYS-Llama-3-8B-Instruct", + "name": "RYS-Llama-3-8B-Instruct", + "developer": "dnhkng", + "scores": { + "IFEval": 0.6958, + "BBH": 0.4809, + "MATH Level 5": 0.0687, + "GPQA": 0.2576, + "MUSR": 0.3383, + "MMLU-PRO": 0.3557 + } + }, + { + "model_id": "dnhkng/RYS-Llama-3-Huge-Instruct", + "name": "RYS-Llama-3-Huge-Instruct", + "developer": "dnhkng", + "scores": { + "IFEval": 0.7686, + "BBH": 0.6481, + "MATH Level 5": 0.2289, + "GPQA": 0.2609, + "MUSR": 0.4208, + "MMLU-PRO": 0.511 + } + }, + { + "model_id": "dnhkng/RYS-Llama-3-Large-Instruct", + "name": "RYS-Llama-3-Large-Instruct", + "developer": "dnhkng", + "scores": { + "IFEval": 0.8051, + "BBH": 0.6525, + "MATH Level 5": 0.2304, + "GPQA": 0.2894, + "MUSR": 0.418, + "MMLU-PRO": 0.5137 + } + }, + { + "model_id": "dnhkng/RYS-Llama-3.1-8B-Instruct", + "name": "RYS-Llama-3.1-8B-Instruct", + "developer": "dnhkng", + "scores": { + "IFEval": 0.7685, + "BBH": 0.5164, + "MATH Level 5": 0.1329, + "GPQA": 0.2676, + "MUSR": 0.3681, + "MMLU-PRO": 0.3639 + } + }, + { + "model_id": "dnhkng/RYS-Llama3.1-Large", + "name": "RYS-Llama3.1-Large", + "developer": "dnhkng", + "scores": { + "IFEval": 0.8492, + "BBH": 0.6899, + "MATH Level 5": 0.3505, + "GPQA": 0.3742, + "MUSR": 0.4554, + "MMLU-PRO": 0.5249 + } + }, + { + "model_id": "dnhkng/RYS-Medium", + "name": "RYS-Medium", + "developer": "dnhkng", + "scores": { + "IFEval": 0.4406, + "BBH": 0.6285, + "MATH Level 5": 0.108, + "GPQA": 0.328, + "MUSR": 0.4069, + "MMLU-PRO": 0.4326 + } + }, + { + "model_id": "dnhkng/RYS-Phi-3-medium-4k-instruct", + "name": "RYS-Phi-3-medium-4k-instruct", + "developer": "dnhkng", + "scores": { + "IFEval": 0.4391, + "BBH": 0.6226, + "MATH Level 5": 0.1609, + "GPQA": 0.3549, + "MUSR": 0.4253, + "MMLU-PRO": 0.4846 + } + }, + { + "model_id": "dnhkng/RYS-XLarge", + "name": "RYS-XLarge", + "developer": "dnhkng", + "scores": { + "IFEval": 0.7996, + "BBH": 0.705, + "MATH Level 5": 0.4252, + "GPQA": 0.3842, + "MUSR": 0.497, + "MMLU-PRO": 0.5428 + } + }, + { + "model_id": "dnhkng/RYS-XLarge-base", + "name": "RYS-XLarge-base", + "developer": "dnhkng", + "scores": { + "IFEval": 0.791, + "BBH": 0.7047, + "MATH Level 5": 0.3792, + "GPQA": 0.3792, + "MUSR": 0.4903, + "MMLU-PRO": 0.5431 + } + }, + { + "model_id": "dnhkng/RYS-XLarge2", + "name": "RYS-XLarge2", + "developer": "dnhkng", + "scores": { + "IFEval": 0.4902, + "BBH": 0.6574, + "MATH Level 5": 0.2749, + "GPQA": 0.3742, + "MUSR": 0.4508, + "MMLU-PRO": 0.5378 + } + }, + { + "model_id": "dreamgen/WizardLM-2-7B", + "name": "WizardLM-2-7B", + "developer": "dreamgen", + "scores": { + "IFEval": 0.4583, + "BBH": 0.3487, + "MATH Level 5": 0.0332, + "GPQA": 0.2869, + "MUSR": 0.3941, + "MMLU-PRO": 0.266 + } + }, + { + "model_id": "dustinwloring1988/Reflexis-8b-chat-v1", + "name": "Reflexis-8b-chat-v1", + "developer": "dustinwloring1988", + "scores": { + "IFEval": 0.3658, + "BBH": 0.4664, + "MATH Level 5": 0.1156, + "GPQA": 0.2542, + "MUSR": 0.3754, + "MMLU-PRO": 0.3384 + } + }, + { + "model_id": "dustinwloring1988/Reflexis-8b-chat-v2", + "name": "Reflexis-8b-chat-v2", + "developer": "dustinwloring1988", + "scores": { + "IFEval": 0.3912, + "BBH": 0.4724, + "MATH Level 5": 0.1163, + "GPQA": 0.2701, + "MUSR": 0.3526, + "MMLU-PRO": 0.3378 + } + }, + { + "model_id": "dustinwloring1988/Reflexis-8b-chat-v3", + "name": "Reflexis-8b-chat-v3", + "developer": "dustinwloring1988", + "scores": { + "IFEval": 0.5367, + "BBH": 0.4658, + "MATH Level 5": 0.1224, + "GPQA": 0.2424, + "MUSR": 0.3512, + "MMLU-PRO": 0.3548 + } + }, + { + "model_id": "dustinwloring1988/Reflexis-8b-chat-v4", + "name": "Reflexis-8b-chat-v4", + "developer": "dustinwloring1988", + "scores": { + "IFEval": 0.4698, + "BBH": 0.4686, + "MATH Level 5": 0.1027, + "GPQA": 0.2341, + "MUSR": 0.3393, + "MMLU-PRO": 0.339 + } + }, + { + "model_id": "dustinwloring1988/Reflexis-8b-chat-v5", + "name": "Reflexis-8b-chat-v5", + "developer": "dustinwloring1988", + "scores": { + "IFEval": 0.4238, + "BBH": 0.4782, + "MATH Level 5": 0.1216, + "GPQA": 0.271, + "MUSR": 0.3354, + "MMLU-PRO": 0.3217 + } + }, + { + "model_id": "dustinwloring1988/Reflexis-8b-chat-v6", + "name": "Reflexis-8b-chat-v6", + "developer": "dustinwloring1988", + "scores": { + "IFEval": 0.4939, + "BBH": 0.481, + "MATH Level 5": 0.1299, + "GPQA": 0.2626, + "MUSR": 0.3753, + "MMLU-PRO": 0.3479 + } + }, + { + "model_id": "dustinwloring1988/Reflexis-8b-chat-v7", + "name": "Reflexis-8b-chat-v7", + "developer": "dustinwloring1988", + "scores": { + "IFEval": 0.398, + "BBH": 0.481, + "MATH Level 5": 0.1631, + "GPQA": 0.2617, + "MUSR": 0.3222, + "MMLU-PRO": 0.3643 + } + }, + { + "model_id": "duyhv1411/Llama-3.2-1B-en-vi", + "name": "Llama-3.2-1B-en-vi", + "developer": "duyhv1411", + "scores": { + "IFEval": 0.4788, + "BBH": 0.3291, + "MATH Level 5": 0.0287, + "GPQA": 0.2768, + "MUSR": 0.3197, + "MMLU-PRO": 0.1341 + } + }, + { + "model_id": "duyhv1411/Llama-3.2-3B-en-vi", + "name": "Llama-3.2-3B-en-vi", + "developer": "duyhv1411", + "scores": { + "IFEval": 0.4852, + "BBH": 0.3272, + "MATH Level 5": 0.0227, + "GPQA": 0.2752, + "MUSR": 0.321, + "MMLU-PRO": 0.1359 + } + }, + { + "model_id": "dwikitheduck/gemma-2-2b-id", + "name": "gemma-2-2b-id", + "developer": "dwikitheduck", + "scores": { + "IFEval": 0.3879, + "BBH": 0.3962, + "MATH Level 5": 0.0453, + "GPQA": 0.2995, + "MUSR": 0.4154, + "MMLU-PRO": 0.2173 + } + }, + { + "model_id": "dwikitheduck/gemma-2-2b-id-inst", + "name": "gemma-2-2b-id-inst", + "developer": "dwikitheduck", + "scores": { + "IFEval": 0.3879, + "BBH": 0.3962, + "MATH Level 5": 0.0453, + "GPQA": 0.2995, + "MUSR": 0.4154, + "MMLU-PRO": 0.2173 + } + }, + { + "model_id": "dwikitheduck/gemma-2-2b-id-instruct", + "name": "gemma-2-2b-id-instruct", + "developer": "dwikitheduck", + "scores": { + "IFEval": 0.3879, + "BBH": 0.3962, + "MATH Level 5": 0.0453, + "GPQA": 0.2995, + "MUSR": 0.4154, + "MMLU-PRO": 0.2173 + } + }, + { + "model_id": "dwikitheduck/gen-inst-1", + "name": "gen-inst-1", + "developer": "dwikitheduck", + "scores": { + "IFEval": 0.775, + "BBH": 0.642, + "MATH Level 5": 0.4554, + "GPQA": 0.3716, + "MUSR": 0.4205, + "MMLU-PRO": 0.5089 + } + }, + { + "model_id": "dwikitheduck/gen-try1", + "name": "gen-try1", + "developer": "dwikitheduck", + "scores": { + "IFEval": 0.7522, + "BBH": 0.6359, + "MATH Level 5": 0.4101, + "GPQA": 0.3414, + "MUSR": 0.4416, + "MMLU-PRO": 0.5111 + } + }, + { + "model_id": "dwikitheduck/gen-try1-notemp", + "name": "gen-try1-notemp", + "developer": "dwikitheduck", + "scores": { + "IFEval": 0.2627, + "BBH": 0.6263, + "MATH Level 5": 0.318, + "GPQA": 0.354, + "MUSR": 0.4714, + "MMLU-PRO": 0.521 + } + }, + { + "model_id": "dzakwan/dzakwan-MoE-4x7b-Beta", + "name": "dzakwan-MoE-4x7b-Beta", + "developer": "dzakwan", + "scores": { + "IFEval": 0.4443, + "BBH": 0.514, + "MATH Level 5": 0.0778, + "GPQA": 0.2861, + "MUSR": 0.4267, + "MMLU-PRO": 0.3108 + } + }, + { + "model_id": "ehristoforu/Falcon3-8B-Franken-Basestruct", + "name": "Falcon3-8B-Franken-Basestruct", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.1715, + "BBH": 0.5463, + "MATH Level 5": 0.0, + "GPQA": 0.3406, + "MUSR": 0.3555, + "MMLU-PRO": 0.3947 + } + }, + { + "model_id": "ehristoforu/Falcon3-MoE-2x7B-Insruct", + "name": "Falcon3-MoE-2x7B-Insruct", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.7643, + "BBH": 0.5648, + "MATH Level 5": 0.4124, + "GPQA": 0.3121, + "MUSR": 0.484, + "MMLU-PRO": 0.4095 + } + }, + { + "model_id": "ehristoforu/Gemma2-9B-it-psy10k-mental_health", + "name": "Gemma2-9B-it-psy10k-mental_health", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.5887, + "BBH": 0.5539, + "MATH Level 5": 0.1631, + "GPQA": 0.3372, + "MUSR": 0.4086, + "MMLU-PRO": 0.3829 + } + }, + { + "model_id": "ehristoforu/Gemma2-9b-it-train6", + "name": "Gemma2-9b-it-train6", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.7025, + "BBH": 0.5898, + "MATH Level 5": 0.1911, + "GPQA": 0.3289, + "MUSR": 0.4084, + "MMLU-PRO": 0.3942 + } + }, + { + "model_id": "ehristoforu/HappyLlama1", + "name": "HappyLlama1", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.7363, + "BBH": 0.4996, + "MATH Level 5": 0.1427, + "GPQA": 0.2836, + "MUSR": 0.4287, + "MMLU-PRO": 0.3546 + } + }, + { + "model_id": "ehristoforu/QwenQwen2.5-7B-IT", + "name": "QwenQwen2.5-7B-IT", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.7518, + "BBH": 0.5398, + "MATH Level 5": 0.5091, + "GPQA": 0.3037, + "MUSR": 0.4034, + "MMLU-PRO": 0.4289 + } + }, + { + "model_id": "ehristoforu/QwenQwen2.5-7B-IT-Dare", + "name": "QwenQwen2.5-7B-IT-Dare", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.7509, + "BBH": 0.5398, + "MATH Level 5": 0.5091, + "GPQA": 0.3037, + "MUSR": 0.4034, + "MMLU-PRO": 0.4289 + } + }, + { + "model_id": "ehristoforu/RQwen-v0.1", + "name": "RQwen-v0.1", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.7625, + "BBH": 0.6446, + "MATH Level 5": 0.4645, + "GPQA": 0.3255, + "MUSR": 0.4139, + "MMLU-PRO": 0.5202 + } + }, + { + "model_id": "ehristoforu/RQwen-v0.2", + "name": "RQwen-v0.2", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.7504, + "BBH": 0.6427, + "MATH Level 5": 0.327, + "GPQA": 0.3372, + "MUSR": 0.4207, + "MMLU-PRO": 0.5159 + } + }, + { + "model_id": "ehristoforu/SoRu-0009", + "name": "SoRu-0009", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.2582, + "BBH": 0.315, + "MATH Level 5": 0.0211, + "GPQA": 0.2609, + "MUSR": 0.3369, + "MMLU-PRO": 0.1239 + } + }, + { + "model_id": "ehristoforu/coolqwen-3b-it", + "name": "coolqwen-3b-it", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.6473, + "BBH": 0.4851, + "MATH Level 5": 0.3671, + "GPQA": 0.2827, + "MUSR": 0.4125, + "MMLU-PRO": 0.3601 + } + }, + { + "model_id": "ehristoforu/della-70b-test-v1", + "name": "della-70b-test-v1", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.4979, + "BBH": 0.3029, + "MATH Level 5": 0.0098, + "GPQA": 0.2525, + "MUSR": 0.4555, + "MMLU-PRO": 0.1575 + } + }, + { + "model_id": "ehristoforu/falcon3-ultraset", + "name": "falcon3-ultraset", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.7135, + "BBH": 0.5584, + "MATH Level 5": 0.2122, + "GPQA": 0.3322, + "MUSR": 0.4853, + "MMLU-PRO": 0.3982 + } + }, + { + "model_id": "ehristoforu/fd-lora-merged-16x32", + "name": "fd-lora-merged-16x32", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.3481, + "BBH": 0.3308, + "MATH Level 5": 0.1707, + "GPQA": 0.2534, + "MUSR": 0.3514, + "MMLU-PRO": 0.1205 + } + }, + { + "model_id": "ehristoforu/fd-lora-merged-64x128", + "name": "fd-lora-merged-64x128", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.3281, + "BBH": 0.3345, + "MATH Level 5": 0.1873, + "GPQA": 0.255, + "MUSR": 0.3368, + "MMLU-PRO": 0.1537 + } + }, + { + "model_id": "ehristoforu/fp4-14b-it-v1", + "name": "fp4-14b-it-v1", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.2535, + "BBH": 0.574, + "MATH Level 5": 0.0408, + "GPQA": 0.2953, + "MUSR": 0.3595, + "MMLU-PRO": 0.4205 + } + }, + { + "model_id": "ehristoforu/fp4-14b-v1-fix", + "name": "fp4-14b-v1-fix", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.6742, + "BBH": 0.6817, + "MATH Level 5": 0.4207, + "GPQA": 0.354, + "MUSR": 0.4532, + "MMLU-PRO": 0.5353 + } + }, + { + "model_id": "ehristoforu/fq2.5-7b-it-normalize_false", + "name": "fq2.5-7b-it-normalize_false", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.7399, + "BBH": 0.552, + "MATH Level 5": 0.4622, + "GPQA": 0.302, + "MUSR": 0.4612, + "MMLU-PRO": 0.4413 + } + }, + { + "model_id": "ehristoforu/fq2.5-7b-it-normalize_true", + "name": "fq2.5-7b-it-normalize_true", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.7399, + "BBH": 0.552, + "MATH Level 5": 0.4622, + "GPQA": 0.302, + "MUSR": 0.4612, + "MMLU-PRO": 0.4413 + } + }, + { + "model_id": "ehristoforu/frqwen2.5-from7b-duable4layers-it", + "name": "frqwen2.5-from7b-duable4layers-it", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.7729, + "BBH": 0.5264, + "MATH Level 5": 0.4509, + "GPQA": 0.2953, + "MUSR": 0.4166, + "MMLU-PRO": 0.4126 + } + }, + { + "model_id": "ehristoforu/frqwen2.5-from7b-it", + "name": "frqwen2.5-from7b-it", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.6532, + "BBH": 0.5143, + "MATH Level 5": 0.2923, + "GPQA": 0.2903, + "MUSR": 0.4086, + "MMLU-PRO": 0.3977 + } + }, + { + "model_id": "ehristoforu/mllama-3.1-8b-instruct", + "name": "mllama-3.1-8b-instruct", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.3458, + "BBH": 0.4718, + "MATH Level 5": 0.3776, + "GPQA": 0.2701, + "MUSR": 0.338, + "MMLU-PRO": 0.2533 + } + }, + { + "model_id": "ehristoforu/mllama-3.1-8b-it", + "name": "mllama-3.1-8b-it", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.3879, + "BBH": 0.4868, + "MATH Level 5": 0.3799, + "GPQA": 0.2768, + "MUSR": 0.3349, + "MMLU-PRO": 0.2622 + } + }, + { + "model_id": "ehristoforu/moremerge", + "name": "moremerge", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.2019, + "BBH": 0.2868, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.3566, + "MMLU-PRO": 0.1065 + } + }, + { + "model_id": "ehristoforu/moremerge-upscaled", + "name": "moremerge-upscaled", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.1979, + "BBH": 0.2698, + "MATH Level 5": 0.0, + "GPQA": 0.2466, + "MUSR": 0.3593, + "MMLU-PRO": 0.1041 + } + }, + { + "model_id": "ehristoforu/phi-4-25b", + "name": "phi-4-25b", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.6484, + "BBH": 0.6908, + "MATH Level 5": 0.4524, + "GPQA": 0.3188, + "MUSR": 0.4208, + "MMLU-PRO": 0.5351 + } + }, + { + "model_id": "ehristoforu/qwen2.5-test-32b-it", + "name": "qwen2.5-test-32b-it", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.7889, + "BBH": 0.7081, + "MATH Level 5": 0.5974, + "GPQA": 0.3641, + "MUSR": 0.4578, + "MMLU-PRO": 0.5765 + } + }, + { + "model_id": "ehristoforu/qwen2.5-with-lora-think-3b-it", + "name": "qwen2.5-with-lora-think-3b-it", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.5319, + "BBH": 0.4687, + "MATH Level 5": 0.2364, + "GPQA": 0.2802, + "MUSR": 0.431, + "MMLU-PRO": 0.3403 + } + }, + { + "model_id": "ehristoforu/rmoe-v1", + "name": "rmoe-v1", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.265, + "BBH": 0.2929, + "MATH Level 5": 0.0015, + "GPQA": 0.2584, + "MUSR": 0.3663, + "MMLU-PRO": 0.1125 + } + }, + { + "model_id": "ehristoforu/rufalcon3-3b-it", + "name": "rufalcon3-3b-it", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.5942, + "BBH": 0.4155, + "MATH Level 5": 0.1782, + "GPQA": 0.2727, + "MUSR": 0.3895, + "MMLU-PRO": 0.2348 + } + }, + { + "model_id": "ehristoforu/ruphi-4b", + "name": "ruphi-4b", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.1752, + "BBH": 0.2906, + "MATH Level 5": 0.0, + "GPQA": 0.2399, + "MUSR": 0.3512, + "MMLU-PRO": 0.1126 + } + }, + { + "model_id": "ehristoforu/testq-32b", + "name": "testq-32b", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.1876, + "BBH": 0.2877, + "MATH Level 5": 0.003, + "GPQA": 0.2542, + "MUSR": 0.3715, + "MMLU-PRO": 0.1166 + } + }, + { + "model_id": "ehristoforu/tmoe", + "name": "tmoe", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.1193, + "BBH": 0.3073, + "MATH Level 5": 0.0076, + "GPQA": 0.2232, + "MUSR": 0.3699, + "MMLU-PRO": 0.1191 + } + }, + { + "model_id": "ehristoforu/tmoe-v2", + "name": "tmoe-v2", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.1903, + "BBH": 0.2897, + "MATH Level 5": 0.0023, + "GPQA": 0.2634, + "MUSR": 0.4151, + "MMLU-PRO": 0.11 + } + }, + { + "model_id": "ehristoforu/trd-7b-it", + "name": "trd-7b-it", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.2185, + "BBH": 0.299, + "MATH Level 5": 0.0317, + "GPQA": 0.2701, + "MUSR": 0.3794, + "MMLU-PRO": 0.1179 + } + }, + { + "model_id": "ehristoforu/ud-14b", + "name": "ud-14b", + "developer": "ehristoforu", + "scores": { + "IFEval": 0.4235, + "BBH": 0.3324, + "MATH Level 5": 0.1903, + "GPQA": 0.2374, + "MUSR": 0.4394, + "MMLU-PRO": 0.2415 + } + }, + { + "model_id": "elinas/Chronos-Gold-12B-1.0", + "name": "Chronos-Gold-12B-1.0", + "developer": "elinas", + "scores": { + "IFEval": 0.3166, + "BBH": 0.5515, + "MATH Level 5": 0.0695, + "GPQA": 0.318, + "MUSR": 0.474, + "MMLU-PRO": 0.3518 + } + }, + { + "model_id": "ell44ot/gemma-2b-def", + "name": "gemma-2b-def", + "developer": "ell44ot", + "scores": { + "IFEval": 0.2693, + "BBH": 0.3159, + "MATH Level 5": 0.0242, + "GPQA": 0.2735, + "MUSR": 0.367, + "MMLU-PRO": 0.1572 + } + }, + { + "model_id": "euclaise/ReMask-3B", + "name": "ReMask-3B", + "developer": "euclaise", + "scores": { + "IFEval": 0.2419, + "BBH": 0.3517, + "MATH Level 5": 0.0196, + "GPQA": 0.2668, + "MUSR": 0.3341, + "MMLU-PRO": 0.1357 + } + }, + { + "model_id": "eworojoshua/vas-01", + "name": "vas-01", + "developer": "eworojoshua", + "scores": { + "IFEval": 0.7612, + "BBH": 0.5418, + "MATH Level 5": 0.4736, + "GPQA": 0.3096, + "MUSR": 0.4432, + "MMLU-PRO": 0.4348 + } + }, + { + "model_id": "ewre324/Thinker-Llama-3.2-3B-Instruct-Reasoning", + "name": "Thinker-Llama-3.2-3B-Instruct-Reasoning", + "developer": "ewre324", + "scores": { + "IFEval": 0.4439, + "BBH": 0.4273, + "MATH Level 5": 0.0846, + "GPQA": 0.2768, + "MUSR": 0.3655, + "MMLU-PRO": 0.2886 + } + }, + { + "model_id": "ewre324/Thinker-Qwen2.5-0.5B-Instruct-Reasoning", + "name": "Thinker-Qwen2.5-0.5B-Instruct-Reasoning", + "developer": "ewre324", + "scores": { + "IFEval": 0.2476, + "BBH": 0.3292, + "MATH Level 5": 0.0287, + "GPQA": 0.2852, + "MUSR": 0.3382, + "MMLU-PRO": 0.1647 + } + }, + { + "model_id": "ewre324/Thinker-SmolLM2-135M-Instruct-Reasoning", + "name": "Thinker-SmolLM2-135M-Instruct-Reasoning", + "developer": "ewre324", + "scores": { + "IFEval": 0.2584, + "BBH": 0.3071, + "MATH Level 5": 0.0091, + "GPQA": 0.2525, + "MUSR": 0.3661, + "MMLU-PRO": 0.1094 + } + }, + { + "model_id": "ewre324/ewre324-R1-SmolLM2-135M-Distill", + "name": "ewre324-R1-SmolLM2-135M-Distill", + "developer": "ewre324", + "scores": { + "IFEval": 0.1649, + "BBH": 0.3042, + "MATH Level 5": 0.0128, + "GPQA": 0.2617, + "MUSR": 0.3409, + "MMLU-PRO": 0.1134 + } + }, + { + "model_id": "experiment-llm/exp-3-q-r", + "name": "exp-3-q-r", + "developer": "experiment-llm", + "scores": { + "IFEval": 0.6036, + "BBH": 0.5397, + "MATH Level 5": 0.2787, + "GPQA": 0.2936, + "MUSR": 0.4315, + "MMLU-PRO": 0.4316 + } + }, + { + "model_id": "facebook/opt-1.3b", + "name": "opt-1.3b", + "developer": "facebook", + "scores": { + "IFEval": 0.2383, + "BBH": 0.3094, + "MATH Level 5": 0.0091, + "GPQA": 0.2424, + "MUSR": 0.342, + "MMLU-PRO": 0.1107 + } + }, + { + "model_id": "facebook/opt-30b", + "name": "opt-30b", + "developer": "facebook", + "scores": { + "IFEval": 0.2453, + "BBH": 0.307, + "MATH Level 5": 0.0106, + "GPQA": 0.2693, + "MUSR": 0.3604, + "MMLU-PRO": 0.1164 + } + }, + { + "model_id": "failspy/Llama-3-8B-Instruct-MopeyMule", + "name": "Llama-3-8B-Instruct-MopeyMule", + "developer": "failspy", + "scores": { + "IFEval": 0.675, + "BBH": 0.3839, + "MATH Level 5": 0.0196, + "GPQA": 0.2391, + "MUSR": 0.3513, + "MMLU-PRO": 0.1764 + } + }, + { + "model_id": "failspy/Llama-3-8B-Instruct-abliterated", + "name": "Llama-3-8B-Instruct-abliterated", + "developer": "failspy", + "scores": { + "IFEval": 0.5909, + "BBH": 0.4354, + "MATH Level 5": 0.0385, + "GPQA": 0.276, + "MUSR": 0.4116, + "MMLU-PRO": 0.2742 + } + }, + { + "model_id": "failspy/Meta-Llama-3-70B-Instruct-abliterated-v3.5", + "name": "Meta-Llama-3-70B-Instruct-abliterated-v3.5", + "developer": "failspy", + "scores": { + "IFEval": 0.7747, + "BBH": 0.5747, + "MATH Level 5": 0.1284, + "GPQA": 0.297, + "MUSR": 0.3982, + "MMLU-PRO": 0.4452 + } + }, + { + "model_id": "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3", + "name": "Meta-Llama-3-8B-Instruct-abliterated-v3", + "developer": "failspy", + "scores": { + "IFEval": 0.7245, + "BBH": 0.4925, + "MATH Level 5": 0.0959, + "GPQA": 0.2643, + "MUSR": 0.3622, + "MMLU-PRO": 0.3654 + } + }, + { + "model_id": "failspy/Phi-3-medium-4k-instruct-abliterated-v3", + "name": "Phi-3-medium-4k-instruct-abliterated-v3", + "developer": "failspy", + "scores": { + "IFEval": 0.6319, + "BBH": 0.6305, + "MATH Level 5": 0.1594, + "GPQA": 0.3171, + "MUSR": 0.4604, + "MMLU-PRO": 0.44 + } + }, + { + "model_id": "failspy/llama-3-70B-Instruct-abliterated", + "name": "llama-3-70B-Instruct-abliterated", + "developer": "failspy", + "scores": { + "IFEval": 0.8023, + "BBH": 0.6465, + "MATH Level 5": 0.2432, + "GPQA": 0.2894, + "MUSR": 0.4128, + "MMLU-PRO": 0.5145 + } + }, + { + "model_id": "fblgit/TheBeagle-v2beta-32B-MGS", + "name": "TheBeagle-v2beta-32B-MGS", + "developer": "fblgit", + "scores": { + "IFEval": 0.5181, + "BBH": 0.7033, + "MATH Level 5": 0.4947, + "GPQA": 0.3826, + "MUSR": 0.5008, + "MMLU-PRO": 0.5915 + } + }, + { + "model_id": "fblgit/UNA-SimpleSmaug-34b-v1beta", + "name": "UNA-SimpleSmaug-34b-v1beta", + "developer": "fblgit", + "scores": { + "IFEval": 0.4556, + "BBH": 0.5287, + "MATH Level 5": 0.0718, + "GPQA": 0.3171, + "MUSR": 0.4256, + "MMLU-PRO": 0.454 + } + }, + { + "model_id": "fblgit/UNA-TheBeagle-7b-v1", + "name": "UNA-TheBeagle-7b-v1", + "developer": "fblgit", + "scores": { + "IFEval": 0.3689, + "BBH": 0.5029, + "MATH Level 5": 0.077, + "GPQA": 0.2844, + "MUSR": 0.4564, + "MMLU-PRO": 0.3019 + } + }, + { + "model_id": "fblgit/UNA-ThePitbull-21.4B-v2", + "name": "UNA-ThePitbull-21.4B-v2", + "developer": "fblgit", + "scores": { + "IFEval": 0.379, + "BBH": 0.635, + "MATH Level 5": 0.1216, + "GPQA": 0.302, + "MUSR": 0.3922, + "MMLU-PRO": 0.3516 + } + }, + { + "model_id": "fblgit/cybertron-v4-qw7B-MGS", + "name": "cybertron-v4-qw7B-MGS", + "developer": "fblgit", + "scores": { + "IFEval": 0.6264, + "BBH": 0.5592, + "MATH Level 5": 0.3489, + "GPQA": 0.3104, + "MUSR": 0.4371, + "MMLU-PRO": 0.4473 + } + }, + { + "model_id": "fblgit/cybertron-v4-qw7B-UNAMGS", + "name": "cybertron-v4-qw7B-UNAMGS", + "developer": "fblgit", + "scores": { + "IFEval": 0.609, + "BBH": 0.5643, + "MATH Level 5": 0.3731, + "GPQA": 0.3314, + "MUSR": 0.4343, + "MMLU-PRO": 0.45 + } + }, + { + "model_id": "fblgit/juanako-7b-UNA", + "name": "juanako-7b-UNA", + "developer": "fblgit", + "scores": { + "IFEval": 0.4837, + "BBH": 0.507, + "MATH Level 5": 0.034, + "GPQA": 0.2961, + "MUSR": 0.4645, + "MMLU-PRO": 0.2771 + } + }, + { + "model_id": "fblgit/miniclaus-qw1.5B-UNAMGS", + "name": "miniclaus-qw1.5B-UNAMGS", + "developer": "fblgit", + "scores": { + "IFEval": 0.3348, + "BBH": 0.4239, + "MATH Level 5": 0.1088, + "GPQA": 0.2919, + "MUSR": 0.4293, + "MMLU-PRO": 0.2937 + } + }, + { + "model_id": "fblgit/miniclaus-qw1.5B-UNAMGS-GRPO", + "name": "miniclaus-qw1.5B-UNAMGS-GRPO", + "developer": "fblgit", + "scores": { + "IFEval": 0.3518, + "BBH": 0.4234, + "MATH Level 5": 0.1103, + "GPQA": 0.2978, + "MUSR": 0.4254, + "MMLU-PRO": 0.2945 + } + }, + { + "model_id": "fblgit/pancho-v1-qw25-3B-UNAMGS", + "name": "pancho-v1-qw25-3B-UNAMGS", + "developer": "fblgit", + "scores": { + "IFEval": 0.5361, + "BBH": 0.4926, + "MATH Level 5": 0.1571, + "GPQA": 0.297, + "MUSR": 0.4027, + "MMLU-PRO": 0.3766 + } + }, + { + "model_id": "fblgit/una-cybertron-7b-v2-bf16", + "name": "una-cybertron-7b-v2-bf16", + "developer": "fblgit", + "scores": { + "IFEval": 0.4737, + "BBH": 0.3973, + "MATH Level 5": 0.0408, + "GPQA": 0.2978, + "MUSR": 0.4473, + "MMLU-PRO": 0.2443 + } + }, + { + "model_id": "fhai50032/RolePlayLake-7B", + "name": "RolePlayLake-7B", + "developer": "fhai50032", + "scores": { + "IFEval": 0.5057, + "BBH": 0.5252, + "MATH Level 5": 0.0725, + "GPQA": 0.3037, + "MUSR": 0.4459, + "MMLU-PRO": 0.316 + } + }, + { + "model_id": "fhai50032/Unaligned-Thinker-PHI-4", + "name": "Unaligned-Thinker-PHI-4", + "developer": "fhai50032", + "scores": { + "IFEval": 0.0563, + "BBH": 0.6643, + "MATH Level 5": 0.3353, + "GPQA": 0.3809, + "MUSR": 0.4679, + "MMLU-PRO": 0.5147 + } + }, + { + "model_id": "flammenai/Llama3.1-Flammades-70B", + "name": "Llama3.1-Flammades-70B", + "developer": "flammenai", + "scores": { + "IFEval": 0.7058, + "BBH": 0.666, + "MATH Level 5": 0.2092, + "GPQA": 0.354, + "MUSR": 0.4871, + "MMLU-PRO": 0.4752 + } + }, + { + "model_id": "flammenai/Mahou-1.2a-llama3-8B", + "name": "Mahou-1.2a-llama3-8B", + "developer": "flammenai", + "scores": { + "IFEval": 0.5093, + "BBH": 0.5094, + "MATH Level 5": 0.0838, + "GPQA": 0.2886, + "MUSR": 0.3847, + "MMLU-PRO": 0.3817 + } + }, + { + "model_id": "flammenai/Mahou-1.2a-mistral-7B", + "name": "Mahou-1.2a-mistral-7B", + "developer": "flammenai", + "scores": { + "IFEval": 0.4552, + "BBH": 0.5118, + "MATH Level 5": 0.0687, + "GPQA": 0.2718, + "MUSR": 0.3896, + "MMLU-PRO": 0.3163 + } + }, + { + "model_id": "flammenai/Mahou-1.5-llama3.1-70B", + "name": "Mahou-1.5-llama3.1-70B", + "developer": "flammenai", + "scores": { + "IFEval": 0.7147, + "BBH": 0.6651, + "MATH Level 5": 0.21, + "GPQA": 0.354, + "MUSR": 0.495, + "MMLU-PRO": 0.4749 + } + }, + { + "model_id": "flammenai/Mahou-1.5-mistral-nemo-12B", + "name": "Mahou-1.5-mistral-nemo-12B", + "developer": "flammenai", + "scores": { + "IFEval": 0.6751, + "BBH": 0.5522, + "MATH Level 5": 0.0869, + "GPQA": 0.276, + "MUSR": 0.452, + "MMLU-PRO": 0.3602 + } + }, + { + "model_id": "flammenai/flammen15-gutenberg-DPO-v1-7B", + "name": "flammen15-gutenberg-DPO-v1-7B", + "developer": "flammenai", + "scores": { + "IFEval": 0.4798, + "BBH": 0.5203, + "MATH Level 5": 0.0763, + "GPQA": 0.2844, + "MUSR": 0.4293, + "MMLU-PRO": 0.3186 + } + }, + { + "model_id": "fluently-lm/FluentlyLM-Prinum", + "name": "FluentlyLM-Prinum", + "developer": "fluently-lm", + "scores": { + "IFEval": 0.809, + "BBH": 0.7144, + "MATH Level 5": 0.54, + "GPQA": 0.3867, + "MUSR": 0.4471, + "MMLU-PRO": 0.5808 + } + }, + { + "model_id": "fluently-lm/Llama-TI-8B", + "name": "Llama-TI-8B", + "developer": "fluently-lm", + "scores": { + "IFEval": 0.288, + "BBH": 0.5201, + "MATH Level 5": 0.1964, + "GPQA": 0.2961, + "MUSR": 0.4103, + "MMLU-PRO": 0.344 + } + }, + { + "model_id": "fluently-lm/Llama-TI-8B-Instruct", + "name": "Llama-TI-8B-Instruct", + "developer": "fluently-lm", + "scores": { + "IFEval": 0.7716, + "BBH": 0.5252, + "MATH Level 5": 0.2304, + "GPQA": 0.2953, + "MUSR": 0.3813, + "MMLU-PRO": 0.3726 + } + }, + { + "model_id": "fluently-sets/FalconThink3-10B-IT", + "name": "FalconThink3-10B-IT", + "developer": "fluently-sets", + "scores": { + "IFEval": 0.7326, + "BBH": 0.62, + "MATH Level 5": 0.2447, + "GPQA": 0.3347, + "MUSR": 0.4479, + "MMLU-PRO": 0.4435 + } + }, + { + "model_id": "fluently-sets/reasoning-1-1k-demo", + "name": "reasoning-1-1k-demo", + "developer": "fluently-sets", + "scores": { + "IFEval": 0.7525, + "BBH": 0.6397, + "MATH Level 5": 0.4282, + "GPQA": 0.3356, + "MUSR": 0.4061, + "MMLU-PRO": 0.4774 + } + }, + { + "model_id": "formulae/mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp", + "name": "mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp", + "developer": "formulae", + "scores": { + "IFEval": 0.1614, + "BBH": 0.2976, + "MATH Level 5": 0.0015, + "GPQA": 0.2534, + "MUSR": 0.4219, + "MMLU-PRO": 0.1174 + } + }, + { + "model_id": "formulae/mita-elite-v1.1-7b-2-25-2025", + "name": "mita-elite-v1.1-7b-2-25-2025", + "developer": "formulae", + "scores": { + "IFEval": 0.125, + "BBH": 0.2867, + "MATH Level 5": 0.0, + "GPQA": 0.2483, + "MUSR": 0.3487, + "MMLU-PRO": 0.1098 + } + }, + { + "model_id": "formulae/mita-elite-v1.1-gen2-7b-2-25-2025", + "name": "mita-elite-v1.1-gen2-7b-2-25-2025", + "developer": "formulae", + "scores": { + "IFEval": 0.1411, + "BBH": 0.2924, + "MATH Level 5": 0.0, + "GPQA": 0.2525, + "MUSR": 0.3541, + "MMLU-PRO": 0.1101 + } + }, + { + "model_id": "formulae/mita-elite-v1.2-7b-2-26-2025", + "name": "mita-elite-v1.2-7b-2-26-2025", + "developer": "formulae", + "scores": { + "IFEval": 0.148, + "BBH": 0.293, + "MATH Level 5": 0.0023, + "GPQA": 0.2743, + "MUSR": 0.4287, + "MMLU-PRO": 0.1186 + } + }, + { + "model_id": "formulae/mita-gen3-7b-2-26-2025", + "name": "mita-gen3-7b-2-26-2025", + "developer": "formulae", + "scores": { + "IFEval": 0.1964, + "BBH": 0.2916, + "MATH Level 5": 0.0023, + "GPQA": 0.2651, + "MUSR": 0.3912, + "MMLU-PRO": 0.1124 + } + }, + { + "model_id": "formulae/mita-gen3-v1.2-7b-2-26-2025", + "name": "mita-gen3-v1.2-7b-2-26-2025", + "developer": "formulae", + "scores": { + "IFEval": 0.2044, + "BBH": 0.3058, + "MATH Level 5": 0.0023, + "GPQA": 0.2592, + "MUSR": 0.39, + "MMLU-PRO": 0.1128 + } + }, + { + "model_id": "formulae/mita-math-v2.3-2-25-2025", + "name": "mita-math-v2.3-2-25-2025", + "developer": "formulae", + "scores": { + "IFEval": 0.1373, + "BBH": 0.2949, + "MATH Level 5": 0.0, + "GPQA": 0.2508, + "MUSR": 0.3698, + "MMLU-PRO": 0.1118 + } + }, + { + "model_id": "formulae/mita-v1-7b", + "name": "mita-v1-7b", + "developer": "formulae", + "scores": { + "IFEval": 0.1972, + "BBH": 0.3003, + "MATH Level 5": 0.0023, + "GPQA": 0.25, + "MUSR": 0.4152, + "MMLU-PRO": 0.1147 + } + }, + { + "model_id": "formulae/mita-v1.1-7b-2-24-2025", + "name": "mita-v1.1-7b-2-24-2025", + "developer": "formulae", + "scores": { + "IFEval": 0.3412, + "BBH": 0.5442, + "MATH Level 5": 0.435, + "GPQA": 0.3146, + "MUSR": 0.4557, + "MMLU-PRO": 0.4524 + } + }, + { + "model_id": "formulae/mita-v1.2-7b-2-24-2025", + "name": "mita-v1.2-7b-2-24-2025", + "developer": "formulae", + "scores": { + "IFEval": 0.2564, + "BBH": 0.4919, + "MATH Level 5": 0.4879, + "GPQA": 0.3062, + "MUSR": 0.4344, + "MMLU-PRO": 0.3359 + } + }, + { + "model_id": "frameai/Loxa-4B", + "name": "Loxa-4B", + "developer": "frameai", + "scores": { + "IFEval": 0.4765, + "BBH": 0.4217, + "MATH Level 5": 0.1095, + "GPQA": 0.2836, + "MUSR": 0.3377, + "MMLU-PRO": 0.2802 + } + }, + { + "model_id": "freewheelin/free-evo-qwen72b-v0.8-re", + "name": "free-evo-qwen72b-v0.8-re", + "developer": "freewheelin", + "scores": { + "IFEval": 0.5331, + "BBH": 0.6127, + "MATH Level 5": 0.1805, + "GPQA": 0.3565, + "MUSR": 0.4872, + "MMLU-PRO": 0.487 + } + }, + { + "model_id": "freewheelin/free-solar-evo-v0.1", + "name": "free-solar-evo-v0.1", + "developer": "freewheelin", + "scores": { + "IFEval": 0.205, + "BBH": 0.4502, + "MATH Level 5": 0.0083, + "GPQA": 0.2911, + "MUSR": 0.4946, + "MMLU-PRO": 0.3414 + } + }, + { + "model_id": "freewheelin/free-solar-evo-v0.11", + "name": "free-solar-evo-v0.11", + "developer": "freewheelin", + "scores": { + "IFEval": 0.2027, + "BBH": 0.4545, + "MATH Level 5": 0.0083, + "GPQA": 0.2852, + "MUSR": 0.5052, + "MMLU-PRO": 0.3467 + } + }, + { + "model_id": "freewheelin/free-solar-evo-v0.13", + "name": "free-solar-evo-v0.13", + "developer": "freewheelin", + "scores": { + "IFEval": 0.2321, + "BBH": 0.4555, + "MATH Level 5": 0.0121, + "GPQA": 0.2886, + "MUSR": 0.5052, + "MMLU-PRO": 0.347 + } + }, + { + "model_id": "fulim/FineLlama-3.1-8B", + "name": "FineLlama-3.1-8B", + "developer": "fulim", + "scores": { + "IFEval": 0.1439, + "BBH": 0.4569, + "MATH Level 5": 0.0476, + "GPQA": 0.2928, + "MUSR": 0.3867, + "MMLU-PRO": 0.3167 + } + }, + { + "model_id": "gabrielmbmb/SmolLM-1.7B-Instruct-IFEval", + "name": "SmolLM-1.7B-Instruct-IFEval", + "developer": "gabrielmbmb", + "scores": { + "IFEval": 0.2306, + "BBH": 0.3138, + "MATH Level 5": 0.0106, + "GPQA": 0.2534, + "MUSR": 0.3328, + "MMLU-PRO": 0.1156 + } + }, + { + "model_id": "gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA", + "name": "Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA", + "developer": "gaverfraxz", + "scores": { + "IFEval": 0.4009, + "BBH": 0.3985, + "MATH Level 5": 0.0196, + "GPQA": 0.2844, + "MUSR": 0.365, + "MMLU-PRO": 0.1654 + } + }, + { + "model_id": "gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES", + "name": "Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES", + "developer": "gaverfraxz", + "scores": { + "IFEval": 0.4551, + "BBH": 0.5044, + "MATH Level 5": 0.1299, + "GPQA": 0.2668, + "MUSR": 0.3738, + "MMLU-PRO": 0.3679 + } + }, + { + "model_id": "gbueno86/Brinebreath-Llama-3.1-70B", + "name": "Brinebreath-Llama-3.1-70B", + "developer": "gbueno86", + "scores": { + "IFEval": 0.5533, + "BBH": 0.6881, + "MATH Level 5": 0.2976, + "GPQA": 0.3465, + "MUSR": 0.4541, + "MMLU-PRO": 0.5196 + } + }, + { + "model_id": "gbueno86/Meta-LLama-3-Cat-Smaug-LLama-70b", + "name": "Meta-LLama-3-Cat-Smaug-LLama-70b", + "developer": "gbueno86", + "scores": { + "IFEval": 0.8072, + "BBH": 0.6674, + "MATH Level 5": 0.2938, + "GPQA": 0.3272, + "MUSR": 0.4368, + "MMLU-PRO": 0.5075 + } + }, + { + "model_id": "ghost-x/ghost-8b-beta-1608", + "name": "ghost-8b-beta-1608", + "developer": "ghost-x", + "scores": { + "IFEval": 0.4273, + "BBH": 0.4517, + "MATH Level 5": 0.0695, + "GPQA": 0.2584, + "MUSR": 0.3516, + "MMLU-PRO": 0.284 + } + }, + { + "model_id": "glaiveai/Reflection-Llama-3.1-70B", + "name": "Reflection-Llama-3.1-70B", + "developer": "glaiveai", + "scores": { + "IFEval": 0.5991, + "BBH": 0.5681, + "MATH Level 5": 0.2757, + "GPQA": 0.3146, + "MUSR": 0.438, + "MMLU-PRO": 0.6341 + } + }, + { + "model_id": "gmonsoon/SahabatAI-Llama-11B-Test", + "name": "SahabatAI-Llama-11B-Test", + "developer": "gmonsoon", + "scores": { + "IFEval": 0.3376, + "BBH": 0.4728, + "MATH Level 5": 0.031, + "GPQA": 0.2819, + "MUSR": 0.4001, + "MMLU-PRO": 0.3182 + } + }, + { + "model_id": "gmonsoon/SahabatAI-MediChatIndo-8B-v1", + "name": "SahabatAI-MediChatIndo-8B-v1", + "developer": "gmonsoon", + "scores": { + "IFEval": 0.4163, + "BBH": 0.4509, + "MATH Level 5": 0.0619, + "GPQA": 0.2827, + "MUSR": 0.3754, + "MMLU-PRO": 0.3108 + } + }, + { + "model_id": "gmonsoon/SahabatAI-Rebase-8B-Test", + "name": "SahabatAI-Rebase-8B-Test", + "developer": "gmonsoon", + "scores": { + "IFEval": 0.5156, + "BBH": 0.523, + "MATH Level 5": 0.1148, + "GPQA": 0.2878, + "MUSR": 0.4133, + "MMLU-PRO": 0.3664 + } + }, + { + "model_id": "gmonsoon/StockSeaLLMs-7B-v1", + "name": "StockSeaLLMs-7B-v1", + "developer": "gmonsoon", + "scores": { + "IFEval": 0.4599, + "BBH": 0.5271, + "MATH Level 5": 0.1964, + "GPQA": 0.3029, + "MUSR": 0.4214, + "MMLU-PRO": 0.3952 + } + }, + { + "model_id": "gmonsoon/gemma2-9b-sahabatai-v1-instruct-BaseTIES", + "name": "gemma2-9b-sahabatai-v1-instruct-BaseTIES", + "developer": "gmonsoon", + "scores": { + "IFEval": 0.7378, + "BBH": 0.6077, + "MATH Level 5": 0.1994, + "GPQA": 0.3205, + "MUSR": 0.4778, + "MMLU-PRO": 0.4347 + } + }, + { + "model_id": "godlikehhd/alpaca_data_full_2", + "name": "alpaca_data_full_2", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3178, + "BBH": 0.4217, + "MATH Level 5": 0.0929, + "GPQA": 0.2978, + "MUSR": 0.4052, + "MMLU-PRO": 0.2854 + } + }, + { + "model_id": "godlikehhd/alpaca_data_full_3B", + "name": "alpaca_data_full_3B", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3696, + "BBH": 0.4684, + "MATH Level 5": 0.1337, + "GPQA": 0.2777, + "MUSR": 0.4955, + "MMLU-PRO": 0.3357 + } + }, + { + "model_id": "godlikehhd/alpaca_data_ifd_max_2600", + "name": "alpaca_data_ifd_max_2600", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3043, + "BBH": 0.4029, + "MATH Level 5": 0.0989, + "GPQA": 0.3029, + "MUSR": 0.3509, + "MMLU-PRO": 0.2916 + } + }, + { + "model_id": "godlikehhd/alpaca_data_ifd_max_2600_3B", + "name": "alpaca_data_ifd_max_2600_3B", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.2982, + "BBH": 0.4626, + "MATH Level 5": 0.1594, + "GPQA": 0.2727, + "MUSR": 0.4346, + "MMLU-PRO": 0.3288 + } + }, + { + "model_id": "godlikehhd/alpaca_data_ifd_me_max_5200", + "name": "alpaca_data_ifd_me_max_5200", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3683, + "BBH": 0.4153, + "MATH Level 5": 0.0974, + "GPQA": 0.2911, + "MUSR": 0.3483, + "MMLU-PRO": 0.2982 + } + }, + { + "model_id": "godlikehhd/alpaca_data_ifd_min_2600", + "name": "alpaca_data_ifd_min_2600", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.375, + "BBH": 0.4219, + "MATH Level 5": 0.0967, + "GPQA": 0.2919, + "MUSR": 0.3656, + "MMLU-PRO": 0.2893 + } + }, + { + "model_id": "godlikehhd/alpaca_data_ins_ans_max_5200", + "name": "alpaca_data_ins_ans_max_5200", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3479, + "BBH": 0.4098, + "MATH Level 5": 0.1027, + "GPQA": 0.2911, + "MUSR": 0.3602, + "MMLU-PRO": 0.2901 + } + }, + { + "model_id": "godlikehhd/alpaca_data_ins_max_5200", + "name": "alpaca_data_ins_max_5200", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3275, + "BBH": 0.4155, + "MATH Level 5": 0.0997, + "GPQA": 0.2961, + "MUSR": 0.3614, + "MMLU-PRO": 0.2916 + } + }, + { + "model_id": "godlikehhd/alpaca_data_ins_min_2600", + "name": "alpaca_data_ins_min_2600", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.333, + "BBH": 0.4187, + "MATH Level 5": 0.111, + "GPQA": 0.2978, + "MUSR": 0.3853, + "MMLU-PRO": 0.288 + } + }, + { + "model_id": "godlikehhd/alpaca_data_ins_min_5200", + "name": "alpaca_data_ins_min_5200", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.336, + "BBH": 0.4289, + "MATH Level 5": 0.1035, + "GPQA": 0.2869, + "MUSR": 0.3906, + "MMLU-PRO": 0.2949 + } + }, + { + "model_id": "godlikehhd/alpaca_data_sampled_ifd_5200", + "name": "alpaca_data_sampled_ifd_5200", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.2924, + "BBH": 0.4033, + "MATH Level 5": 0.1254, + "GPQA": 0.3087, + "MUSR": 0.3521, + "MMLU-PRO": 0.2896 + } + }, + { + "model_id": "godlikehhd/alpaca_data_sampled_ifd_new_5200", + "name": "alpaca_data_sampled_ifd_new_5200", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3663, + "BBH": 0.4178, + "MATH Level 5": 0.0944, + "GPQA": 0.2936, + "MUSR": 0.3613, + "MMLU-PRO": 0.2925 + } + }, + { + "model_id": "godlikehhd/alpaca_data_score_max_0.1_2600", + "name": "alpaca_data_score_max_0.1_2600", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3288, + "BBH": 0.4252, + "MATH Level 5": 0.0989, + "GPQA": 0.2911, + "MUSR": 0.3706, + "MMLU-PRO": 0.2923 + } + }, + { + "model_id": "godlikehhd/alpaca_data_score_max_0.3_2600", + "name": "alpaca_data_score_max_0.3_2600", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3375, + "BBH": 0.4151, + "MATH Level 5": 0.1035, + "GPQA": 0.2894, + "MUSR": 0.3759, + "MMLU-PRO": 0.2913 + } + }, + { + "model_id": "godlikehhd/alpaca_data_score_max_0.7_2600", + "name": "alpaca_data_score_max_0.7_2600", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.364, + "BBH": 0.4185, + "MATH Level 5": 0.1073, + "GPQA": 0.3037, + "MUSR": 0.3469, + "MMLU-PRO": 0.2983 + } + }, + { + "model_id": "godlikehhd/alpaca_data_score_max_2500", + "name": "alpaca_data_score_max_2500", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3564, + "BBH": 0.418, + "MATH Level 5": 0.0952, + "GPQA": 0.2953, + "MUSR": 0.3627, + "MMLU-PRO": 0.294 + } + }, + { + "model_id": "godlikehhd/alpaca_data_score_max_2600_3B", + "name": "alpaca_data_score_max_2600_3B", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3358, + "BBH": 0.4716, + "MATH Level 5": 0.1548, + "GPQA": 0.2651, + "MUSR": 0.4474, + "MMLU-PRO": 0.3342 + } + }, + { + "model_id": "godlikehhd/alpaca_data_score_max_5200", + "name": "alpaca_data_score_max_5200", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3445, + "BBH": 0.4242, + "MATH Level 5": 0.0974, + "GPQA": 0.2978, + "MUSR": 0.3878, + "MMLU-PRO": 0.2945 + } + }, + { + "model_id": "godlikehhd/ifd_2500_qwen", + "name": "ifd_2500_qwen", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3365, + "BBH": 0.4298, + "MATH Level 5": 0.0982, + "GPQA": 0.2953, + "MUSR": 0.3615, + "MMLU-PRO": 0.2921 + } + }, + { + "model_id": "godlikehhd/ifd_new_correct_all_sample_2500_qwen", + "name": "ifd_new_correct_all_sample_2500_qwen", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3376, + "BBH": 0.402, + "MATH Level 5": 0.0959, + "GPQA": 0.2903, + "MUSR": 0.3562, + "MMLU-PRO": 0.2889 + } + }, + { + "model_id": "godlikehhd/ifd_new_correct_sample_2500_qwen", + "name": "ifd_new_correct_sample_2500_qwen", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3397, + "BBH": 0.411, + "MATH Level 5": 0.1042, + "GPQA": 0.3079, + "MUSR": 0.3627, + "MMLU-PRO": 0.2932 + } + }, + { + "model_id": "godlikehhd/ifd_new_qwen_2500", + "name": "ifd_new_qwen_2500", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.324, + "BBH": 0.416, + "MATH Level 5": 0.1118, + "GPQA": 0.3003, + "MUSR": 0.359, + "MMLU-PRO": 0.2911 + } + }, + { + "model_id": "godlikehhd/qwen-2.5-1.5b-cherry", + "name": "qwen-2.5-1.5b-cherry", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.2893, + "BBH": 0.4036, + "MATH Level 5": 0.102, + "GPQA": 0.3003, + "MUSR": 0.3456, + "MMLU-PRO": 0.2923 + } + }, + { + "model_id": "godlikehhd/qwen_2.5-1.5b-cherry_new", + "name": "qwen_2.5-1.5b-cherry_new", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.312, + "BBH": 0.415, + "MATH Level 5": 0.0967, + "GPQA": 0.2978, + "MUSR": 0.3496, + "MMLU-PRO": 0.2894 + } + }, + { + "model_id": "godlikehhd/qwen_full_data_alpaca", + "name": "qwen_full_data_alpaca", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.3136, + "BBH": 0.4229, + "MATH Level 5": 0.0921, + "GPQA": 0.2928, + "MUSR": 0.4052, + "MMLU-PRO": 0.2851 + } + }, + { + "model_id": "godlikehhd/qwen_ins_ans_2500", + "name": "qwen_ins_ans_2500", + "developer": "godlikehhd", + "scores": { + "IFEval": 0.2698, + "BBH": 0.4074, + "MATH Level 5": 0.114, + "GPQA": 0.2919, + "MUSR": 0.3589, + "MMLU-PRO": 0.2809 + } + }, + { + "model_id": "google/codegemma-1.1-2b", + "name": "codegemma-1.1-2b", + "developer": "google", + "scores": { + "IFEval": 0.2294, + "BBH": 0.3353, + "MATH Level 5": 0.0128, + "GPQA": 0.2651, + "MUSR": 0.3871, + "MMLU-PRO": 0.1278 + } + }, + { + "model_id": "google/flan-t5-base", + "name": "flan-t5-base", + "developer": "google", + "scores": { + "IFEval": 0.1891, + "BBH": 0.3526, + "MATH Level 5": 0.0106, + "GPQA": 0.2383, + "MUSR": 0.3671, + "MMLU-PRO": 0.1357 + } + }, + { + "model_id": "google/flan-t5-large", + "name": "flan-t5-large", + "developer": "google", + "scores": { + "IFEval": 0.2201, + "BBH": 0.4153, + "MATH Level 5": 0.0144, + "GPQA": 0.2508, + "MUSR": 0.4083, + "MMLU-PRO": 0.1709 + } + }, + { + "model_id": "google/flan-t5-small", + "name": "flan-t5-small", + "developer": "google", + "scores": { + "IFEval": 0.1524, + "BBH": 0.3283, + "MATH Level 5": 0.0076, + "GPQA": 0.2609, + "MUSR": 0.4123, + "MMLU-PRO": 0.1233 + } + }, + { + "model_id": "google/flan-t5-xl", + "name": "flan-t5-xl", + "developer": "google", + "scores": { + "IFEval": 0.2237, + "BBH": 0.4531, + "MATH Level 5": 0.0076, + "GPQA": 0.2525, + "MUSR": 0.4181, + "MMLU-PRO": 0.2147 + } + }, + { + "model_id": "google/flan-t5-xxl", + "name": "flan-t5-xxl", + "developer": "google", + "scores": { + "IFEval": 0.22, + "BBH": 0.5066, + "MATH Level 5": 0.0106, + "GPQA": 0.2701, + "MUSR": 0.4218, + "MMLU-PRO": 0.2343 + } + }, + { + "model_id": "google/flan-ul2", + "name": "flan-ul2", + "developer": "google", + "scores": { + "IFEval": 0.2393, + "BBH": 0.5054, + "MATH Level 5": 0.0091, + "GPQA": 0.2878, + "MUSR": 0.3844, + "MMLU-PRO": 0.2493 + } + }, + { + "model_id": "google/gemma-1.1-2b-it", + "name": "gemma-1.1-2b-it", + "developer": "google", + "scores": { + "IFEval": 0.3067, + "BBH": 0.3185, + "MATH Level 5": 0.0181, + "GPQA": 0.2693, + "MUSR": 0.3394, + "MMLU-PRO": 0.1484 + } + }, + { + "model_id": "google/gemma-1.1-7b-it", + "name": "gemma-1.1-7b-it", + "developer": "google", + "scores": { + "IFEval": 0.5039, + "BBH": 0.3935, + "MATH Level 5": 0.0491, + "GPQA": 0.2936, + "MUSR": 0.423, + "MMLU-PRO": 0.2584 + } + }, + { + "model_id": "google/gemma-2-27b", + "name": "Gemma 2 27B", + "developer": "google", + "scores": { + "IFEval": 0.2475, + "BBH": 0.5643, + "MATH Level 5": 0.1662, + "GPQA": 0.3507, + "MUSR": 0.4396, + "MMLU-PRO": 0.4371 + } + }, + { + "model_id": "google/gemma-2-27b-it", + "name": "Gemma 2 Instruct 27B", + "developer": "google", + "scores": { + "IFEval": 0.7978, + "BBH": 0.6451, + "MATH Level 5": 0.2387, + "GPQA": 0.375, + "MUSR": 0.4033, + "MMLU-PRO": 0.4451 + } + }, + { + "model_id": "google/gemma-2-2b", + "name": "gemma-2-2b", + "developer": "google", + "scores": { + "IFEval": 0.2018, + "BBH": 0.3709, + "MATH Level 5": 0.0302, + "GPQA": 0.2626, + "MUSR": 0.4219, + "MMLU-PRO": 0.2217 + } + }, + { + "model_id": "google/gemma-2-2b-it", + "name": "gemma-2-2b-it", + "developer": "google", + "scores": { + "IFEval": 0.5668, + "BBH": 0.4199, + "MATH Level 5": 0.0008, + "GPQA": 0.2743, + "MUSR": 0.3929, + "MMLU-PRO": 0.255 + } + }, + { + "model_id": "google/gemma-2-2b-jpn-it", + "name": "gemma-2-2b-jpn-it", + "developer": "google", + "scores": { + "IFEval": 0.5078, + "BBH": 0.4226, + "MATH Level 5": 0.0347, + "GPQA": 0.2852, + "MUSR": 0.3964, + "MMLU-PRO": 0.2578 + } + }, + { + "model_id": "google/gemma-2-9b", + "name": "Gemma 2 9B", + "developer": "google", + "scores": { + "IFEval": 0.204, + "BBH": 0.5377, + "MATH Level 5": 0.1344, + "GPQA": 0.3289, + "MUSR": 0.4461, + "MMLU-PRO": 0.4103 + } + }, + { + "model_id": "google/gemma-2-9b-it", + "name": "Gemma 2 Instruct 9B", + "developer": "google", + "scores": { + "IFEval": 0.7436, + "BBH": 0.599, + "MATH Level 5": 0.1949, + "GPQA": 0.3607, + "MUSR": 0.4073, + "MMLU-PRO": 0.3875 + } + }, + { + "model_id": "google/gemma-2b", + "name": "gemma-2b", + "developer": "google", + "scores": { + "IFEval": 0.2038, + "BBH": 0.3366, + "MATH Level 5": 0.0302, + "GPQA": 0.255, + "MUSR": 0.3978, + "MMLU-PRO": 0.1366 + } + }, + { + "model_id": "google/gemma-2b-it", + "name": "gemma-2b-it", + "developer": "google", + "scores": { + "IFEval": 0.269, + "BBH": 0.3151, + "MATH Level 5": 0.0204, + "GPQA": 0.2785, + "MUSR": 0.3341, + "MMLU-PRO": 0.1353 + } + }, + { + "model_id": "google/gemma-7b", + "name": "Gemma 7B", + "developer": "google", + "scores": { + "IFEval": 0.2659, + "BBH": 0.4362, + "MATH Level 5": 0.074, + "GPQA": 0.2869, + "MUSR": 0.4062, + "MMLU-PRO": 0.2948 + } + }, + { + "model_id": "google/gemma-7b-it", + "name": "gemma-7b-it", + "developer": "google", + "scores": { + "IFEval": 0.3868, + "BBH": 0.3646, + "MATH Level 5": 0.0295, + "GPQA": 0.2844, + "MUSR": 0.4274, + "MMLU-PRO": 0.1695 + } + }, + { + "model_id": "google/mt5-base", + "name": "mt5-base", + "developer": "google", + "scores": { + "IFEval": 0.1645, + "BBH": 0.2883, + "MATH Level 5": 0.0091, + "GPQA": 0.2391, + "MUSR": 0.3672, + "MMLU-PRO": 0.107 + } + }, + { + "model_id": "google/mt5-small", + "name": "mt5-small", + "developer": "google", + "scores": { + "IFEval": 0.1718, + "BBH": 0.2766, + "MATH Level 5": 0.0, + "GPQA": 0.2424, + "MUSR": 0.3857, + "MMLU-PRO": 0.1123 + } + }, + { + "model_id": "google/mt5-xl", + "name": "mt5-xl", + "developer": "google", + "scores": { + "IFEval": 0.196, + "BBH": 0.3047, + "MATH Level 5": 0.0, + "GPQA": 0.2643, + "MUSR": 0.3795, + "MMLU-PRO": 0.112 + } + }, + { + "model_id": "google/mt5-xxl", + "name": "mt5-xxl", + "developer": "google", + "scores": { + "IFEval": 0.2358, + "BBH": 0.2959, + "MATH Level 5": 0.0, + "GPQA": 0.2416, + "MUSR": 0.3689, + "MMLU-PRO": 0.1089 + } + }, + { + "model_id": "google/recurrentgemma-2b", + "name": "recurrentgemma-2b", + "developer": "google", + "scores": { + "IFEval": 0.3017, + "BBH": 0.3197, + "MATH Level 5": 0.0204, + "GPQA": 0.2458, + "MUSR": 0.3446, + "MMLU-PRO": 0.1176 + } + }, + { + "model_id": "google/recurrentgemma-2b-it", + "name": "recurrentgemma-2b-it", + "developer": "google", + "scores": { + "IFEval": 0.2949, + "BBH": 0.333, + "MATH Level 5": 0.0196, + "GPQA": 0.2534, + "MUSR": 0.3341, + "MMLU-PRO": 0.1402 + } + }, + { + "model_id": "google/recurrentgemma-9b", + "name": "recurrentgemma-9b", + "developer": "google", + "scores": { + "IFEval": 0.3116, + "BBH": 0.3956, + "MATH Level 5": 0.0665, + "GPQA": 0.2852, + "MUSR": 0.3803, + "MMLU-PRO": 0.2605 + } + }, + { + "model_id": "google/recurrentgemma-9b-it", + "name": "recurrentgemma-9b-it", + "developer": "google", + "scores": { + "IFEval": 0.501, + "BBH": 0.4367, + "MATH Level 5": 0.0665, + "GPQA": 0.2701, + "MUSR": 0.4379, + "MMLU-PRO": 0.2843 + } + }, + { + "model_id": "google/switch-base-8", + "name": "switch-base-8", + "developer": "google", + "scores": { + "IFEval": 0.1585, + "BBH": 0.2876, + "MATH Level 5": 0.0, + "GPQA": 0.25, + "MUSR": 0.3517, + "MMLU-PRO": 0.1098 + } + }, + { + "model_id": "google/umt5-base", + "name": "umt5-base", + "developer": "google", + "scores": { + "IFEval": 0.1746, + "BBH": 0.2788, + "MATH Level 5": 0.0045, + "GPQA": 0.2542, + "MUSR": 0.3382, + "MMLU-PRO": 0.1078 + } + }, + { + "model_id": "goulue5/merging_LLM", + "name": "merging_LLM", + "developer": "goulue5", + "scores": { + "IFEval": 0.3233, + "BBH": 0.4216, + "MATH Level 5": 0.0967, + "GPQA": 0.2911, + "MUSR": 0.4333, + "MMLU-PRO": 0.2958 + } + }, + { + "model_id": "gradientai/Llama-3-8B-Instruct-Gradient-1048k", + "name": "Llama-3-8B-Instruct-Gradient-1048k", + "developer": "gradientai", + "scores": { + "IFEval": 0.4456, + "BBH": 0.4346, + "MATH Level 5": 0.0536, + "GPQA": 0.2777, + "MUSR": 0.4298, + "MMLU-PRO": 0.294 + } + }, + { + "model_id": "grimjim/DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B", + "name": "DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B", + "developer": "grimjim", + "scores": { + "IFEval": 0.4797, + "BBH": 0.5269, + "MATH Level 5": 0.2221, + "GPQA": 0.3381, + "MUSR": 0.4408, + "MMLU-PRO": 0.3957 + } + }, + { + "model_id": "grimjim/Gigantes-v1-gemma2-9b-it", + "name": "Gigantes-v1-gemma2-9b-it", + "developer": "grimjim", + "scores": { + "IFEval": 0.6925, + "BBH": 0.5978, + "MATH Level 5": 0.2145, + "GPQA": 0.3532, + "MUSR": 0.4555, + "MMLU-PRO": 0.4225 + } + }, + { + "model_id": "grimjim/Gigantes-v2-gemma2-9b-it", + "name": "Gigantes-v2-gemma2-9b-it", + "developer": "grimjim", + "scores": { + "IFEval": 0.7351, + "BBH": 0.5987, + "MATH Level 5": 0.2017, + "GPQA": 0.3515, + "MUSR": 0.4595, + "MMLU-PRO": 0.4259 + } + }, + { + "model_id": "grimjim/Gigantes-v3-gemma2-9b-it", + "name": "Gigantes-v3-gemma2-9b-it", + "developer": "grimjim", + "scores": { + "IFEval": 0.6976, + "BBH": 0.5984, + "MATH Level 5": 0.21, + "GPQA": 0.3565, + "MUSR": 0.4608, + "MMLU-PRO": 0.4226 + } + }, + { + "model_id": "grimjim/HuatuoSkywork-o1-Llama-3.1-8B", + "name": "HuatuoSkywork-o1-Llama-3.1-8B", + "developer": "grimjim", + "scores": { + "IFEval": 0.3961, + "BBH": 0.4886, + "MATH Level 5": 0.3882, + "GPQA": 0.2928, + "MUSR": 0.3839, + "MMLU-PRO": 0.3095 + } + }, + { + "model_id": "grimjim/Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge", + "name": "Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge", + "developer": "grimjim", + "scores": { + "IFEval": 0.4271, + "BBH": 0.4962, + "MATH Level 5": 0.0997, + "GPQA": 0.2903, + "MUSR": 0.4043, + "MMLU-PRO": 0.3625 + } + }, + { + "model_id": "grimjim/Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge", + "name": "Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge", + "developer": "grimjim", + "scores": { + "IFEval": 0.6806, + "BBH": 0.5022, + "MATH Level 5": 0.0891, + "GPQA": 0.2626, + "MUSR": 0.3885, + "MMLU-PRO": 0.3684 + } + }, + { + "model_id": "grimjim/Llama-3.1-8B-Instruct-abliterated_via_adapter", + "name": "Llama-3.1-8B-Instruct-abliterated_via_adapter", + "developer": "grimjim", + "scores": { + "IFEval": 0.487, + "BBH": 0.5105, + "MATH Level 5": 0.1397, + "GPQA": 0.3138, + "MUSR": 0.401, + "MMLU-PRO": 0.3651 + } + }, + { + "model_id": "grimjim/Llama-3.1-Bonsaikraft-8B-Instruct", + "name": "Llama-3.1-Bonsaikraft-8B-Instruct", + "developer": "grimjim", + "scores": { + "IFEval": 0.425, + "BBH": 0.5287, + "MATH Level 5": 0.1314, + "GPQA": 0.3037, + "MUSR": 0.4235, + "MMLU-PRO": 0.3764 + } + }, + { + "model_id": "grimjim/Llama-Nephilim-Metamorphosis-v2-8B", + "name": "Llama-Nephilim-Metamorphosis-v2-8B", + "developer": "grimjim", + "scores": { + "IFEval": 0.4545, + "BBH": 0.5013, + "MATH Level 5": 0.1397, + "GPQA": 0.323, + "MUSR": 0.4091, + "MMLU-PRO": 0.3809 + } + }, + { + "model_id": "grimjim/Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B", + "name": "Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B", + "developer": "grimjim", + "scores": { + "IFEval": 0.4366, + "BBH": 0.5287, + "MATH Level 5": 0.3006, + "GPQA": 0.3112, + "MUSR": 0.3999, + "MMLU-PRO": 0.3684 + } + }, + { + "model_id": "grimjim/Magnolia-v1-Gemma2-8k-9B", + "name": "Magnolia-v1-Gemma2-8k-9B", + "developer": "grimjim", + "scores": { + "IFEval": 0.3531, + "BBH": 0.5589, + "MATH Level 5": 0.1684, + "GPQA": 0.3364, + "MUSR": 0.4645, + "MMLU-PRO": 0.4242 + } + }, + { + "model_id": "grimjim/Magnolia-v2-12B", + "name": "Magnolia-v2-12B", + "developer": "grimjim", + "scores": { + "IFEval": 0.3506, + "BBH": 0.529, + "MATH Level 5": 0.1292, + "GPQA": 0.3188, + "MUSR": 0.4171, + "MMLU-PRO": 0.3601 + } + }, + { + "model_id": "grimjim/Magnolia-v2-Gemma2-8k-9B", + "name": "Magnolia-v2-Gemma2-8k-9B", + "developer": "grimjim", + "scores": { + "IFEval": 0.7384, + "BBH": 0.6016, + "MATH Level 5": 0.2281, + "GPQA": 0.3574, + "MUSR": 0.4488, + "MMLU-PRO": 0.4332 + } + }, + { + "model_id": "grimjim/Magnolia-v3-12B", + "name": "Magnolia-v3-12B", + "developer": "grimjim", + "scores": { + "IFEval": 0.3965, + "BBH": 0.5327, + "MATH Level 5": 0.1352, + "GPQA": 0.3255, + "MUSR": 0.4184, + "MMLU-PRO": 0.3615 + } + }, + { + "model_id": "grimjim/Magnolia-v3-Gemma2-8k-9B", + "name": "Magnolia-v3-Gemma2-8k-9B", + "developer": "grimjim", + "scores": { + "IFEval": 0.7378, + "BBH": 0.6015, + "MATH Level 5": 0.2319, + "GPQA": 0.3565, + "MUSR": 0.4488, + "MMLU-PRO": 0.4337 + } + }, + { + "model_id": "grimjim/Magnolia-v4-12B", + "name": "Magnolia-v4-12B", + "developer": "grimjim", + "scores": { + "IFEval": 0.3418, + "BBH": 0.5431, + "MATH Level 5": 0.1314, + "GPQA": 0.328, + "MUSR": 0.4211, + "MMLU-PRO": 0.3672 + } + }, + { + "model_id": "grimjim/Magnolia-v5a-12B", + "name": "Magnolia-v5a-12B", + "developer": "grimjim", + "scores": { + "IFEval": 0.4114, + "BBH": 0.5312, + "MATH Level 5": 0.1375, + "GPQA": 0.3221, + "MUSR": 0.4145, + "MMLU-PRO": 0.3601 + } + }, + { + "model_id": "grimjim/Magot-v1-Gemma2-8k-9B", + "name": "Magot-v1-Gemma2-8k-9B", + "developer": "grimjim", + "scores": { + "IFEval": 0.2997, + "BBH": 0.6019, + "MATH Level 5": 0.0989, + "GPQA": 0.3465, + "MUSR": 0.4488, + "MMLU-PRO": 0.4337 + } + }, + { + "model_id": "grimjim/Magot-v2-Gemma2-8k-9B", + "name": "Magot-v2-Gemma2-8k-9B", + "developer": "grimjim", + "scores": { + "IFEval": 0.7347, + "BBH": 0.5897, + "MATH Level 5": 0.2017, + "GPQA": 0.354, + "MUSR": 0.4344, + "MMLU-PRO": 0.4223 + } + }, + { + "model_id": "grimjim/SauerHuatuoSkywork-o1-Llama-3.1-8B", + "name": "SauerHuatuoSkywork-o1-Llama-3.1-8B", + "developer": "grimjim", + "scores": { + "IFEval": 0.5219, + "BBH": 0.5222, + "MATH Level 5": 0.173, + "GPQA": 0.3213, + "MUSR": 0.4527, + "MMLU-PRO": 0.3991 + } + }, + { + "model_id": "grimjim/llama-3-Nephilim-v1-8B", + "name": "llama-3-Nephilim-v1-8B", + "developer": "grimjim", + "scores": { + "IFEval": 0.4277, + "BBH": 0.5132, + "MATH Level 5": 0.0906, + "GPQA": 0.302, + "MUSR": 0.4136, + "MMLU-PRO": 0.3796 + } + }, + { + "model_id": "grimjim/llama-3-Nephilim-v2-8B", + "name": "llama-3-Nephilim-v2-8B", + "developer": "grimjim", + "scores": { + "IFEval": 0.3922, + "BBH": 0.5048, + "MATH Level 5": 0.1065, + "GPQA": 0.2995, + "MUSR": 0.3895, + "MMLU-PRO": 0.3641 + } + }, + { + "model_id": "grimjim/llama-3-Nephilim-v2.1-8B", + "name": "llama-3-Nephilim-v2.1-8B", + "developer": "grimjim", + "scores": { + "IFEval": 0.3895, + "BBH": 0.5095, + "MATH Level 5": 0.0997, + "GPQA": 0.2995, + "MUSR": 0.3935, + "MMLU-PRO": 0.3644 + } + }, + { + "model_id": "grimjim/llama-3-Nephilim-v3-8B", + "name": "llama-3-Nephilim-v3-8B", + "developer": "grimjim", + "scores": { + "IFEval": 0.4174, + "BBH": 0.5013, + "MATH Level 5": 0.0952, + "GPQA": 0.2953, + "MUSR": 0.3989, + "MMLU-PRO": 0.3612 + } + }, + { + "model_id": "gupta-tanish/llama-7b-dpo-baseline", + "name": "llama-7b-dpo-baseline", + "developer": "gupta-tanish", + "scores": { + "IFEval": 0.2693, + "BBH": 0.3897, + "MATH Level 5": 0.0196, + "GPQA": 0.2626, + "MUSR": 0.4456, + "MMLU-PRO": 0.2028 + } + }, + { + "model_id": "gz987/qwen2.5-7b-cabs-v0.1", + "name": "qwen2.5-7b-cabs-v0.1", + "developer": "gz987", + "scores": { + "IFEval": 0.7506, + "BBH": 0.5482, + "MATH Level 5": 0.4796, + "GPQA": 0.3138, + "MUSR": 0.4376, + "MMLU-PRO": 0.4406 + } + }, + { + "model_id": "gz987/qwen2.5-7b-cabs-v0.2", + "name": "qwen2.5-7b-cabs-v0.2", + "developer": "gz987", + "scores": { + "IFEval": 0.7418, + "BBH": 0.5516, + "MATH Level 5": 0.4902, + "GPQA": 0.307, + "MUSR": 0.4429, + "MMLU-PRO": 0.4397 + } + }, + { + "model_id": "gz987/qwen2.5-7b-cabs-v0.3", + "name": "qwen2.5-7b-cabs-v0.3", + "developer": "gz987", + "scores": { + "IFEval": 0.757, + "BBH": 0.5494, + "MATH Level 5": 0.4932, + "GPQA": 0.307, + "MUSR": 0.443, + "MMLU-PRO": 0.4402 + } + }, + { + "model_id": "gz987/qwen2.5-7b-cabs-v0.4", + "name": "qwen2.5-7b-cabs-v0.4", + "developer": "gz987", + "scores": { + "IFEval": 0.7583, + "BBH": 0.5524, + "MATH Level 5": 0.4849, + "GPQA": 0.3079, + "MUSR": 0.443, + "MMLU-PRO": 0.4396 + } + }, + { + "model_id": "h2oai/h2o-danube-1.8b-chat", + "name": "h2o-danube-1.8b-chat", + "developer": "h2oai", + "scores": { + "IFEval": 0.2199, + "BBH": 0.322, + "MATH Level 5": 0.0136, + "GPQA": 0.2542, + "MUSR": 0.3989, + "MMLU-PRO": 0.1314 + } + }, + { + "model_id": "h2oai/h2o-danube3-4b-base", + "name": "h2o-danube3-4b-base", + "developer": "h2oai", + "scores": { + "IFEval": 0.2338, + "BBH": 0.3599, + "MATH Level 5": 0.0227, + "GPQA": 0.2911, + "MUSR": 0.3778, + "MMLU-PRO": 0.2109 + } + }, + { + "model_id": "h2oai/h2o-danube3-4b-chat", + "name": "h2o-danube3-4b-chat", + "developer": "h2oai", + "scores": { + "IFEval": 0.3629, + "BBH": 0.3466, + "MATH Level 5": 0.0408, + "GPQA": 0.2601, + "MUSR": 0.3781, + "MMLU-PRO": 0.2228 + } + }, + { + "model_id": "h2oai/h2o-danube3-500m-chat", + "name": "h2o-danube3-500m-chat", + "developer": "h2oai", + "scores": { + "IFEval": 0.2208, + "BBH": 0.3035, + "MATH Level 5": 0.0166, + "GPQA": 0.2307, + "MUSR": 0.3434, + "MMLU-PRO": 0.1144 + } + }, + { + "model_id": "h2oai/h2o-danube3.1-4b-chat", + "name": "h2o-danube3.1-4b-chat", + "developer": "h2oai", + "scores": { + "IFEval": 0.5021, + "BBH": 0.3608, + "MATH Level 5": 0.0332, + "GPQA": 0.2852, + "MUSR": 0.4102, + "MMLU-PRO": 0.2719 + } + }, + { + "model_id": "haoranxu/ALMA-13B-R", + "name": "ALMA-13B-R", + "developer": "haoranxu", + "scores": { + "IFEval": 0.0039, + "BBH": 0.3457, + "MATH Level 5": 0.0174, + "GPQA": 0.2576, + "MUSR": 0.3528, + "MMLU-PRO": 0.1817 + } + }, + { + "model_id": "haoranxu/Llama-3-Instruct-8B-CPO-SimPO", + "name": "Llama-3-Instruct-8B-CPO-SimPO", + "developer": "haoranxu", + "scores": { + "IFEval": 0.7046, + "BBH": 0.5048, + "MATH Level 5": 0.1027, + "GPQA": 0.2928, + "MUSR": 0.3567, + "MMLU-PRO": 0.3686 + } + }, + { + "model_id": "haoranxu/Llama-3-Instruct-8B-SimPO", + "name": "Llama-3-Instruct-8B-SimPO", + "developer": "haoranxu", + "scores": { + "IFEval": 0.7347, + "BBH": 0.4979, + "MATH Level 5": 0.0876, + "GPQA": 0.2903, + "MUSR": 0.3566, + "MMLU-PRO": 0.3733 + } + }, + { + "model_id": "hatemmahmoud/qwen2.5-1.5b-sft-raft-grpo-hra-doc", + "name": "qwen2.5-1.5b-sft-raft-grpo-hra-doc", + "developer": "hatemmahmoud", + "scores": { + "IFEval": 0.4196, + "BBH": 0.427, + "MATH Level 5": 0.2175, + "GPQA": 0.2676, + "MUSR": 0.361, + "MMLU-PRO": 0.2776 + } + }, + { + "model_id": "hon9kon9ize/CantoneseLLMChat-v0.5", + "name": "CantoneseLLMChat-v0.5", + "developer": "hon9kon9ize", + "scores": { + "IFEval": 0.3231, + "BBH": 0.4345, + "MATH Level 5": 0.0415, + "GPQA": 0.2777, + "MUSR": 0.4706, + "MMLU-PRO": 0.2504 + } + }, + { + "model_id": "hon9kon9ize/CantoneseLLMChat-v1.0-7B", + "name": "CantoneseLLMChat-v1.0-7B", + "developer": "hon9kon9ize", + "scores": { + "IFEval": 0.4455, + "BBH": 0.4866, + "MATH Level 5": 0.2107, + "GPQA": 0.3221, + "MUSR": 0.3883, + "MMLU-PRO": 0.3785 + } + }, + { + "model_id": "hongbai12/li-0.4-pre", + "name": "li-0.4-pre", + "developer": "hongbai12", + "scores": { + "IFEval": 0.52, + "BBH": 0.6298, + "MATH Level 5": 0.4924, + "GPQA": 0.323, + "MUSR": 0.4513, + "MMLU-PRO": 0.5015 + } + }, + { + "model_id": "hotmailuser/Deepseek-qwen-modelstock-2B", + "name": "Deepseek-qwen-modelstock-2B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.2149, + "BBH": 0.3549, + "MATH Level 5": 0.3399, + "GPQA": 0.2802, + "MUSR": 0.3475, + "MMLU-PRO": 0.1911 + } + }, + { + "model_id": "hotmailuser/Falcon3Slerp1-10B", + "name": "Falcon3Slerp1-10B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.5694, + "BBH": 0.617, + "MATH Level 5": 0.2598, + "GPQA": 0.344, + "MUSR": 0.4318, + "MMLU-PRO": 0.4402 + } + }, + { + "model_id": "hotmailuser/Falcon3Slerp2-10B", + "name": "Falcon3Slerp2-10B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.6118, + "BBH": 0.6164, + "MATH Level 5": 0.2319, + "GPQA": 0.3381, + "MUSR": 0.4096, + "MMLU-PRO": 0.4369 + } + }, + { + "model_id": "hotmailuser/Falcon3Slerp4-10B", + "name": "Falcon3Slerp4-10B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.6072, + "BBH": 0.6114, + "MATH Level 5": 0.2289, + "GPQA": 0.3289, + "MUSR": 0.4017, + "MMLU-PRO": 0.4387 + } + }, + { + "model_id": "hotmailuser/FalconSlerp-3B", + "name": "FalconSlerp-3B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.5695, + "BBH": 0.4624, + "MATH Level 5": 0.176, + "GPQA": 0.2878, + "MUSR": 0.3989, + "MMLU-PRO": 0.2968 + } + }, + { + "model_id": "hotmailuser/FalconSlerp1-7B", + "name": "FalconSlerp1-7B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.5395, + "BBH": 0.5355, + "MATH Level 5": 0.2379, + "GPQA": 0.3196, + "MUSR": 0.4452, + "MMLU-PRO": 0.4129 + } + }, + { + "model_id": "hotmailuser/FalconSlerp2-7B", + "name": "FalconSlerp2-7B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.616, + "BBH": 0.5538, + "MATH Level 5": 0.2983, + "GPQA": 0.3196, + "MUSR": 0.4479, + "MMLU-PRO": 0.4141 + } + }, + { + "model_id": "hotmailuser/FalconSlerp3-10B", + "name": "FalconSlerp3-10B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.6002, + "BBH": 0.606, + "MATH Level 5": 0.2273, + "GPQA": 0.3356, + "MUSR": 0.4031, + "MMLU-PRO": 0.4323 + } + }, + { + "model_id": "hotmailuser/FalconSlerp3-7B", + "name": "FalconSlerp3-7B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.6096, + "BBH": 0.5533, + "MATH Level 5": 0.3157, + "GPQA": 0.3188, + "MUSR": 0.4507, + "MMLU-PRO": 0.4127 + } + }, + { + "model_id": "hotmailuser/FalconSlerp4-7B", + "name": "FalconSlerp4-7B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.6285, + "BBH": 0.5524, + "MATH Level 5": 0.2213, + "GPQA": 0.3322, + "MUSR": 0.4585, + "MMLU-PRO": 0.4032 + } + }, + { + "model_id": "hotmailuser/FalconSlerp6-7B", + "name": "FalconSlerp6-7B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.6027, + "BBH": 0.5384, + "MATH Level 5": 0.2047, + "GPQA": 0.318, + "MUSR": 0.4492, + "MMLU-PRO": 0.3995 + } + }, + { + "model_id": "hotmailuser/Gemma2Crono-27B", + "name": "Gemma2Crono-27B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.7086, + "BBH": 0.6505, + "MATH Level 5": 0.2424, + "GPQA": 0.3708, + "MUSR": 0.4567, + "MMLU-PRO": 0.4633 + } + }, + { + "model_id": "hotmailuser/Gemma2SimPO-27B", + "name": "Gemma2SimPO-27B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.7222, + "BBH": 0.6413, + "MATH Level 5": 0.2817, + "GPQA": 0.3582, + "MUSR": 0.4447, + "MMLU-PRO": 0.4642 + } + }, + { + "model_id": "hotmailuser/Gemma2atlas-27B", + "name": "Gemma2atlas-27B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.7214, + "BBH": 0.6545, + "MATH Level 5": 0.2145, + "GPQA": 0.3557, + "MUSR": 0.4445, + "MMLU-PRO": 0.475 + } + }, + { + "model_id": "hotmailuser/Gemma2magnum-27b", + "name": "Gemma2magnum-27b", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.5051, + "BBH": 0.62, + "MATH Level 5": 0.2205, + "GPQA": 0.3851, + "MUSR": 0.4723, + "MMLU-PRO": 0.4596 + } + }, + { + "model_id": "hotmailuser/Llama-Hermes-slerp-8B", + "name": "Llama-Hermes-slerp-8B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.339, + "BBH": 0.531, + "MATH Level 5": 0.0801, + "GPQA": 0.2936, + "MUSR": 0.4078, + "MMLU-PRO": 0.3331 + } + }, + { + "model_id": "hotmailuser/Llama-Hermes-slerp2-8B", + "name": "Llama-Hermes-slerp2-8B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.3728, + "BBH": 0.5265, + "MATH Level 5": 0.0974, + "GPQA": 0.2961, + "MUSR": 0.4248, + "MMLU-PRO": 0.3379 + } + }, + { + "model_id": "hotmailuser/LlamaStock-8B", + "name": "LlamaStock-8B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.425, + "BBH": 0.5329, + "MATH Level 5": 0.1699, + "GPQA": 0.3272, + "MUSR": 0.4129, + "MMLU-PRO": 0.3807 + } + }, + { + "model_id": "hotmailuser/Mistral-modelstock-24B", + "name": "Mistral-modelstock-24B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.3424, + "BBH": 0.6452, + "MATH Level 5": 0.1307, + "GPQA": 0.4102, + "MUSR": 0.459, + "MMLU-PRO": 0.507 + } + }, + { + "model_id": "hotmailuser/Mistral-modelstock2-24B", + "name": "Mistral-modelstock2-24B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.4318, + "BBH": 0.6689, + "MATH Level 5": 0.2402, + "GPQA": 0.3926, + "MUSR": 0.4616, + "MMLU-PRO": 0.5318 + } + }, + { + "model_id": "hotmailuser/Phi4-Slerp4-14B", + "name": "Phi4-Slerp4-14B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.0629, + "BBH": 0.6731, + "MATH Level 5": 0.3474, + "GPQA": 0.3968, + "MUSR": 0.5097, + "MMLU-PRO": 0.5278 + } + }, + { + "model_id": "hotmailuser/Qwen2.5-HomerSlerp-7B", + "name": "Qwen2.5-HomerSlerp-7B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.4488, + "BBH": 0.5633, + "MATH Level 5": 0.3316, + "GPQA": 0.3138, + "MUSR": 0.4383, + "MMLU-PRO": 0.4549 + } + }, + { + "model_id": "hotmailuser/QwenModelStock-1.8B", + "name": "QwenModelStock-1.8B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.3263, + "BBH": 0.4188, + "MATH Level 5": 0.0989, + "GPQA": 0.2869, + "MUSR": 0.4359, + "MMLU-PRO": 0.2959 + } + }, + { + "model_id": "hotmailuser/QwenSlerp-14B", + "name": "QwenSlerp-14B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.7025, + "BBH": 0.6491, + "MATH Level 5": 0.3837, + "GPQA": 0.3876, + "MUSR": 0.4634, + "MMLU-PRO": 0.54 + } + }, + { + "model_id": "hotmailuser/QwenSlerp-3B", + "name": "QwenSlerp-3B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.4334, + "BBH": 0.4892, + "MATH Level 5": 0.2749, + "GPQA": 0.2945, + "MUSR": 0.4317, + "MMLU-PRO": 0.3693 + } + }, + { + "model_id": "hotmailuser/QwenSlerp-7B", + "name": "QwenSlerp-7B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.4673, + "BBH": 0.5636, + "MATH Level 5": 0.3444, + "GPQA": 0.318, + "MUSR": 0.4409, + "MMLU-PRO": 0.4509 + } + }, + { + "model_id": "hotmailuser/QwenSlerp2-14B", + "name": "QwenSlerp2-14B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.7037, + "BBH": 0.6493, + "MATH Level 5": 0.3965, + "GPQA": 0.3809, + "MUSR": 0.4807, + "MMLU-PRO": 0.5379 + } + }, + { + "model_id": "hotmailuser/QwenSlerp2-3B", + "name": "QwenSlerp2-3B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.428, + "BBH": 0.4802, + "MATH Level 5": 0.2606, + "GPQA": 0.297, + "MUSR": 0.4252, + "MMLU-PRO": 0.3742 + } + }, + { + "model_id": "hotmailuser/QwenSlerp3-14B", + "name": "QwenSlerp3-14B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.6632, + "BBH": 0.6267, + "MATH Level 5": 0.4305, + "GPQA": 0.3666, + "MUSR": 0.4808, + "MMLU-PRO": 0.5263 + } + }, + { + "model_id": "hotmailuser/QwenSparse-7B", + "name": "QwenSparse-7B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.1086, + "BBH": 0.2896, + "MATH Level 5": 0.0106, + "GPQA": 0.2601, + "MUSR": 0.3562, + "MMLU-PRO": 0.1122 + } + }, + { + "model_id": "hotmailuser/QwenStock-0.5B", + "name": "QwenStock-0.5B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.2049, + "BBH": 0.2912, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.3575, + "MMLU-PRO": 0.1167 + } + }, + { + "model_id": "hotmailuser/QwenStock-1.7B", + "name": "QwenStock-1.7B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.3214, + "BBH": 0.4188, + "MATH Level 5": 0.0997, + "GPQA": 0.2878, + "MUSR": 0.4412, + "MMLU-PRO": 0.2955 + } + }, + { + "model_id": "hotmailuser/QwenStock1-14B", + "name": "QwenStock1-14B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.6693, + "BBH": 0.6502, + "MATH Level 5": 0.3701, + "GPQA": 0.3859, + "MUSR": 0.4781, + "MMLU-PRO": 0.5416 + } + }, + { + "model_id": "hotmailuser/RombosBeagle-v2beta-MGS-32B", + "name": "RombosBeagle-v2beta-MGS-32B", + "developer": "hotmailuser", + "scores": { + "IFEval": 0.5157, + "BBH": 0.7037, + "MATH Level 5": 0.4992, + "GPQA": 0.38, + "MUSR": 0.5021, + "MMLU-PRO": 0.5908 + } + }, + { + "model_id": "huggyllama/llama-13b", + "name": "llama-13b", + "developer": "huggyllama", + "scores": { + "IFEval": 0.2411, + "BBH": 0.3988, + "MATH Level 5": 0.0204, + "GPQA": 0.255, + "MUSR": 0.3462, + "MMLU-PRO": 0.1952 + } + }, + { + "model_id": "huggyllama/llama-65b", + "name": "llama-65b", + "developer": "huggyllama", + "scores": { + "IFEval": 0.2526, + "BBH": 0.4703, + "MATH Level 5": 0.031, + "GPQA": 0.276, + "MUSR": 0.3595, + "MMLU-PRO": 0.3078 + } + }, + { + "model_id": "huggyllama/llama-7b", + "name": "llama-7b", + "developer": "huggyllama", + "scores": { + "IFEval": 0.2501, + "BBH": 0.3277, + "MATH Level 5": 0.0083, + "GPQA": 0.2525, + "MUSR": 0.3354, + "MMLU-PRO": 0.1313 + } + }, + { + "model_id": "huihui-ai/DeepSeek-R1-Distill-Qwen-14B-abliterated-v2", + "name": "DeepSeek-R1-Distill-Qwen-14B-abliterated-v2", + "developer": "huihui-ai", + "scores": { + "IFEval": 0.4211, + "BBH": 0.3487, + "MATH Level 5": 0.2205, + "GPQA": 0.276, + "MUSR": 0.4701, + "MMLU-PRO": 0.1915 + } + }, + { + "model_id": "huihui-ai/QwQ-32B-Coder-Fusion-7030", + "name": "QwQ-32B-Coder-Fusion-7030", + "developer": "huihui-ai", + "scores": { + "IFEval": 0.3865, + "BBH": 0.6178, + "MATH Level 5": 0.2795, + "GPQA": 0.2844, + "MUSR": 0.3922, + "MMLU-PRO": 0.4368 + } + }, + { + "model_id": "huihui-ai/QwQ-32B-Coder-Fusion-8020", + "name": "QwQ-32B-Coder-Fusion-8020", + "developer": "huihui-ai", + "scores": { + "IFEval": 0.6021, + "BBH": 0.6665, + "MATH Level 5": 0.4592, + "GPQA": 0.3549, + "MUSR": 0.4293, + "MMLU-PRO": 0.5367 + } + }, + { + "model_id": "huihui-ai/QwQ-32B-Coder-Fusion-9010", + "name": "QwQ-32B-Coder-Fusion-9010", + "developer": "huihui-ai", + "scores": { + "IFEval": 0.5778, + "BBH": 0.6727, + "MATH Level 5": 0.5317, + "GPQA": 0.3616, + "MUSR": 0.4682, + "MMLU-PRO": 0.56 + } + }, + { + "model_id": "huihui-ai/Qwen2.5-14B-Instruct-abliterated-v2", + "name": "Qwen2.5-14B-Instruct-abliterated-v2", + "developer": "huihui-ai", + "scores": { + "IFEval": 0.8328, + "BBH": 0.6324, + "MATH Level 5": 0.5302, + "GPQA": 0.3339, + "MUSR": 0.422, + "MMLU-PRO": 0.4962 + } + }, + { + "model_id": "huihui-ai/Qwen2.5-72B-Instruct-abliterated", + "name": "Qwen2.5-72B-Instruct-abliterated", + "developer": "huihui-ai", + "scores": { + "IFEval": 0.8593, + "BBH": 0.719, + "MATH Level 5": 0.6012, + "GPQA": 0.3951, + "MUSR": 0.4233, + "MMLU-PRO": 0.5537 + } + }, + { + "model_id": "huihui-ai/Qwen2.5-7B-Instruct-abliterated", + "name": "Qwen2.5-7B-Instruct-abliterated", + "developer": "huihui-ai", + "scores": { + "IFEval": 0.7546, + "BBH": 0.5262, + "MATH Level 5": 0.4577, + "GPQA": 0.3154, + "MUSR": 0.3967, + "MMLU-PRO": 0.418 + } + }, + { + "model_id": "huihui-ai/Qwen2.5-7B-Instruct-abliterated-v2", + "name": "Qwen2.5-7B-Instruct-abliterated-v2", + "developer": "huihui-ai", + "scores": { + "IFEval": 0.7606, + "BBH": 0.5377, + "MATH Level 5": 0.4637, + "GPQA": 0.3087, + "MUSR": 0.3981, + "MMLU-PRO": 0.4208 + } + }, + { + "model_id": "huu-ontocord/wide_3b_orpo_stage1.1-ss1-orpo3", + "name": "wide_3b_orpo_stage1.1-ss1-orpo3", + "developer": "huu-ontocord", + "scores": { + "IFEval": 0.1505, + "BBH": 0.2937, + "MATH Level 5": 0.0098, + "GPQA": 0.2584, + "MUSR": 0.3618, + "MMLU-PRO": 0.1164 + } + }, + { + "model_id": "iFaz/llama31_8B_en_emo_v4", + "name": "llama31_8B_en_emo_v4", + "developer": "iFaz", + "scores": { + "IFEval": 0.3043, + "BBH": 0.4916, + "MATH Level 5": 0.0884, + "GPQA": 0.297, + "MUSR": 0.3643, + "MMLU-PRO": 0.3049 + } + }, + { + "model_id": "iFaz/llama32_1B_en_emo_v1", + "name": "llama32_1B_en_emo_v1", + "developer": "iFaz", + "scores": { + "IFEval": 0.4408, + "BBH": 0.338, + "MATH Level 5": 0.0378, + "GPQA": 0.25, + "MUSR": 0.3489, + "MMLU-PRO": 0.1761 + } + }, + { + "model_id": "iFaz/llama32_3B_en_emo_1000_stp", + "name": "llama32_3B_en_emo_1000_stp", + "developer": "iFaz", + "scores": { + "IFEval": 0.7295, + "BBH": 0.4522, + "MATH Level 5": 0.1465, + "GPQA": 0.2777, + "MUSR": 0.3621, + "MMLU-PRO": 0.3123 + } + }, + { + "model_id": "iFaz/llama32_3B_en_emo_2000_stp", + "name": "llama32_3B_en_emo_2000_stp", + "developer": "iFaz", + "scores": { + "IFEval": 0.7369, + "BBH": 0.4535, + "MATH Level 5": 0.1533, + "GPQA": 0.2836, + "MUSR": 0.3527, + "MMLU-PRO": 0.3098 + } + }, + { + "model_id": "iFaz/llama32_3B_en_emo_300_stp", + "name": "llama32_3B_en_emo_300_stp", + "developer": "iFaz", + "scores": { + "IFEval": 0.7256, + "BBH": 0.4505, + "MATH Level 5": 0.1601, + "GPQA": 0.2743, + "MUSR": 0.3621, + "MMLU-PRO": 0.3148 + } + }, + { + "model_id": "iFaz/llama32_3B_en_emo_5000_stp", + "name": "llama32_3B_en_emo_5000_stp", + "developer": "iFaz", + "scores": { + "IFEval": 0.71, + "BBH": 0.4568, + "MATH Level 5": 0.1292, + "GPQA": 0.2794, + "MUSR": 0.3446, + "MMLU-PRO": 0.3067 + } + }, + { + "model_id": "iFaz/llama32_3B_en_emo_v2", + "name": "llama32_3B_en_emo_v2", + "developer": "iFaz", + "scores": { + "IFEval": 0.5454, + "BBH": 0.4284, + "MATH Level 5": 0.1088, + "GPQA": 0.2676, + "MUSR": 0.3482, + "MMLU-PRO": 0.3004 + } + }, + { + "model_id": "iFaz/llama32_3B_en_emo_v3", + "name": "llama32_3B_en_emo_v3", + "developer": "iFaz", + "scores": { + "IFEval": 0.5759, + "BBH": 0.4301, + "MATH Level 5": 0.068, + "GPQA": 0.2676, + "MUSR": 0.3553, + "MMLU-PRO": 0.271 + } + }, + { + "model_id": "iRyanBell/ARC1", + "name": "ARC1", + "developer": "iRyanBell", + "scores": { + "IFEval": 0.4411, + "BBH": 0.4903, + "MATH Level 5": 0.0687, + "GPQA": 0.2945, + "MUSR": 0.3991, + "MMLU-PRO": 0.3371 + } + }, + { + "model_id": "iRyanBell/ARC1-II", + "name": "ARC1-II", + "developer": "iRyanBell", + "scores": { + "IFEval": 0.1708, + "BBH": 0.3382, + "MATH Level 5": 0.0219, + "GPQA": 0.2718, + "MUSR": 0.4913, + "MMLU-PRO": 0.1686 + } + }, + { + "model_id": "ibivibiv/colossus_120b", + "name": "colossus_120b", + "developer": "ibivibiv", + "scores": { + "IFEval": 0.4276, + "BBH": 0.6061, + "MATH Level 5": 0.0566, + "GPQA": 0.3087, + "MUSR": 0.4733, + "MMLU-PRO": 0.3961 + } + }, + { + "model_id": "ibivibiv/multimaster-7b-v6", + "name": "multimaster-7b-v6", + "developer": "ibivibiv", + "scores": { + "IFEval": 0.4473, + "BBH": 0.5194, + "MATH Level 5": 0.0559, + "GPQA": 0.3037, + "MUSR": 0.4396, + "MMLU-PRO": 0.3095 + } + }, + { + "model_id": "ibm-granite/granite-3.0-1b-a400m-base", + "name": "granite-3.0-1b-a400m-base", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.2404, + "BBH": 0.3221, + "MATH Level 5": 0.0264, + "GPQA": 0.2475, + "MUSR": 0.3367, + "MMLU-PRO": 0.1152 + } + }, + { + "model_id": "ibm-granite/granite-3.0-1b-a400m-instruct", + "name": "granite-3.0-1b-a400m-instruct", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.3332, + "BBH": 0.3224, + "MATH Level 5": 0.0279, + "GPQA": 0.2609, + "MUSR": 0.3623, + "MMLU-PRO": 0.1244 + } + }, + { + "model_id": "ibm-granite/granite-3.0-2b-base", + "name": "granite-3.0-2b-base", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.3874, + "BBH": 0.4047, + "MATH Level 5": 0.0544, + "GPQA": 0.2802, + "MUSR": 0.3434, + "MMLU-PRO": 0.2381 + } + }, + { + "model_id": "ibm-granite/granite-3.0-2b-instruct", + "name": "granite-3.0-2b-instruct", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.514, + "BBH": 0.4412, + "MATH Level 5": 0.0921, + "GPQA": 0.2995, + "MUSR": 0.3515, + "MMLU-PRO": 0.2814 + } + }, + { + "model_id": "ibm-granite/granite-3.0-3b-a800m-base", + "name": "granite-3.0-3b-a800m-base", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.2732, + "BBH": 0.3667, + "MATH Level 5": 0.0483, + "GPQA": 0.2517, + "MUSR": 0.342, + "MMLU-PRO": 0.1891 + } + }, + { + "model_id": "ibm-granite/granite-3.0-3b-a800m-instruct", + "name": "granite-3.0-3b-a800m-instruct", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.4298, + "BBH": 0.3753, + "MATH Level 5": 0.0702, + "GPQA": 0.281, + "MUSR": 0.3487, + "MMLU-PRO": 0.2152 + } + }, + { + "model_id": "ibm-granite/granite-3.0-8b-base", + "name": "granite-3.0-8b-base", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.4583, + "BBH": 0.4944, + "MATH Level 5": 0.1012, + "GPQA": 0.3255, + "MUSR": 0.4081, + "MMLU-PRO": 0.3313 + } + }, + { + "model_id": "ibm-granite/granite-3.0-8b-instruct", + "name": "granite-3.0-8b-instruct", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.531, + "BBH": 0.5192, + "MATH Level 5": 0.142, + "GPQA": 0.3322, + "MUSR": 0.3901, + "MMLU-PRO": 0.3457 + } + }, + { + "model_id": "ibm-granite/granite-3.1-1b-a400m-base", + "name": "granite-3.1-1b-a400m-base", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.2519, + "BBH": 0.3299, + "MATH Level 5": 0.0272, + "GPQA": 0.2517, + "MUSR": 0.3501, + "MMLU-PRO": 0.1139 + } + }, + { + "model_id": "ibm-granite/granite-3.1-1b-a400m-instruct", + "name": "granite-3.1-1b-a400m-instruct", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.4686, + "BBH": 0.328, + "MATH Level 5": 0.0453, + "GPQA": 0.2399, + "MUSR": 0.3302, + "MMLU-PRO": 0.1217 + } + }, + { + "model_id": "ibm-granite/granite-3.1-2b-base", + "name": "granite-3.1-2b-base", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.3522, + "BBH": 0.4047, + "MATH Level 5": 0.0566, + "GPQA": 0.2777, + "MUSR": 0.3486, + "MMLU-PRO": 0.2251 + } + }, + { + "model_id": "ibm-granite/granite-3.1-2b-instruct", + "name": "granite-3.1-2b-instruct", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.6286, + "BBH": 0.4409, + "MATH Level 5": 0.1526, + "GPQA": 0.2894, + "MUSR": 0.3605, + "MMLU-PRO": 0.2819 + } + }, + { + "model_id": "ibm-granite/granite-3.1-3b-a800m-base", + "name": "granite-3.1-3b-a800m-base", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.2996, + "BBH": 0.3628, + "MATH Level 5": 0.0453, + "GPQA": 0.2777, + "MUSR": 0.3275, + "MMLU-PRO": 0.1793 + } + }, + { + "model_id": "ibm-granite/granite-3.1-3b-a800m-instruct", + "name": "granite-3.1-3b-a800m-instruct", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.5516, + "BBH": 0.4009, + "MATH Level 5": 0.114, + "GPQA": 0.2886, + "MUSR": 0.3486, + "MMLU-PRO": 0.2148 + } + }, + { + "model_id": "ibm-granite/granite-3.1-8b-base", + "name": "granite-3.1-8b-base", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.4221, + "BBH": 0.4777, + "MATH Level 5": 0.0944, + "GPQA": 0.3213, + "MUSR": 0.3922, + "MMLU-PRO": 0.3232 + } + }, + { + "model_id": "ibm-granite/granite-3.1-8b-instruct", + "name": "granite-3.1-8b-instruct", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.7208, + "BBH": 0.5364, + "MATH Level 5": 0.2198, + "GPQA": 0.3121, + "MUSR": 0.4707, + "MMLU-PRO": 0.3537 + } + }, + { + "model_id": "ibm-granite/granite-3.2-2b-instruct", + "name": "granite-3.2-2b-instruct", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.6152, + "BBH": 0.4387, + "MATH Level 5": 0.1443, + "GPQA": 0.2903, + "MUSR": 0.3646, + "MMLU-PRO": 0.2783 + } + }, + { + "model_id": "ibm-granite/granite-3.2-8b-instruct", + "name": "granite-3.2-8b-instruct", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.7275, + "BBH": 0.5402, + "MATH Level 5": 0.2379, + "GPQA": 0.3154, + "MUSR": 0.4562, + "MMLU-PRO": 0.3512 + } + }, + { + "model_id": "ibm-granite/granite-7b-base", + "name": "granite-7b-base", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.2414, + "BBH": 0.348, + "MATH Level 5": 0.0159, + "GPQA": 0.2458, + "MUSR": 0.3555, + "MMLU-PRO": 0.1834 + } + }, + { + "model_id": "ibm-granite/granite-7b-instruct", + "name": "granite-7b-instruct", + "developer": "ibm-granite", + "scores": { + "IFEval": 0.2972, + "BBH": 0.3723, + "MATH Level 5": 0.0204, + "GPQA": 0.2852, + "MUSR": 0.402, + "MMLU-PRO": 0.2286 + } + }, + { + "model_id": "ibm/PowerLM-3b", + "name": "PowerLM-3b", + "developer": "ibm", + "scores": { + "IFEval": 0.3321, + "BBH": 0.3679, + "MATH Level 5": 0.0363, + "GPQA": 0.2752, + "MUSR": 0.3563, + "MMLU-PRO": 0.2016 + } + }, + { + "model_id": "ibm/merlinite-7b", + "name": "merlinite-7b", + "developer": "ibm", + "scores": { + "IFEval": 0.2499, + "BBH": 0.5007, + "MATH Level 5": 0.0242, + "GPQA": 0.297, + "MUSR": 0.4412, + "MMLU-PRO": 0.3068 + } + }, + { + "model_id": "icefog72/Ice0.15-02.10-RP", + "name": "Ice0.15-02.10-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5343, + "BBH": 0.4976, + "MATH Level 5": 0.0574, + "GPQA": 0.2777, + "MUSR": 0.432, + "MMLU-PRO": 0.3066 + } + }, + { + "model_id": "icefog72/Ice0.16-02.10-RP", + "name": "Ice0.16-02.10-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5069, + "BBH": 0.4946, + "MATH Level 5": 0.0589, + "GPQA": 0.2794, + "MUSR": 0.4334, + "MMLU-PRO": 0.3068 + } + }, + { + "model_id": "icefog72/Ice0.17-03.10-RP", + "name": "Ice0.17-03.10-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5124, + "BBH": 0.5007, + "MATH Level 5": 0.0612, + "GPQA": 0.2819, + "MUSR": 0.4334, + "MMLU-PRO": 0.3085 + } + }, + { + "model_id": "icefog72/Ice0.27-06.11-RP", + "name": "Ice0.27-06.11-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4918, + "BBH": 0.5112, + "MATH Level 5": 0.0566, + "GPQA": 0.3121, + "MUSR": 0.4328, + "MMLU-PRO": 0.3154 + } + }, + { + "model_id": "icefog72/Ice0.29-06.11-RP", + "name": "Ice0.29-06.11-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4861, + "BBH": 0.5088, + "MATH Level 5": 0.0566, + "GPQA": 0.3029, + "MUSR": 0.4459, + "MMLU-PRO": 0.3093 + } + }, + { + "model_id": "icefog72/Ice0.31-08.11-RP", + "name": "Ice0.31-08.11-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5146, + "BBH": 0.5032, + "MATH Level 5": 0.0612, + "GPQA": 0.3079, + "MUSR": 0.4277, + "MMLU-PRO": 0.3131 + } + }, + { + "model_id": "icefog72/Ice0.32-10.11-RP", + "name": "Ice0.32-10.11-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4915, + "BBH": 0.5048, + "MATH Level 5": 0.0514, + "GPQA": 0.3121, + "MUSR": 0.4382, + "MMLU-PRO": 0.31 + } + }, + { + "model_id": "icefog72/Ice0.34b-14.11-RP", + "name": "Ice0.34b-14.11-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4762, + "BBH": 0.5067, + "MATH Level 5": 0.065, + "GPQA": 0.3096, + "MUSR": 0.442, + "MMLU-PRO": 0.3125 + } + }, + { + "model_id": "icefog72/Ice0.34n-14.11-RP", + "name": "Ice0.34n-14.11-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4787, + "BBH": 0.5091, + "MATH Level 5": 0.0725, + "GPQA": 0.3138, + "MUSR": 0.438, + "MMLU-PRO": 0.3124 + } + }, + { + "model_id": "icefog72/Ice0.37-18.11-RP", + "name": "Ice0.37-18.11-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4972, + "BBH": 0.5084, + "MATH Level 5": 0.0642, + "GPQA": 0.3121, + "MUSR": 0.4339, + "MMLU-PRO": 0.3143 + } + }, + { + "model_id": "icefog72/Ice0.38-19.11-RP", + "name": "Ice0.38-19.11-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4403, + "BBH": 0.5101, + "MATH Level 5": 0.0551, + "GPQA": 0.3045, + "MUSR": 0.4367, + "MMLU-PRO": 0.314 + } + }, + { + "model_id": "icefog72/Ice0.39-19.11-RP", + "name": "Ice0.39-19.11-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4757, + "BBH": 0.5093, + "MATH Level 5": 0.0498, + "GPQA": 0.3104, + "MUSR": 0.4341, + "MMLU-PRO": 0.3127 + } + }, + { + "model_id": "icefog72/Ice0.40-20.11-RP", + "name": "Ice0.40-20.11-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4763, + "BBH": 0.5093, + "MATH Level 5": 0.0642, + "GPQA": 0.307, + "MUSR": 0.4446, + "MMLU-PRO": 0.3099 + } + }, + { + "model_id": "icefog72/Ice0.41-22.11-RP", + "name": "Ice0.41-22.11-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.462, + "BBH": 0.4723, + "MATH Level 5": 0.031, + "GPQA": 0.2869, + "MUSR": 0.456, + "MMLU-PRO": 0.2618 + } + }, + { + "model_id": "icefog72/Ice0.50-16.01-RP", + "name": "Ice0.50-16.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4385, + "BBH": 0.498, + "MATH Level 5": 0.0468, + "GPQA": 0.2995, + "MUSR": 0.4381, + "MMLU-PRO": 0.3069 + } + }, + { + "model_id": "icefog72/Ice0.50.1-16.01-RP", + "name": "Ice0.50.1-16.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4829, + "BBH": 0.5107, + "MATH Level 5": 0.0612, + "GPQA": 0.3096, + "MUSR": 0.4327, + "MMLU-PRO": 0.3132 + } + }, + { + "model_id": "icefog72/Ice0.51-16.01-RP", + "name": "Ice0.51-16.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4431, + "BBH": 0.5044, + "MATH Level 5": 0.0514, + "GPQA": 0.3045, + "MUSR": 0.4437, + "MMLU-PRO": 0.306 + } + }, + { + "model_id": "icefog72/Ice0.51.1-16.01-RP", + "name": "Ice0.51.1-16.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4573, + "BBH": 0.5121, + "MATH Level 5": 0.0642, + "GPQA": 0.3062, + "MUSR": 0.4394, + "MMLU-PRO": 0.3104 + } + }, + { + "model_id": "icefog72/Ice0.52-16.01-RP", + "name": "Ice0.52-16.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4503, + "BBH": 0.5047, + "MATH Level 5": 0.0506, + "GPQA": 0.3087, + "MUSR": 0.4396, + "MMLU-PRO": 0.308 + } + }, + { + "model_id": "icefog72/Ice0.52.1-16.01-RP", + "name": "Ice0.52.1-16.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4549, + "BBH": 0.5106, + "MATH Level 5": 0.0627, + "GPQA": 0.3037, + "MUSR": 0.4394, + "MMLU-PRO": 0.3105 + } + }, + { + "model_id": "icefog72/Ice0.53-16.01-RP", + "name": "Ice0.53-16.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4741, + "BBH": 0.5102, + "MATH Level 5": 0.0634, + "GPQA": 0.3087, + "MUSR": 0.4327, + "MMLU-PRO": 0.313 + } + }, + { + "model_id": "icefog72/Ice0.54-17.01-RP", + "name": "Ice0.54-17.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4379, + "BBH": 0.4853, + "MATH Level 5": 0.0408, + "GPQA": 0.3096, + "MUSR": 0.4874, + "MMLU-PRO": 0.2326 + } + }, + { + "model_id": "icefog72/Ice0.55-17.01-RP", + "name": "Ice0.55-17.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4961, + "BBH": 0.5077, + "MATH Level 5": 0.0604, + "GPQA": 0.2869, + "MUSR": 0.4725, + "MMLU-PRO": 0.2658 + } + }, + { + "model_id": "icefog72/Ice0.57-17.01-RP", + "name": "Ice0.57-17.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5152, + "BBH": 0.5064, + "MATH Level 5": 0.0514, + "GPQA": 0.297, + "MUSR": 0.4686, + "MMLU-PRO": 0.2651 + } + }, + { + "model_id": "icefog72/Ice0.60-18.01-RP", + "name": "Ice0.60-18.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5374, + "BBH": 0.5094, + "MATH Level 5": 0.0536, + "GPQA": 0.3045, + "MUSR": 0.467, + "MMLU-PRO": 0.2837 + } + }, + { + "model_id": "icefog72/Ice0.60.1-18.01-RP", + "name": "Ice0.60.1-18.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5188, + "BBH": 0.512, + "MATH Level 5": 0.0461, + "GPQA": 0.302, + "MUSR": 0.4498, + "MMLU-PRO": 0.2914 + } + }, + { + "model_id": "icefog72/Ice0.61-18.01-RP", + "name": "Ice0.61-18.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5441, + "BBH": 0.5105, + "MATH Level 5": 0.0468, + "GPQA": 0.2987, + "MUSR": 0.4697, + "MMLU-PRO": 0.2709 + } + }, + { + "model_id": "icefog72/Ice0.62-18.01-RP", + "name": "Ice0.62-18.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5367, + "BBH": 0.5103, + "MATH Level 5": 0.0574, + "GPQA": 0.2978, + "MUSR": 0.4538, + "MMLU-PRO": 0.2877 + } + }, + { + "model_id": "icefog72/Ice0.62.1-24.01-RP", + "name": "Ice0.62.1-24.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5182, + "BBH": 0.5109, + "MATH Level 5": 0.0559, + "GPQA": 0.3003, + "MUSR": 0.4551, + "MMLU-PRO": 0.2871 + } + }, + { + "model_id": "icefog72/Ice0.64-24.01-RP", + "name": "Ice0.64-24.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5441, + "BBH": 0.506, + "MATH Level 5": 0.0627, + "GPQA": 0.3037, + "MUSR": 0.462, + "MMLU-PRO": 0.2933 + } + }, + { + "model_id": "icefog72/Ice0.64.1-24.01-RP", + "name": "Ice0.64.1-24.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5447, + "BBH": 0.506, + "MATH Level 5": 0.0627, + "GPQA": 0.3037, + "MUSR": 0.462, + "MMLU-PRO": 0.2933 + } + }, + { + "model_id": "icefog72/Ice0.65-25.01-RP", + "name": "Ice0.65-25.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5029, + "BBH": 0.5096, + "MATH Level 5": 0.065, + "GPQA": 0.3045, + "MUSR": 0.434, + "MMLU-PRO": 0.2997 + } + }, + { + "model_id": "icefog72/Ice0.66-25.01-RP", + "name": "Ice0.66-25.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5325, + "BBH": 0.5129, + "MATH Level 5": 0.0604, + "GPQA": 0.307, + "MUSR": 0.4434, + "MMLU-PRO": 0.3039 + } + }, + { + "model_id": "icefog72/Ice0.67-25.01-RP", + "name": "Ice0.67-25.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5361, + "BBH": 0.5113, + "MATH Level 5": 0.0748, + "GPQA": 0.2852, + "MUSR": 0.4279, + "MMLU-PRO": 0.3097 + } + }, + { + "model_id": "icefog72/Ice0.68-25.01-RP", + "name": "Ice0.68-25.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5514, + "BBH": 0.513, + "MATH Level 5": 0.0725, + "GPQA": 0.3104, + "MUSR": 0.4446, + "MMLU-PRO": 0.3012 + } + }, + { + "model_id": "icefog72/Ice0.69-25.01-RP", + "name": "Ice0.69-25.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5438, + "BBH": 0.5098, + "MATH Level 5": 0.0566, + "GPQA": 0.3129, + "MUSR": 0.4486, + "MMLU-PRO": 0.2965 + } + }, + { + "model_id": "icefog72/Ice0.7-29.09-RP", + "name": "Ice0.7-29.09-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5176, + "BBH": 0.5048, + "MATH Level 5": 0.0665, + "GPQA": 0.2878, + "MUSR": 0.4238, + "MMLU-PRO": 0.3127 + } + }, + { + "model_id": "icefog72/Ice0.70-25.01-RP", + "name": "Ice0.70-25.01-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5498, + "BBH": 0.5136, + "MATH Level 5": 0.0597, + "GPQA": 0.3079, + "MUSR": 0.4512, + "MMLU-PRO": 0.2996 + } + }, + { + "model_id": "icefog72/Ice0.70.1-01.02-RP", + "name": "Ice0.70.1-01.02-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.507, + "BBH": 0.506, + "MATH Level 5": 0.034, + "GPQA": 0.2978, + "MUSR": 0.4599, + "MMLU-PRO": 0.2749 + } + }, + { + "model_id": "icefog72/Ice0.73-01.02-RP", + "name": "Ice0.73-01.02-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5292, + "BBH": 0.5103, + "MATH Level 5": 0.0385, + "GPQA": 0.2911, + "MUSR": 0.4664, + "MMLU-PRO": 0.2702 + } + }, + { + "model_id": "icefog72/Ice0.74-02.02-RP", + "name": "Ice0.74-02.02-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.2935, + "BBH": 0.4646, + "MATH Level 5": 0.0015, + "GPQA": 0.2961, + "MUSR": 0.428, + "MMLU-PRO": 0.2143 + } + }, + { + "model_id": "icefog72/Ice0.76-02.02-RP", + "name": "Ice0.76-02.02-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4529, + "BBH": 0.5086, + "MATH Level 5": 0.0144, + "GPQA": 0.2869, + "MUSR": 0.4362, + "MMLU-PRO": 0.2652 + } + }, + { + "model_id": "icefog72/Ice0.77-02.02-RP", + "name": "Ice0.77-02.02-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.531, + "BBH": 0.5109, + "MATH Level 5": 0.0393, + "GPQA": 0.2852, + "MUSR": 0.4765, + "MMLU-PRO": 0.2999 + } + }, + { + "model_id": "icefog72/Ice0.78-02.02-RP", + "name": "Ice0.78-02.02-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4053, + "BBH": 0.5002, + "MATH Level 5": 0.0438, + "GPQA": 0.2936, + "MUSR": 0.4686, + "MMLU-PRO": 0.2955 + } + }, + { + "model_id": "icefog72/Ice0.80-03.02-RP", + "name": "Ice0.80-03.02-RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.5516, + "BBH": 0.5098, + "MATH Level 5": 0.0559, + "GPQA": 0.2785, + "MUSR": 0.4923, + "MMLU-PRO": 0.2912 + } + }, + { + "model_id": "icefog72/IceCocoaRP-7b", + "name": "IceCocoaRP-7b", + "developer": "icefog72", + "scores": { + "IFEval": 0.4962, + "BBH": 0.4938, + "MATH Level 5": 0.0574, + "GPQA": 0.2953, + "MUSR": 0.4198, + "MMLU-PRO": 0.3098 + } + }, + { + "model_id": "icefog72/IceCoffeeRP-7b", + "name": "IceCoffeeRP-7b", + "developer": "icefog72", + "scores": { + "IFEval": 0.4959, + "BBH": 0.4889, + "MATH Level 5": 0.0544, + "GPQA": 0.2852, + "MUSR": 0.416, + "MMLU-PRO": 0.2975 + } + }, + { + "model_id": "icefog72/IceDrinkByFrankensteinV3RP", + "name": "IceDrinkByFrankensteinV3RP", + "developer": "icefog72", + "scores": { + "IFEval": 0.4975, + "BBH": 0.4833, + "MATH Level 5": 0.0506, + "GPQA": 0.2617, + "MUSR": 0.4253, + "MMLU-PRO": 0.2927 + } + }, + { + "model_id": "icefog72/IceDrinkNameGoesHereRP-7b-Model_Stock", + "name": "IceDrinkNameGoesHereRP-7b-Model_Stock", + "developer": "icefog72", + "scores": { + "IFEval": 0.4968, + "BBH": 0.4658, + "MATH Level 5": 0.0408, + "GPQA": 0.2685, + "MUSR": 0.4067, + "MMLU-PRO": 0.2817 + } + }, + { + "model_id": "icefog72/IceDrinkNameNotFoundRP-7b-Model_Stock", + "name": "IceDrinkNameNotFoundRP-7b-Model_Stock", + "developer": "icefog72", + "scores": { + "IFEval": 0.513, + "BBH": 0.5026, + "MATH Level 5": 0.0604, + "GPQA": 0.2777, + "MUSR": 0.4372, + "MMLU-PRO": 0.3064 + } + }, + { + "model_id": "icefog72/IceDrunkCherryRP-7b", + "name": "IceDrunkCherryRP-7b", + "developer": "icefog72", + "scores": { + "IFEval": 0.4898, + "BBH": 0.4847, + "MATH Level 5": 0.0612, + "GPQA": 0.2768, + "MUSR": 0.4292, + "MMLU-PRO": 0.3009 + } + }, + { + "model_id": "icefog72/IceDrunkenCherryRP-7b", + "name": "IceDrunkenCherryRP-7b", + "developer": "icefog72", + "scores": { + "IFEval": 0.4763, + "BBH": 0.5093, + "MATH Level 5": 0.0642, + "GPQA": 0.307, + "MUSR": 0.4446, + "MMLU-PRO": 0.3099 + } + }, + { + "model_id": "icefog72/IceEspressoRPv2-7b", + "name": "IceEspressoRPv2-7b", + "developer": "icefog72", + "scores": { + "IFEval": 0.4977, + "BBH": 0.5055, + "MATH Level 5": 0.0619, + "GPQA": 0.2894, + "MUSR": 0.4331, + "MMLU-PRO": 0.3061 + } + }, + { + "model_id": "icefog72/IceLemonTeaRP-32k-7b", + "name": "IceLemonTeaRP-32k-7b", + "developer": "icefog72", + "scores": { + "IFEval": 0.5212, + "BBH": 0.4997, + "MATH Level 5": 0.0544, + "GPQA": 0.2903, + "MUSR": 0.429, + "MMLU-PRO": 0.3068 + } + }, + { + "model_id": "icefog72/IceMartiniRP-7b", + "name": "IceMartiniRP-7b", + "developer": "icefog72", + "scores": { + "IFEval": 0.5045, + "BBH": 0.4972, + "MATH Level 5": 0.0665, + "GPQA": 0.2794, + "MUSR": 0.4345, + "MMLU-PRO": 0.3073 + } + }, + { + "model_id": "icefog72/IceNalyvkaRP-7b", + "name": "IceNalyvkaRP-7b", + "developer": "icefog72", + "scores": { + "IFEval": 0.5498, + "BBH": 0.5136, + "MATH Level 5": 0.0597, + "GPQA": 0.3079, + "MUSR": 0.4512, + "MMLU-PRO": 0.2996 + } + }, + { + "model_id": "icefog72/IceSakeRP-7b", + "name": "IceSakeRP-7b", + "developer": "icefog72", + "scores": { + "IFEval": 0.5228, + "BBH": 0.5119, + "MATH Level 5": 0.0634, + "GPQA": 0.2852, + "MUSR": 0.413, + "MMLU-PRO": 0.3177 + } + }, + { + "model_id": "icefog72/IceSakeV4RP-7b", + "name": "IceSakeV4RP-7b", + "developer": "icefog72", + "scores": { + "IFEval": 0.4634, + "BBH": 0.493, + "MATH Level 5": 0.0559, + "GPQA": 0.2945, + "MUSR": 0.4082, + "MMLU-PRO": 0.3103 + } + }, + { + "model_id": "icefog72/IceSakeV6RP-7b", + "name": "IceSakeV6RP-7b", + "developer": "icefog72", + "scores": { + "IFEval": 0.5033, + "BBH": 0.4976, + "MATH Level 5": 0.0619, + "GPQA": 0.2911, + "MUSR": 0.42, + "MMLU-PRO": 0.3093 + } + }, + { + "model_id": "icefog72/IceSakeV8RP-7b", + "name": "IceSakeV8RP-7b", + "developer": "icefog72", + "scores": { + "IFEval": 0.6086, + "BBH": 0.4885, + "MATH Level 5": 0.0597, + "GPQA": 0.276, + "MUSR": 0.3993, + "MMLU-PRO": 0.301 + } + }, + { + "model_id": "icefog72/IceTea21EnergyDrinkRPV13-DPOv3", + "name": "IceTea21EnergyDrinkRPV13-DPOv3", + "developer": "icefog72", + "scores": { + "IFEval": 0.5263, + "BBH": 0.502, + "MATH Level 5": 0.0582, + "GPQA": 0.2836, + "MUSR": 0.4372, + "MMLU-PRO": 0.3056 + } + }, + { + "model_id": "icefog72/IceTea21EnergyDrinkRPV13-DPOv3.5", + "name": "IceTea21EnergyDrinkRPV13-DPOv3.5", + "developer": "icefog72", + "scores": { + "IFEval": 0.4871, + "BBH": 0.44, + "MATH Level 5": 0.0363, + "GPQA": 0.2844, + "MUSR": 0.3964, + "MMLU-PRO": 0.2498 + } + }, + { + "model_id": "ifable/gemma-2-Ifable-9B", + "name": "gemma-2-Ifable-9B", + "developer": "ifable", + "scores": { + "IFEval": 0.2984, + "BBH": 0.5866, + "MATH Level 5": 0.1397, + "GPQA": 0.3414, + "MUSR": 0.4053, + "MMLU-PRO": 0.4226 + } + }, + { + "model_id": "ilsp/Llama-Krikri-8B-Instruct", + "name": "Llama-Krikri-8B-Instruct", + "developer": "ilsp", + "scores": { + "IFEval": 0.6079, + "BBH": 0.5047, + "MATH Level 5": 0.1178, + "GPQA": 0.3029, + "MUSR": 0.408, + "MMLU-PRO": 0.3313 + } + }, + { + "model_id": "inflatebot/MN-12B-Mag-Mell-R1", + "name": "MN-12B-Mag-Mell-R1", + "developer": "inflatebot", + "scores": { + "IFEval": 0.4613, + "BBH": 0.5304, + "MATH Level 5": 0.1299, + "GPQA": 0.3163, + "MUSR": 0.4002, + "MMLU-PRO": 0.3438 + } + }, + { + "model_id": "informatiker/Qwen2-7B-Instruct-abliterated", + "name": "Qwen2-7B-Instruct-abliterated", + "developer": "informatiker", + "scores": { + "IFEval": 0.5822, + "BBH": 0.5534, + "MATH Level 5": 0.2636, + "GPQA": 0.3012, + "MUSR": 0.3888, + "MMLU-PRO": 0.3873 + } + }, + { + "model_id": "insightfactory/Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model", + "name": "Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model", + "developer": "insightfactory", + "scores": { + "IFEval": 0.4588, + "BBH": 0.4146, + "MATH Level 5": 0.105, + "GPQA": 0.2718, + "MUSR": 0.3499, + "MMLU-PRO": 0.296 + } + }, + { + "model_id": "instruction-pretrain/InstructLM-500M", + "name": "InstructLM-500M", + "developer": "instruction-pretrain", + "scores": { + "IFEval": 0.1028, + "BBH": 0.2941, + "MATH Level 5": 0.0, + "GPQA": 0.2567, + "MUSR": 0.3528, + "MMLU-PRO": 0.1141 + } + }, + { + "model_id": "internlm/internlm2-1_8b", + "name": "internlm2-1_8b", + "developer": "internlm", + "scores": { + "IFEval": 0.2198, + "BBH": 0.388, + "MATH Level 5": 0.0211, + "GPQA": 0.2483, + "MUSR": 0.3813, + "MMLU-PRO": 0.1588 + } + }, + { + "model_id": "internlm/internlm2-7b", + "name": "internlm2-7b", + "developer": "internlm", + "scores": { + "IFEval": 0.228, + "BBH": 0.5825, + "MATH Level 5": 0.0857, + "GPQA": 0.3367, + "MUSR": 0.44, + "MMLU-PRO": 0.19 + } + }, + { + "model_id": "internlm/internlm2-chat-1_8b", + "name": "internlm2-chat-1_8b", + "developer": "internlm", + "scores": { + "IFEval": 0.2387, + "BBH": 0.4452, + "MATH Level 5": 0.0325, + "GPQA": 0.2659, + "MUSR": 0.3631, + "MMLU-PRO": 0.1839 + } + }, + { + "model_id": "internlm/internlm2_5-1_8b-chat", + "name": "internlm2_5-1_8b-chat", + "developer": "internlm", + "scores": { + "IFEval": 0.3849, + "BBH": 0.4489, + "MATH Level 5": 0.1586, + "GPQA": 0.2903, + "MUSR": 0.3594, + "MMLU-PRO": 0.1299 + } + }, + { + "model_id": "internlm/internlm2_5-20b-chat", + "name": "internlm2_5-20b-chat", + "developer": "internlm", + "scores": { + "IFEval": 0.701, + "BBH": 0.7474, + "MATH Level 5": 0.4079, + "GPQA": 0.3213, + "MUSR": 0.4558, + "MMLU-PRO": 0.3998 + } + }, + { + "model_id": "internlm/internlm2_5-7b-chat", + "name": "internlm2_5-7b-chat", + "developer": "internlm", + "scores": { + "IFEval": 0.5539, + "BBH": 0.7073, + "MATH Level 5": 0.253, + "GPQA": 0.3473, + "MUSR": 0.4594, + "MMLU-PRO": 0.3777 + } + }, + { + "model_id": "intervitens/mini-magnum-12b-v1.1", + "name": "mini-magnum-12b-v1.1", + "developer": "intervitens", + "scores": { + "IFEval": 0.5156, + "BBH": 0.5062, + "MATH Level 5": 0.0619, + "GPQA": 0.2886, + "MUSR": 0.4004, + "MMLU-PRO": 0.3291 + } + }, + { + "model_id": "inumulaisk/eval_model", + "name": "eval_model", + "developer": "inumulaisk", + "scores": { + "IFEval": 0.1931, + "BBH": 0.3512, + "MATH Level 5": 0.2976, + "GPQA": 0.2794, + "MUSR": 0.358, + "MMLU-PRO": 0.1664 + } + }, + { + "model_id": "invalid-coder/Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp", + "name": "Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp", + "developer": "invalid-coder", + "scores": { + "IFEval": 0.4555, + "BBH": 0.5158, + "MATH Level 5": 0.0491, + "GPQA": 0.3054, + "MUSR": 0.3992, + "MMLU-PRO": 0.3146 + } + }, + { + "model_id": "invisietch/EtherealRainbow-v0.2-8B", + "name": "EtherealRainbow-v0.2-8B", + "developer": "invisietch", + "scores": { + "IFEval": 0.3903, + "BBH": 0.5102, + "MATH Level 5": 0.0823, + "GPQA": 0.3029, + "MUSR": 0.3827, + "MMLU-PRO": 0.3653 + } + }, + { + "model_id": "invisietch/EtherealRainbow-v0.3-8B", + "name": "EtherealRainbow-v0.3-8B", + "developer": "invisietch", + "scores": { + "IFEval": 0.3682, + "BBH": 0.5097, + "MATH Level 5": 0.0763, + "GPQA": 0.3045, + "MUSR": 0.3904, + "MMLU-PRO": 0.3626 + } + }, + { + "model_id": "invisietch/MiS-Firefly-v0.2-22B", + "name": "MiS-Firefly-v0.2-22B", + "developer": "invisietch", + "scores": { + "IFEval": 0.5371, + "BBH": 0.5514, + "MATH Level 5": 0.1654, + "GPQA": 0.3045, + "MUSR": 0.4694, + "MMLU-PRO": 0.362 + } + }, + { + "model_id": "invisietch/Nimbus-Miqu-v0.1-70B", + "name": "Nimbus-Miqu-v0.1-70B", + "developer": "invisietch", + "scores": { + "IFEval": 0.4647, + "BBH": 0.601, + "MATH Level 5": 0.0604, + "GPQA": 0.3389, + "MUSR": 0.4133, + "MMLU-PRO": 0.3853 + } + }, + { + "model_id": "irahulpandey/mistralai-7B-slerp-v0.1", + "name": "mistralai-7B-slerp-v0.1", + "developer": "irahulpandey", + "scores": { + "IFEval": 0.4966, + "BBH": 0.5011, + "MATH Level 5": 0.0514, + "GPQA": 0.3037, + "MUSR": 0.455, + "MMLU-PRO": 0.2951 + } + }, + { + "model_id": "jaredjoss/pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model", + "name": "pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model", + "developer": "jaredjoss", + "scores": { + "IFEval": 0.1572, + "BBH": 0.2863, + "MATH Level 5": 0.0, + "GPQA": 0.2592, + "MUSR": 0.3607, + "MMLU-PRO": 0.1169 + } + }, + { + "model_id": "jaspionjader/Auro-Kosmos-EVAA-v2-8B", + "name": "Auro-Kosmos-EVAA-v2-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4778, + "BBH": 0.5447, + "MATH Level 5": 0.1412, + "GPQA": 0.3154, + "MUSR": 0.425, + "MMLU-PRO": 0.3858 + } + }, + { + "model_id": "jaspionjader/Auro-Kosmos-EVAA-v2.1-8B", + "name": "Auro-Kosmos-EVAA-v2.1-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4666, + "BBH": 0.5444, + "MATH Level 5": 0.1458, + "GPQA": 0.3087, + "MUSR": 0.4317, + "MMLU-PRO": 0.3826 + } + }, + { + "model_id": "jaspionjader/Auro-Kosmos-EVAA-v2.2-8B", + "name": "Auro-Kosmos-EVAA-v2.2-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4268, + "BBH": 0.5431, + "MATH Level 5": 0.1412, + "GPQA": 0.3104, + "MUSR": 0.4251, + "MMLU-PRO": 0.3798 + } + }, + { + "model_id": "jaspionjader/Auro-Kosmos-EVAA-v2.3-8B", + "name": "Auro-Kosmos-EVAA-v2.3-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4271, + "BBH": 0.5441, + "MATH Level 5": 0.1344, + "GPQA": 0.3121, + "MUSR": 0.4278, + "MMLU-PRO": 0.3784 + } + }, + { + "model_id": "jaspionjader/Kosmos-Aurora_faustus-8B", + "name": "Kosmos-Aurora_faustus-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4432, + "BBH": 0.526, + "MATH Level 5": 0.1125, + "GPQA": 0.2953, + "MUSR": 0.4117, + "MMLU-PRO": 0.3813 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-8B", + "name": "Kosmos-EVAA-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4405, + "BBH": 0.5312, + "MATH Level 5": 0.1178, + "GPQA": 0.2995, + "MUSR": 0.4237, + "MMLU-PRO": 0.3818 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-Franken-Immersive-v39-8B", + "name": "Kosmos-EVAA-Franken-Immersive-v39-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4378, + "BBH": 0.519, + "MATH Level 5": 0.1292, + "GPQA": 0.3154, + "MUSR": 0.4236, + "MMLU-PRO": 0.39 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-Franken-v38-8B", + "name": "Kosmos-EVAA-Franken-v38-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4356, + "BBH": 0.523, + "MATH Level 5": 0.1292, + "GPQA": 0.3087, + "MUSR": 0.4212, + "MMLU-PRO": 0.389 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-Fusion-8B", + "name": "Kosmos-EVAA-Fusion-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4418, + "BBH": 0.5406, + "MATH Level 5": 0.1352, + "GPQA": 0.3062, + "MUSR": 0.4277, + "MMLU-PRO": 0.386 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-PRP-8B", + "name": "Kosmos-EVAA-PRP-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.3405, + "BBH": 0.5196, + "MATH Level 5": 0.0884, + "GPQA": 0.3129, + "MUSR": 0.4301, + "MMLU-PRO": 0.3647 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-PRP-light-8B", + "name": "Kosmos-EVAA-PRP-light-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.3824, + "BBH": 0.5271, + "MATH Level 5": 0.1103, + "GPQA": 0.3121, + "MUSR": 0.4249, + "MMLU-PRO": 0.3782 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-PRP-v23-8B", + "name": "Kosmos-EVAA-PRP-v23-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4041, + "BBH": 0.529, + "MATH Level 5": 0.1156, + "GPQA": 0.3087, + "MUSR": 0.4368, + "MMLU-PRO": 0.3706 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-PRP-v24-8B", + "name": "Kosmos-EVAA-PRP-v24-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4259, + "BBH": 0.5276, + "MATH Level 5": 0.1103, + "GPQA": 0.3104, + "MUSR": 0.429, + "MMLU-PRO": 0.3779 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-PRP-v25-8B", + "name": "Kosmos-EVAA-PRP-v25-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4421, + "BBH": 0.5291, + "MATH Level 5": 0.1186, + "GPQA": 0.318, + "MUSR": 0.4303, + "MMLU-PRO": 0.3716 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-PRP-v26-8B", + "name": "Kosmos-EVAA-PRP-v26-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4414, + "BBH": 0.5271, + "MATH Level 5": 0.1133, + "GPQA": 0.3045, + "MUSR": 0.4264, + "MMLU-PRO": 0.3793 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-PRP-v27-8B", + "name": "Kosmos-EVAA-PRP-v27-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4378, + "BBH": 0.529, + "MATH Level 5": 0.1193, + "GPQA": 0.3087, + "MUSR": 0.4343, + "MMLU-PRO": 0.3755 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-PRP-v28-8B", + "name": "Kosmos-EVAA-PRP-v28-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4366, + "BBH": 0.5295, + "MATH Level 5": 0.1171, + "GPQA": 0.307, + "MUSR": 0.433, + "MMLU-PRO": 0.375 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-PRP-v29-8B", + "name": "Kosmos-EVAA-PRP-v29-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4487, + "BBH": 0.5275, + "MATH Level 5": 0.1201, + "GPQA": 0.3104, + "MUSR": 0.4237, + "MMLU-PRO": 0.3765 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-PRP-v30-8B", + "name": "Kosmos-EVAA-PRP-v30-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4295, + "BBH": 0.5328, + "MATH Level 5": 0.1178, + "GPQA": 0.3045, + "MUSR": 0.4263, + "MMLU-PRO": 0.3938 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-PRP-v31-8B", + "name": "Kosmos-EVAA-PRP-v31-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4399, + "BBH": 0.5315, + "MATH Level 5": 0.1133, + "GPQA": 0.3138, + "MUSR": 0.4251, + "MMLU-PRO": 0.3935 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-PRP-v32-8B", + "name": "Kosmos-EVAA-PRP-v32-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4487, + "BBH": 0.5293, + "MATH Level 5": 0.1148, + "GPQA": 0.3163, + "MUSR": 0.4211, + "MMLU-PRO": 0.3777 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-PRP-v33-8B", + "name": "Kosmos-EVAA-PRP-v33-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4302, + "BBH": 0.5321, + "MATH Level 5": 0.1178, + "GPQA": 0.3129, + "MUSR": 0.4184, + "MMLU-PRO": 0.3909 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-PRP-v34-8B", + "name": "Kosmos-EVAA-PRP-v34-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4563, + "BBH": 0.5333, + "MATH Level 5": 0.1125, + "GPQA": 0.3112, + "MUSR": 0.4237, + "MMLU-PRO": 0.3927 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-TSN-8B", + "name": "Kosmos-EVAA-TSN-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4721, + "BBH": 0.5177, + "MATH Level 5": 0.1344, + "GPQA": 0.3029, + "MUSR": 0.4329, + "MMLU-PRO": 0.3816 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-TSN-light-8B", + "name": "Kosmos-EVAA-TSN-light-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4685, + "BBH": 0.5235, + "MATH Level 5": 0.1216, + "GPQA": 0.3045, + "MUSR": 0.4289, + "MMLU-PRO": 0.3806 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-TSN-v19-8B", + "name": "Kosmos-EVAA-TSN-v19-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4564, + "BBH": 0.5316, + "MATH Level 5": 0.1156, + "GPQA": 0.3054, + "MUSR": 0.4277, + "MMLU-PRO": 0.379 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-TSN-v20-8B", + "name": "Kosmos-EVAA-TSN-v20-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4423, + "BBH": 0.525, + "MATH Level 5": 0.1246, + "GPQA": 0.3138, + "MUSR": 0.421, + "MMLU-PRO": 0.3936 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-TSN-v21-8B", + "name": "Kosmos-EVAA-TSN-v21-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.467, + "BBH": 0.5248, + "MATH Level 5": 0.1193, + "GPQA": 0.3121, + "MUSR": 0.4343, + "MMLU-PRO": 0.3816 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-TSN-v22-8B", + "name": "Kosmos-EVAA-TSN-v22-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4673, + "BBH": 0.5246, + "MATH Level 5": 0.1133, + "GPQA": 0.307, + "MUSR": 0.4303, + "MMLU-PRO": 0.3812 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-gamma-8B", + "name": "Kosmos-EVAA-gamma-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4572, + "BBH": 0.5322, + "MATH Level 5": 0.105, + "GPQA": 0.3188, + "MUSR": 0.4306, + "MMLU-PRO": 0.3901 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-gamma-alt-8B", + "name": "Kosmos-EVAA-gamma-alt-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4542, + "BBH": 0.5298, + "MATH Level 5": 0.1095, + "GPQA": 0.3247, + "MUSR": 0.4292, + "MMLU-PRO": 0.3896 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-gamma-light-8B", + "name": "Kosmos-EVAA-gamma-light-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4581, + "BBH": 0.5376, + "MATH Level 5": 0.1103, + "GPQA": 0.3163, + "MUSR": 0.4291, + "MMLU-PRO": 0.3943 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-gamma-light-alt-8B", + "name": "Kosmos-EVAA-gamma-light-alt-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4454, + "BBH": 0.5327, + "MATH Level 5": 0.1133, + "GPQA": 0.3138, + "MUSR": 0.4305, + "MMLU-PRO": 0.3923 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-gamma-ultra-light-8B", + "name": "Kosmos-EVAA-gamma-ultra-light-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4563, + "BBH": 0.5316, + "MATH Level 5": 0.1178, + "GPQA": 0.3163, + "MUSR": 0.4197, + "MMLU-PRO": 0.3915 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-gamma-v13-8B", + "name": "Kosmos-EVAA-gamma-v13-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4429, + "BBH": 0.5359, + "MATH Level 5": 0.1118, + "GPQA": 0.3138, + "MUSR": 0.4278, + "MMLU-PRO": 0.393 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-gamma-v14-8B", + "name": "Kosmos-EVAA-gamma-v14-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.438, + "BBH": 0.5363, + "MATH Level 5": 0.1103, + "GPQA": 0.3129, + "MUSR": 0.4277, + "MMLU-PRO": 0.3931 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-gamma-v15-8B", + "name": "Kosmos-EVAA-gamma-v15-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4654, + "BBH": 0.5343, + "MATH Level 5": 0.111, + "GPQA": 0.3112, + "MUSR": 0.4277, + "MMLU-PRO": 0.3941 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-gamma-v16-8B", + "name": "Kosmos-EVAA-gamma-v16-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4557, + "BBH": 0.5344, + "MATH Level 5": 0.1171, + "GPQA": 0.3154, + "MUSR": 0.4264, + "MMLU-PRO": 0.3917 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-gamma-v17-8B", + "name": "Kosmos-EVAA-gamma-v17-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4462, + "BBH": 0.5347, + "MATH Level 5": 0.111, + "GPQA": 0.3112, + "MUSR": 0.4291, + "MMLU-PRO": 0.3923 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-gamma-v18-8B", + "name": "Kosmos-EVAA-gamma-v18-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4341, + "BBH": 0.5339, + "MATH Level 5": 0.111, + "GPQA": 0.3112, + "MUSR": 0.4317, + "MMLU-PRO": 0.3905 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-immersive-sof-v44-8B", + "name": "Kosmos-EVAA-immersive-sof-v44-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4408, + "BBH": 0.5215, + "MATH Level 5": 0.1186, + "GPQA": 0.3096, + "MUSR": 0.4144, + "MMLU-PRO": 0.3888 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-v10-8B", + "name": "Kosmos-EVAA-v10-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4262, + "BBH": 0.5376, + "MATH Level 5": 0.1246, + "GPQA": 0.2995, + "MUSR": 0.4224, + "MMLU-PRO": 0.3831 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-v11-8B", + "name": "Kosmos-EVAA-v11-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4426, + "BBH": 0.5359, + "MATH Level 5": 0.1322, + "GPQA": 0.3154, + "MUSR": 0.4184, + "MMLU-PRO": 0.3836 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-v12-8B", + "name": "Kosmos-EVAA-v12-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4378, + "BBH": 0.5349, + "MATH Level 5": 0.1367, + "GPQA": 0.3096, + "MUSR": 0.4211, + "MMLU-PRO": 0.3836 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-v2-8B", + "name": "Kosmos-EVAA-v2-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4396, + "BBH": 0.5341, + "MATH Level 5": 0.1322, + "GPQA": 0.2978, + "MUSR": 0.4211, + "MMLU-PRO": 0.3826 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-v3-8B", + "name": "Kosmos-EVAA-v3-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4411, + "BBH": 0.5331, + "MATH Level 5": 0.1329, + "GPQA": 0.3054, + "MUSR": 0.4224, + "MMLU-PRO": 0.3821 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-v4-8B", + "name": "Kosmos-EVAA-v4-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4289, + "BBH": 0.5337, + "MATH Level 5": 0.1254, + "GPQA": 0.3003, + "MUSR": 0.4197, + "MMLU-PRO": 0.3817 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-v5-8B", + "name": "Kosmos-EVAA-v5-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.446, + "BBH": 0.5345, + "MATH Level 5": 0.1261, + "GPQA": 0.3037, + "MUSR": 0.4224, + "MMLU-PRO": 0.3821 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-v6-8B", + "name": "Kosmos-EVAA-v6-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4396, + "BBH": 0.538, + "MATH Level 5": 0.1292, + "GPQA": 0.302, + "MUSR": 0.4184, + "MMLU-PRO": 0.3821 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-v7-8B", + "name": "Kosmos-EVAA-v7-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4277, + "BBH": 0.5335, + "MATH Level 5": 0.1337, + "GPQA": 0.3054, + "MUSR": 0.4171, + "MMLU-PRO": 0.3836 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-v8-8B", + "name": "Kosmos-EVAA-v8-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4383, + "BBH": 0.5359, + "MATH Level 5": 0.1307, + "GPQA": 0.3037, + "MUSR": 0.421, + "MMLU-PRO": 0.3827 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-v9-8B", + "name": "Kosmos-EVAA-v9-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4369, + "BBH": 0.5361, + "MATH Level 5": 0.1276, + "GPQA": 0.3062, + "MUSR": 0.4184, + "MMLU-PRO": 0.382 + } + }, + { + "model_id": "jaspionjader/Kosmos-EVAA-v9-TitanFusion-Mix-8B", + "name": "Kosmos-EVAA-v9-TitanFusion-Mix-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4284, + "BBH": 0.554, + "MATH Level 5": 0.1148, + "GPQA": 0.2878, + "MUSR": 0.4354, + "MMLU-PRO": 0.3836 + } + }, + { + "model_id": "jaspionjader/Kosmos-Elusive-8b", + "name": "Kosmos-Elusive-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4169, + "BBH": 0.5339, + "MATH Level 5": 0.1261, + "GPQA": 0.3079, + "MUSR": 0.4078, + "MMLU-PRO": 0.376 + } + }, + { + "model_id": "jaspionjader/Kosmos-Elusive-VENN-8B", + "name": "Kosmos-Elusive-VENN-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4233, + "BBH": 0.5356, + "MATH Level 5": 0.1246, + "GPQA": 0.2995, + "MUSR": 0.4157, + "MMLU-PRO": 0.3797 + } + }, + { + "model_id": "jaspionjader/Kosmos-Elusive-VENN-Asymmetric-8B", + "name": "Kosmos-Elusive-VENN-Asymmetric-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4542, + "BBH": 0.5313, + "MATH Level 5": 0.1344, + "GPQA": 0.2945, + "MUSR": 0.4251, + "MMLU-PRO": 0.3842 + } + }, + { + "model_id": "jaspionjader/Kosmos-Elusive-VENN-Aurora_faustus-8B", + "name": "Kosmos-Elusive-VENN-Aurora_faustus-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4335, + "BBH": 0.5304, + "MATH Level 5": 0.1125, + "GPQA": 0.2953, + "MUSR": 0.417, + "MMLU-PRO": 0.3795 + } + }, + { + "model_id": "jaspionjader/Kosmos-VENN-8B", + "name": "Kosmos-VENN-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4332, + "BBH": 0.5318, + "MATH Level 5": 0.1412, + "GPQA": 0.2928, + "MUSR": 0.4211, + "MMLU-PRO": 0.3801 + } + }, + { + "model_id": "jaspionjader/PRP-Kosmos-EVAA-8B", + "name": "PRP-Kosmos-EVAA-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.3633, + "BBH": 0.5237, + "MATH Level 5": 0.0959, + "GPQA": 0.3096, + "MUSR": 0.425, + "MMLU-PRO": 0.3766 + } + }, + { + "model_id": "jaspionjader/PRP-Kosmos-EVAA-light-8B", + "name": "PRP-Kosmos-EVAA-light-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4321, + "BBH": 0.5275, + "MATH Level 5": 0.1103, + "GPQA": 0.3221, + "MUSR": 0.4235, + "MMLU-PRO": 0.3631 + } + }, + { + "model_id": "jaspionjader/TSN-Kosmos-EVAA-8B", + "name": "TSN-Kosmos-EVAA-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4903, + "BBH": 0.5347, + "MATH Level 5": 0.145, + "GPQA": 0.3205, + "MUSR": 0.4173, + "MMLU-PRO": 0.3831 + } + }, + { + "model_id": "jaspionjader/TSN-Kosmos-EVAA-v2-8B", + "name": "TSN-Kosmos-EVAA-v2-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4667, + "BBH": 0.5343, + "MATH Level 5": 0.108, + "GPQA": 0.3221, + "MUSR": 0.4186, + "MMLU-PRO": 0.3762 + } + }, + { + "model_id": "jaspionjader/bbb-1", + "name": "bbb-1", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4864, + "BBH": 0.5376, + "MATH Level 5": 0.1367, + "GPQA": 0.3138, + "MUSR": 0.4171, + "MMLU-PRO": 0.3897 + } + }, + { + "model_id": "jaspionjader/bbb-2", + "name": "bbb-2", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4077, + "BBH": 0.5067, + "MATH Level 5": 0.1125, + "GPQA": 0.302, + "MUSR": 0.4145, + "MMLU-PRO": 0.3635 + } + }, + { + "model_id": "jaspionjader/bbb-3", + "name": "bbb-3", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4168, + "BBH": 0.5158, + "MATH Level 5": 0.1405, + "GPQA": 0.3112, + "MUSR": 0.4265, + "MMLU-PRO": 0.3856 + } + }, + { + "model_id": "jaspionjader/bbb-4", + "name": "bbb-4", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4768, + "BBH": 0.5212, + "MATH Level 5": 0.1276, + "GPQA": 0.2978, + "MUSR": 0.4092, + "MMLU-PRO": 0.3773 + } + }, + { + "model_id": "jaspionjader/bbb-5", + "name": "bbb-5", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4703, + "BBH": 0.5207, + "MATH Level 5": 0.1397, + "GPQA": 0.3045, + "MUSR": 0.3998, + "MMLU-PRO": 0.3834 + } + }, + { + "model_id": "jaspionjader/bbb-6", + "name": "bbb-6", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.488, + "BBH": 0.5211, + "MATH Level 5": 0.139, + "GPQA": 0.3104, + "MUSR": 0.4052, + "MMLU-PRO": 0.3871 + } + }, + { + "model_id": "jaspionjader/bbb-7", + "name": "bbb-7", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4828, + "BBH": 0.5211, + "MATH Level 5": 0.1367, + "GPQA": 0.3104, + "MUSR": 0.4038, + "MMLU-PRO": 0.386 + } + }, + { + "model_id": "jaspionjader/bh-1", + "name": "bh-1", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4284, + "BBH": 0.589, + "MATH Level 5": 0.0536, + "GPQA": 0.2945, + "MUSR": 0.4441, + "MMLU-PRO": 0.3449 + } + }, + { + "model_id": "jaspionjader/bh-10", + "name": "bh-10", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4618, + "BBH": 0.5856, + "MATH Level 5": 0.1103, + "GPQA": 0.3003, + "MUSR": 0.4199, + "MMLU-PRO": 0.3708 + } + }, + { + "model_id": "jaspionjader/bh-11", + "name": "bh-11", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4575, + "BBH": 0.5851, + "MATH Level 5": 0.1178, + "GPQA": 0.307, + "MUSR": 0.4146, + "MMLU-PRO": 0.3738 + } + }, + { + "model_id": "jaspionjader/bh-12", + "name": "bh-12", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4734, + "BBH": 0.5802, + "MATH Level 5": 0.1186, + "GPQA": 0.3003, + "MUSR": 0.4145, + "MMLU-PRO": 0.3737 + } + }, + { + "model_id": "jaspionjader/bh-13", + "name": "bh-13", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4698, + "BBH": 0.5778, + "MATH Level 5": 0.1125, + "GPQA": 0.307, + "MUSR": 0.4159, + "MMLU-PRO": 0.373 + } + }, + { + "model_id": "jaspionjader/bh-15", + "name": "bh-15", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4745, + "BBH": 0.5819, + "MATH Level 5": 0.1246, + "GPQA": 0.2987, + "MUSR": 0.4105, + "MMLU-PRO": 0.3767 + } + }, + { + "model_id": "jaspionjader/bh-16", + "name": "bh-16", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4731, + "BBH": 0.5783, + "MATH Level 5": 0.1193, + "GPQA": 0.3029, + "MUSR": 0.4159, + "MMLU-PRO": 0.3776 + } + }, + { + "model_id": "jaspionjader/bh-17", + "name": "bh-17", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4722, + "BBH": 0.5776, + "MATH Level 5": 0.1133, + "GPQA": 0.2978, + "MUSR": 0.4158, + "MMLU-PRO": 0.3757 + } + }, + { + "model_id": "jaspionjader/bh-18", + "name": "bh-18", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4725, + "BBH": 0.5824, + "MATH Level 5": 0.1186, + "GPQA": 0.3003, + "MUSR": 0.4185, + "MMLU-PRO": 0.3757 + } + }, + { + "model_id": "jaspionjader/bh-19", + "name": "bh-19", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4584, + "BBH": 0.5766, + "MATH Level 5": 0.1193, + "GPQA": 0.297, + "MUSR": 0.4171, + "MMLU-PRO": 0.3775 + } + }, + { + "model_id": "jaspionjader/bh-2", + "name": "bh-2", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4579, + "BBH": 0.5937, + "MATH Level 5": 0.1027, + "GPQA": 0.3012, + "MUSR": 0.4186, + "MMLU-PRO": 0.3695 + } + }, + { + "model_id": "jaspionjader/bh-20", + "name": "bh-20", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4727, + "BBH": 0.575, + "MATH Level 5": 0.1201, + "GPQA": 0.2878, + "MUSR": 0.4105, + "MMLU-PRO": 0.3768 + } + }, + { + "model_id": "jaspionjader/bh-21", + "name": "bh-21", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.47, + "BBH": 0.5738, + "MATH Level 5": 0.1216, + "GPQA": 0.2978, + "MUSR": 0.4158, + "MMLU-PRO": 0.3776 + } + }, + { + "model_id": "jaspionjader/bh-22", + "name": "bh-22", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.46, + "BBH": 0.5793, + "MATH Level 5": 0.1186, + "GPQA": 0.2961, + "MUSR": 0.4172, + "MMLU-PRO": 0.3764 + } + }, + { + "model_id": "jaspionjader/bh-23", + "name": "bh-23", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4658, + "BBH": 0.57, + "MATH Level 5": 0.1201, + "GPQA": 0.2945, + "MUSR": 0.4197, + "MMLU-PRO": 0.3796 + } + }, + { + "model_id": "jaspionjader/bh-24", + "name": "bh-24", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4715, + "BBH": 0.5717, + "MATH Level 5": 0.1269, + "GPQA": 0.2961, + "MUSR": 0.4158, + "MMLU-PRO": 0.3809 + } + }, + { + "model_id": "jaspionjader/bh-25", + "name": "bh-25", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4752, + "BBH": 0.5706, + "MATH Level 5": 0.1133, + "GPQA": 0.2911, + "MUSR": 0.4118, + "MMLU-PRO": 0.3782 + } + }, + { + "model_id": "jaspionjader/bh-26", + "name": "bh-26", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4691, + "BBH": 0.5735, + "MATH Level 5": 0.1163, + "GPQA": 0.2995, + "MUSR": 0.4277, + "MMLU-PRO": 0.3772 + } + }, + { + "model_id": "jaspionjader/bh-27", + "name": "bh-27", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4819, + "BBH": 0.5714, + "MATH Level 5": 0.1276, + "GPQA": 0.2961, + "MUSR": 0.4091, + "MMLU-PRO": 0.3799 + } + }, + { + "model_id": "jaspionjader/bh-28", + "name": "bh-28", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4785, + "BBH": 0.5703, + "MATH Level 5": 0.1231, + "GPQA": 0.2987, + "MUSR": 0.4131, + "MMLU-PRO": 0.3812 + } + }, + { + "model_id": "jaspionjader/bh-29", + "name": "bh-29", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4688, + "BBH": 0.567, + "MATH Level 5": 0.1208, + "GPQA": 0.2953, + "MUSR": 0.4237, + "MMLU-PRO": 0.3819 + } + }, + { + "model_id": "jaspionjader/bh-3", + "name": "bh-3", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4664, + "BBH": 0.5891, + "MATH Level 5": 0.1148, + "GPQA": 0.302, + "MUSR": 0.4173, + "MMLU-PRO": 0.3702 + } + }, + { + "model_id": "jaspionjader/bh-30", + "name": "bh-30", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4666, + "BBH": 0.5706, + "MATH Level 5": 0.1231, + "GPQA": 0.2928, + "MUSR": 0.4144, + "MMLU-PRO": 0.3782 + } + }, + { + "model_id": "jaspionjader/bh-31", + "name": "bh-31", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4727, + "BBH": 0.5665, + "MATH Level 5": 0.1284, + "GPQA": 0.2936, + "MUSR": 0.4104, + "MMLU-PRO": 0.382 + } + }, + { + "model_id": "jaspionjader/bh-32", + "name": "bh-32", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4636, + "BBH": 0.5662, + "MATH Level 5": 0.1246, + "GPQA": 0.297, + "MUSR": 0.4157, + "MMLU-PRO": 0.3812 + } + }, + { + "model_id": "jaspionjader/bh-33", + "name": "bh-33", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4685, + "BBH": 0.5653, + "MATH Level 5": 0.1178, + "GPQA": 0.2961, + "MUSR": 0.4157, + "MMLU-PRO": 0.3808 + } + }, + { + "model_id": "jaspionjader/bh-34", + "name": "bh-34", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4624, + "BBH": 0.5681, + "MATH Level 5": 0.1208, + "GPQA": 0.2919, + "MUSR": 0.4185, + "MMLU-PRO": 0.3804 + } + }, + { + "model_id": "jaspionjader/bh-35", + "name": "bh-35", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4721, + "BBH": 0.564, + "MATH Level 5": 0.1246, + "GPQA": 0.2953, + "MUSR": 0.4183, + "MMLU-PRO": 0.383 + } + }, + { + "model_id": "jaspionjader/bh-36", + "name": "bh-36", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4666, + "BBH": 0.5664, + "MATH Level 5": 0.1239, + "GPQA": 0.302, + "MUSR": 0.4196, + "MMLU-PRO": 0.3831 + } + }, + { + "model_id": "jaspionjader/bh-37", + "name": "bh-37", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.488, + "BBH": 0.5625, + "MATH Level 5": 0.1216, + "GPQA": 0.2945, + "MUSR": 0.4156, + "MMLU-PRO": 0.3828 + } + }, + { + "model_id": "jaspionjader/bh-38", + "name": "bh-38", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4618, + "BBH": 0.5658, + "MATH Level 5": 0.1239, + "GPQA": 0.2978, + "MUSR": 0.4117, + "MMLU-PRO": 0.3811 + } + }, + { + "model_id": "jaspionjader/bh-39", + "name": "bh-39", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4576, + "BBH": 0.5633, + "MATH Level 5": 0.1254, + "GPQA": 0.3003, + "MUSR": 0.4262, + "MMLU-PRO": 0.3831 + } + }, + { + "model_id": "jaspionjader/bh-4", + "name": "bh-4", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4673, + "BBH": 0.5892, + "MATH Level 5": 0.1095, + "GPQA": 0.2961, + "MUSR": 0.4173, + "MMLU-PRO": 0.3705 + } + }, + { + "model_id": "jaspionjader/bh-40", + "name": "bh-40", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4536, + "BBH": 0.5634, + "MATH Level 5": 0.1246, + "GPQA": 0.2987, + "MUSR": 0.4236, + "MMLU-PRO": 0.3835 + } + }, + { + "model_id": "jaspionjader/bh-41", + "name": "bh-41", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.474, + "BBH": 0.5614, + "MATH Level 5": 0.1254, + "GPQA": 0.2928, + "MUSR": 0.4183, + "MMLU-PRO": 0.3825 + } + }, + { + "model_id": "jaspionjader/bh-42", + "name": "bh-42", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.466, + "BBH": 0.5646, + "MATH Level 5": 0.1269, + "GPQA": 0.2961, + "MUSR": 0.421, + "MMLU-PRO": 0.3812 + } + }, + { + "model_id": "jaspionjader/bh-43", + "name": "bh-43", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.46, + "BBH": 0.5635, + "MATH Level 5": 0.1239, + "GPQA": 0.2945, + "MUSR": 0.4156, + "MMLU-PRO": 0.382 + } + }, + { + "model_id": "jaspionjader/bh-44", + "name": "bh-44", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4706, + "BBH": 0.5643, + "MATH Level 5": 0.1216, + "GPQA": 0.2961, + "MUSR": 0.4249, + "MMLU-PRO": 0.3834 + } + }, + { + "model_id": "jaspionjader/bh-46", + "name": "bh-46", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4727, + "BBH": 0.5632, + "MATH Level 5": 0.1276, + "GPQA": 0.2961, + "MUSR": 0.4262, + "MMLU-PRO": 0.3822 + } + }, + { + "model_id": "jaspionjader/bh-47", + "name": "bh-47", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4652, + "BBH": 0.5546, + "MATH Level 5": 0.1276, + "GPQA": 0.2945, + "MUSR": 0.4156, + "MMLU-PRO": 0.3855 + } + }, + { + "model_id": "jaspionjader/bh-48", + "name": "bh-48", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4688, + "BBH": 0.5541, + "MATH Level 5": 0.1254, + "GPQA": 0.2945, + "MUSR": 0.4209, + "MMLU-PRO": 0.386 + } + }, + { + "model_id": "jaspionjader/bh-49", + "name": "bh-49", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4725, + "BBH": 0.554, + "MATH Level 5": 0.1201, + "GPQA": 0.2945, + "MUSR": 0.4129, + "MMLU-PRO": 0.3808 + } + }, + { + "model_id": "jaspionjader/bh-5", + "name": "bh-5", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4652, + "BBH": 0.5882, + "MATH Level 5": 0.1057, + "GPQA": 0.2995, + "MUSR": 0.4186, + "MMLU-PRO": 0.3702 + } + }, + { + "model_id": "jaspionjader/bh-50", + "name": "bh-50", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4725, + "BBH": 0.5553, + "MATH Level 5": 0.1208, + "GPQA": 0.2936, + "MUSR": 0.4169, + "MMLU-PRO": 0.3842 + } + }, + { + "model_id": "jaspionjader/bh-51", + "name": "bh-51", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.463, + "BBH": 0.5557, + "MATH Level 5": 0.1239, + "GPQA": 0.2928, + "MUSR": 0.4168, + "MMLU-PRO": 0.3831 + } + }, + { + "model_id": "jaspionjader/bh-52", + "name": "bh-52", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4536, + "BBH": 0.5444, + "MATH Level 5": 0.1201, + "GPQA": 0.2919, + "MUSR": 0.4169, + "MMLU-PRO": 0.3843 + } + }, + { + "model_id": "jaspionjader/bh-53", + "name": "bh-53", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.478, + "BBH": 0.5494, + "MATH Level 5": 0.1269, + "GPQA": 0.2987, + "MUSR": 0.4196, + "MMLU-PRO": 0.3858 + } + }, + { + "model_id": "jaspionjader/bh-54", + "name": "bh-54", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4841, + "BBH": 0.5548, + "MATH Level 5": 0.1292, + "GPQA": 0.2945, + "MUSR": 0.4155, + "MMLU-PRO": 0.3825 + } + }, + { + "model_id": "jaspionjader/bh-55", + "name": "bh-55", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4709, + "BBH": 0.555, + "MATH Level 5": 0.1284, + "GPQA": 0.3062, + "MUSR": 0.4222, + "MMLU-PRO": 0.3846 + } + }, + { + "model_id": "jaspionjader/bh-56", + "name": "bh-56", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.46, + "BBH": 0.5447, + "MATH Level 5": 0.1231, + "GPQA": 0.3003, + "MUSR": 0.4116, + "MMLU-PRO": 0.3844 + } + }, + { + "model_id": "jaspionjader/bh-57", + "name": "bh-57", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4405, + "BBH": 0.5425, + "MATH Level 5": 0.1261, + "GPQA": 0.3037, + "MUSR": 0.421, + "MMLU-PRO": 0.3896 + } + }, + { + "model_id": "jaspionjader/bh-58", + "name": "bh-58", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.463, + "BBH": 0.5446, + "MATH Level 5": 0.1322, + "GPQA": 0.3062, + "MUSR": 0.4183, + "MMLU-PRO": 0.3896 + } + }, + { + "model_id": "jaspionjader/bh-59", + "name": "bh-59", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4341, + "BBH": 0.5512, + "MATH Level 5": 0.1541, + "GPQA": 0.3154, + "MUSR": 0.417, + "MMLU-PRO": 0.3838 + } + }, + { + "model_id": "jaspionjader/bh-6", + "name": "bh-6", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4621, + "BBH": 0.5891, + "MATH Level 5": 0.1088, + "GPQA": 0.2995, + "MUSR": 0.4199, + "MMLU-PRO": 0.3698 + } + }, + { + "model_id": "jaspionjader/bh-60", + "name": "bh-60", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4207, + "BBH": 0.5369, + "MATH Level 5": 0.1579, + "GPQA": 0.3255, + "MUSR": 0.4289, + "MMLU-PRO": 0.3689 + } + }, + { + "model_id": "jaspionjader/bh-61", + "name": "bh-61", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4247, + "BBH": 0.5271, + "MATH Level 5": 0.1707, + "GPQA": 0.3188, + "MUSR": 0.4356, + "MMLU-PRO": 0.3679 + } + }, + { + "model_id": "jaspionjader/bh-62", + "name": "bh-62", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.415, + "BBH": 0.5379, + "MATH Level 5": 0.1624, + "GPQA": 0.3205, + "MUSR": 0.4289, + "MMLU-PRO": 0.3719 + } + }, + { + "model_id": "jaspionjader/bh-63", + "name": "bh-63", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4308, + "BBH": 0.4917, + "MATH Level 5": 0.111, + "GPQA": 0.3029, + "MUSR": 0.4313, + "MMLU-PRO": 0.3248 + } + }, + { + "model_id": "jaspionjader/bh-64", + "name": "bh-64", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.414, + "BBH": 0.536, + "MATH Level 5": 0.1548, + "GPQA": 0.3213, + "MUSR": 0.4355, + "MMLU-PRO": 0.3693 + } + }, + { + "model_id": "jaspionjader/bh-7", + "name": "bh-7", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4624, + "BBH": 0.5861, + "MATH Level 5": 0.114, + "GPQA": 0.3037, + "MUSR": 0.4119, + "MMLU-PRO": 0.3715 + } + }, + { + "model_id": "jaspionjader/bh-8", + "name": "bh-8", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4597, + "BBH": 0.59, + "MATH Level 5": 0.1178, + "GPQA": 0.3012, + "MUSR": 0.4265, + "MMLU-PRO": 0.372 + } + }, + { + "model_id": "jaspionjader/bh-9", + "name": "bh-9", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4509, + "BBH": 0.585, + "MATH Level 5": 0.1156, + "GPQA": 0.302, + "MUSR": 0.4146, + "MMLU-PRO": 0.3703 + } + }, + { + "model_id": "jaspionjader/dp-6-8b", + "name": "dp-6-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4806, + "BBH": 0.53, + "MATH Level 5": 0.1329, + "GPQA": 0.3079, + "MUSR": 0.4434, + "MMLU-PRO": 0.3897 + } + }, + { + "model_id": "jaspionjader/dp-7-8b", + "name": "dp-7-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4498, + "BBH": 0.5291, + "MATH Level 5": 0.1261, + "GPQA": 0.3062, + "MUSR": 0.4407, + "MMLU-PRO": 0.3934 + } + }, + { + "model_id": "jaspionjader/ek-6", + "name": "ek-6", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4642, + "BBH": 0.5219, + "MATH Level 5": 0.1322, + "GPQA": 0.3087, + "MUSR": 0.4144, + "MMLU-PRO": 0.3861 + } + }, + { + "model_id": "jaspionjader/ek-7", + "name": "ek-7", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4767, + "BBH": 0.5194, + "MATH Level 5": 0.1329, + "GPQA": 0.3163, + "MUSR": 0.4171, + "MMLU-PRO": 0.3887 + } + }, + { + "model_id": "jaspionjader/f-1-8b", + "name": "f-1-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4983, + "BBH": 0.5141, + "MATH Level 5": 0.1284, + "GPQA": 0.3087, + "MUSR": 0.4527, + "MMLU-PRO": 0.3907 + } + }, + { + "model_id": "jaspionjader/f-2-8b", + "name": "f-2-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4824, + "BBH": 0.5294, + "MATH Level 5": 0.1171, + "GPQA": 0.2995, + "MUSR": 0.4501, + "MMLU-PRO": 0.3962 + } + }, + { + "model_id": "jaspionjader/f-3-8b", + "name": "f-3-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4803, + "BBH": 0.5275, + "MATH Level 5": 0.1216, + "GPQA": 0.3138, + "MUSR": 0.4421, + "MMLU-PRO": 0.3954 + } + }, + { + "model_id": "jaspionjader/f-4-8b", + "name": "f-4-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4797, + "BBH": 0.5289, + "MATH Level 5": 0.1148, + "GPQA": 0.3087, + "MUSR": 0.4514, + "MMLU-PRO": 0.3956 + } + }, + { + "model_id": "jaspionjader/f-5-8b", + "name": "f-5-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.5044, + "BBH": 0.5313, + "MATH Level 5": 0.1239, + "GPQA": 0.3087, + "MUSR": 0.4461, + "MMLU-PRO": 0.3949 + } + }, + { + "model_id": "jaspionjader/f-6-8b", + "name": "f-6-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4846, + "BBH": 0.5241, + "MATH Level 5": 0.1193, + "GPQA": 0.3079, + "MUSR": 0.4474, + "MMLU-PRO": 0.3939 + } + }, + { + "model_id": "jaspionjader/f-7-8b", + "name": "f-7-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4462, + "BBH": 0.5277, + "MATH Level 5": 0.1239, + "GPQA": 0.3129, + "MUSR": 0.4315, + "MMLU-PRO": 0.3936 + } + }, + { + "model_id": "jaspionjader/f-8-8b", + "name": "f-8-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4739, + "BBH": 0.5259, + "MATH Level 5": 0.1224, + "GPQA": 0.3096, + "MUSR": 0.4354, + "MMLU-PRO": 0.394 + } + }, + { + "model_id": "jaspionjader/f-9-8b", + "name": "f-9-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4602, + "BBH": 0.5292, + "MATH Level 5": 0.1299, + "GPQA": 0.3062, + "MUSR": 0.4461, + "MMLU-PRO": 0.3944 + } + }, + { + "model_id": "jaspionjader/fct-14-8b", + "name": "fct-14-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4129, + "BBH": 0.5206, + "MATH Level 5": 0.1201, + "GPQA": 0.3163, + "MUSR": 0.4186, + "MMLU-PRO": 0.3875 + } + }, + { + "model_id": "jaspionjader/fct-9-8b", + "name": "fct-9-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4354, + "BBH": 0.5205, + "MATH Level 5": 0.1193, + "GPQA": 0.3079, + "MUSR": 0.4291, + "MMLU-PRO": 0.3932 + } + }, + { + "model_id": "jaspionjader/fr-1-8b", + "name": "fr-1-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4211, + "BBH": 0.5142, + "MATH Level 5": 0.1118, + "GPQA": 0.3054, + "MUSR": 0.4277, + "MMLU-PRO": 0.361 + } + }, + { + "model_id": "jaspionjader/fr-10-8b", + "name": "fr-10-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4402, + "BBH": 0.5207, + "MATH Level 5": 0.1224, + "GPQA": 0.3171, + "MUSR": 0.4119, + "MMLU-PRO": 0.3863 + } + }, + { + "model_id": "jaspionjader/fr-3-8b", + "name": "fr-3-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4326, + "BBH": 0.5255, + "MATH Level 5": 0.1133, + "GPQA": 0.3054, + "MUSR": 0.4198, + "MMLU-PRO": 0.3863 + } + }, + { + "model_id": "jaspionjader/gamma-Kosmos-EVAA-8B", + "name": "gamma-Kosmos-EVAA-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.425, + "BBH": 0.5253, + "MATH Level 5": 0.0899, + "GPQA": 0.3138, + "MUSR": 0.4412, + "MMLU-PRO": 0.3776 + } + }, + { + "model_id": "jaspionjader/gamma-Kosmos-EVAA-v2-8B", + "name": "gamma-Kosmos-EVAA-v2-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4233, + "BBH": 0.5262, + "MATH Level 5": 0.1057, + "GPQA": 0.3205, + "MUSR": 0.4344, + "MMLU-PRO": 0.3756 + } + }, + { + "model_id": "jaspionjader/gamma-Kosmos-EVAA-v3-8B", + "name": "gamma-Kosmos-EVAA-v3-8B", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4333, + "BBH": 0.5278, + "MATH Level 5": 0.111, + "GPQA": 0.3129, + "MUSR": 0.4263, + "MMLU-PRO": 0.3898 + } + }, + { + "model_id": "jaspionjader/knf-2-8b", + "name": "knf-2-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.425, + "BBH": 0.5207, + "MATH Level 5": 0.1201, + "GPQA": 0.3104, + "MUSR": 0.4185, + "MMLU-PRO": 0.3875 + } + }, + { + "model_id": "jaspionjader/knfp-2-8b", + "name": "knfp-2-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.5327, + "BBH": 0.5305, + "MATH Level 5": 0.1427, + "GPQA": 0.2928, + "MUSR": 0.4185, + "MMLU-PRO": 0.3726 + } + }, + { + "model_id": "jaspionjader/knfp-3-8b", + "name": "knfp-3-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4946, + "BBH": 0.52, + "MATH Level 5": 0.1224, + "GPQA": 0.307, + "MUSR": 0.4171, + "MMLU-PRO": 0.3881 + } + }, + { + "model_id": "jaspionjader/kstc-1-8b", + "name": "kstc-1-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4643, + "BBH": 0.5209, + "MATH Level 5": 0.1171, + "GPQA": 0.3171, + "MUSR": 0.4158, + "MMLU-PRO": 0.3892 + } + }, + { + "model_id": "jaspionjader/kstc-11-8b", + "name": "kstc-11-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4757, + "BBH": 0.5189, + "MATH Level 5": 0.1201, + "GPQA": 0.3029, + "MUSR": 0.4118, + "MMLU-PRO": 0.3879 + } + }, + { + "model_id": "jaspionjader/kstc-4-8b", + "name": "kstc-4-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.477, + "BBH": 0.5216, + "MATH Level 5": 0.1239, + "GPQA": 0.3037, + "MUSR": 0.4118, + "MMLU-PRO": 0.3869 + } + }, + { + "model_id": "jaspionjader/kstc-5-8b", + "name": "kstc-5-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4721, + "BBH": 0.5211, + "MATH Level 5": 0.1299, + "GPQA": 0.3154, + "MUSR": 0.4224, + "MMLU-PRO": 0.3892 + } + }, + { + "model_id": "jaspionjader/kstc-6-8b", + "name": "kstc-6-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4944, + "BBH": 0.5231, + "MATH Level 5": 0.1246, + "GPQA": 0.2995, + "MUSR": 0.4105, + "MMLU-PRO": 0.3857 + } + }, + { + "model_id": "jaspionjader/kstc-8-8b", + "name": "kstc-8-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.491, + "BBH": 0.5239, + "MATH Level 5": 0.1307, + "GPQA": 0.3054, + "MUSR": 0.4211, + "MMLU-PRO": 0.3889 + } + }, + { + "model_id": "jaspionjader/kstc-9-8b", + "name": "kstc-9-8b", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4861, + "BBH": 0.5238, + "MATH Level 5": 0.136, + "GPQA": 0.3012, + "MUSR": 0.4118, + "MMLU-PRO": 0.3872 + } + }, + { + "model_id": "jaspionjader/slu-10", + "name": "slu-10", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.436, + "BBH": 0.5096, + "MATH Level 5": 0.0974, + "GPQA": 0.3138, + "MUSR": 0.392, + "MMLU-PRO": 0.3664 + } + }, + { + "model_id": "jaspionjader/slu-11", + "name": "slu-11", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.3725, + "BBH": 0.489, + "MATH Level 5": 0.0559, + "GPQA": 0.3037, + "MUSR": 0.3919, + "MMLU-PRO": 0.3382 + } + }, + { + "model_id": "jaspionjader/slu-13", + "name": "slu-13", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4378, + "BBH": 0.5097, + "MATH Level 5": 0.0808, + "GPQA": 0.3079, + "MUSR": 0.3814, + "MMLU-PRO": 0.358 + } + }, + { + "model_id": "jaspionjader/slu-14", + "name": "slu-14", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4107, + "BBH": 0.5089, + "MATH Level 5": 0.0974, + "GPQA": 0.3079, + "MUSR": 0.396, + "MMLU-PRO": 0.3627 + } + }, + { + "model_id": "jaspionjader/slu-17", + "name": "slu-17", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4217, + "BBH": 0.5071, + "MATH Level 5": 0.0853, + "GPQA": 0.3087, + "MUSR": 0.3761, + "MMLU-PRO": 0.3619 + } + }, + { + "model_id": "jaspionjader/slu-2", + "name": "slu-2", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4016, + "BBH": 0.5008, + "MATH Level 5": 0.0634, + "GPQA": 0.2987, + "MUSR": 0.3959, + "MMLU-PRO": 0.3506 + } + }, + { + "model_id": "jaspionjader/slu-20", + "name": "slu-20", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4393, + "BBH": 0.5061, + "MATH Level 5": 0.0869, + "GPQA": 0.3087, + "MUSR": 0.3933, + "MMLU-PRO": 0.3665 + } + }, + { + "model_id": "jaspionjader/slu-22", + "name": "slu-22", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4321, + "BBH": 0.5082, + "MATH Level 5": 0.0793, + "GPQA": 0.3163, + "MUSR": 0.3893, + "MMLU-PRO": 0.365 + } + }, + { + "model_id": "jaspionjader/slu-23", + "name": "slu-23", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4478, + "BBH": 0.5132, + "MATH Level 5": 0.0944, + "GPQA": 0.3045, + "MUSR": 0.4092, + "MMLU-PRO": 0.3725 + } + }, + { + "model_id": "jaspionjader/slu-25", + "name": "slu-25", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.45, + "BBH": 0.5095, + "MATH Level 5": 0.0838, + "GPQA": 0.3087, + "MUSR": 0.3946, + "MMLU-PRO": 0.3684 + } + }, + { + "model_id": "jaspionjader/slu-29", + "name": "slu-29", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4431, + "BBH": 0.5096, + "MATH Level 5": 0.0869, + "GPQA": 0.307, + "MUSR": 0.3933, + "MMLU-PRO": 0.3669 + } + }, + { + "model_id": "jaspionjader/slu-32", + "name": "slu-32", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4516, + "BBH": 0.5167, + "MATH Level 5": 0.1073, + "GPQA": 0.302, + "MUSR": 0.4039, + "MMLU-PRO": 0.3766 + } + }, + { + "model_id": "jaspionjader/slu-33", + "name": "slu-33", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4457, + "BBH": 0.5081, + "MATH Level 5": 0.0997, + "GPQA": 0.3121, + "MUSR": 0.3867, + "MMLU-PRO": 0.3679 + } + }, + { + "model_id": "jaspionjader/slu-34", + "name": "slu-34", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4351, + "BBH": 0.5077, + "MATH Level 5": 0.0997, + "GPQA": 0.3079, + "MUSR": 0.388, + "MMLU-PRO": 0.372 + } + }, + { + "model_id": "jaspionjader/slu-35", + "name": "slu-35", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4242, + "BBH": 0.5103, + "MATH Level 5": 0.1012, + "GPQA": 0.3121, + "MUSR": 0.3946, + "MMLU-PRO": 0.3676 + } + }, + { + "model_id": "jaspionjader/slu-36", + "name": "slu-36", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4518, + "BBH": 0.5087, + "MATH Level 5": 0.0906, + "GPQA": 0.3121, + "MUSR": 0.3933, + "MMLU-PRO": 0.3711 + } + }, + { + "model_id": "jaspionjader/slu-37", + "name": "slu-37", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4534, + "BBH": 0.51, + "MATH Level 5": 0.0974, + "GPQA": 0.307, + "MUSR": 0.3946, + "MMLU-PRO": 0.3695 + } + }, + { + "model_id": "jaspionjader/slu-6", + "name": "slu-6", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4117, + "BBH": 0.5099, + "MATH Level 5": 0.0944, + "GPQA": 0.3029, + "MUSR": 0.4066, + "MMLU-PRO": 0.3611 + } + }, + { + "model_id": "jaspionjader/slu-mix-1", + "name": "slu-mix-1", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4569, + "BBH": 0.524, + "MATH Level 5": 0.1118, + "GPQA": 0.3003, + "MUSR": 0.4277, + "MMLU-PRO": 0.393 + } + }, + { + "model_id": "jaspionjader/sof-1", + "name": "sof-1", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4314, + "BBH": 0.501, + "MATH Level 5": 0.114, + "GPQA": 0.3029, + "MUSR": 0.4082, + "MMLU-PRO": 0.3674 + } + }, + { + "model_id": "jaspionjader/sof-10", + "name": "sof-10", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4648, + "BBH": 0.5197, + "MATH Level 5": 0.1239, + "GPQA": 0.3062, + "MUSR": 0.4091, + "MMLU-PRO": 0.3874 + } + }, + { + "model_id": "jaspionjader/sof-3", + "name": "sof-3", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4637, + "BBH": 0.5206, + "MATH Level 5": 0.1276, + "GPQA": 0.297, + "MUSR": 0.4131, + "MMLU-PRO": 0.3812 + } + }, + { + "model_id": "jaspionjader/sof-6", + "name": "sof-6", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4354, + "BBH": 0.5209, + "MATH Level 5": 0.1299, + "GPQA": 0.2995, + "MUSR": 0.4171, + "MMLU-PRO": 0.3844 + } + }, + { + "model_id": "jaspionjader/test-10", + "name": "test-10", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4578, + "BBH": 0.5316, + "MATH Level 5": 0.114, + "GPQA": 0.3196, + "MUSR": 0.4251, + "MMLU-PRO": 0.3936 + } + }, + { + "model_id": "jaspionjader/test-11", + "name": "test-11", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4541, + "BBH": 0.535, + "MATH Level 5": 0.1201, + "GPQA": 0.3238, + "MUSR": 0.429, + "MMLU-PRO": 0.3939 + } + }, + { + "model_id": "jaspionjader/test-12", + "name": "test-12", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4368, + "BBH": 0.5347, + "MATH Level 5": 0.108, + "GPQA": 0.3188, + "MUSR": 0.425, + "MMLU-PRO": 0.3935 + } + }, + { + "model_id": "jaspionjader/test-13", + "name": "test-13", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4581, + "BBH": 0.5318, + "MATH Level 5": 0.1057, + "GPQA": 0.3163, + "MUSR": 0.4264, + "MMLU-PRO": 0.3935 + } + }, + { + "model_id": "jaspionjader/test-14", + "name": "test-14", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4444, + "BBH": 0.5323, + "MATH Level 5": 0.1103, + "GPQA": 0.3146, + "MUSR": 0.4317, + "MMLU-PRO": 0.393 + } + }, + { + "model_id": "jaspionjader/test-15", + "name": "test-15", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4365, + "BBH": 0.5328, + "MATH Level 5": 0.1118, + "GPQA": 0.3121, + "MUSR": 0.4264, + "MMLU-PRO": 0.393 + } + }, + { + "model_id": "jaspionjader/test-16", + "name": "test-16", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4599, + "BBH": 0.533, + "MATH Level 5": 0.1095, + "GPQA": 0.3138, + "MUSR": 0.4225, + "MMLU-PRO": 0.393 + } + }, + { + "model_id": "jaspionjader/test-17", + "name": "test-17", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4267, + "BBH": 0.5329, + "MATH Level 5": 0.1103, + "GPQA": 0.3129, + "MUSR": 0.429, + "MMLU-PRO": 0.3929 + } + }, + { + "model_id": "jaspionjader/test-18", + "name": "test-18", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4392, + "BBH": 0.5317, + "MATH Level 5": 0.1148, + "GPQA": 0.3121, + "MUSR": 0.4251, + "MMLU-PRO": 0.393 + } + }, + { + "model_id": "jaspionjader/test-19", + "name": "test-19", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4401, + "BBH": 0.5319, + "MATH Level 5": 0.1095, + "GPQA": 0.3096, + "MUSR": 0.4264, + "MMLU-PRO": 0.3929 + } + }, + { + "model_id": "jaspionjader/test-20", + "name": "test-20", + "developer": "jaspionjader", + "scores": { + "IFEval": 0.4529, + "BBH": 0.5327, + "MATH Level 5": 0.1118, + "GPQA": 0.3138, + "MUSR": 0.4251, + "MMLU-PRO": 0.392 + } + }, + { + "model_id": "jayasuryajsk/Qwen2.5-3B-reasoner", + "name": "Qwen2.5-3B-reasoner", + "developer": "jayasuryajsk", + "scores": { + "IFEval": 0.416, + "BBH": 0.4651, + "MATH Level 5": 0.2085, + "GPQA": 0.302, + "MUSR": 0.4123, + "MMLU-PRO": 0.3482 + } + }, + { + "model_id": "jeanmichela/o-distil-qwen", + "name": "o-distil-qwen", + "developer": "jeanmichela", + "scores": { + "IFEval": 0.4482, + "BBH": 0.59, + "MATH Level 5": 0.565, + "GPQA": 0.3935, + "MUSR": 0.534, + "MMLU-PRO": 0.4658 + } + }, + { + "model_id": "jebcarter/psyonic-cetacean-20B", + "name": "psyonic-cetacean-20B", + "developer": "jebcarter", + "scores": { + "IFEval": 0.2544, + "BBH": 0.4907, + "MATH Level 5": 0.0181, + "GPQA": 0.2735, + "MUSR": 0.4661, + "MMLU-PRO": 0.2886 + } + }, + { + "model_id": "jebish7/Llama-3-Nanda-10B-Chat", + "name": "Llama-3-Nanda-10B-Chat", + "developer": "jebish7", + "scores": { + "IFEval": 0.2953, + "BBH": 0.4959, + "MATH Level 5": 0.0559, + "GPQA": 0.3012, + "MUSR": 0.4356, + "MMLU-PRO": 0.3157 + } + }, + { + "model_id": "jebish7/Llama-3.1-8B-Instruct", + "name": "Llama-3.1-8B-Instruct", + "developer": "jebish7", + "scores": { + "IFEval": 0.5058, + "BBH": 0.5088, + "MATH Level 5": 0.1548, + "GPQA": 0.3213, + "MUSR": 0.3998, + "MMLU-PRO": 0.3777 + } + }, + { + "model_id": "jebish7/Nemotron-4-Mini-Hindi-4B-Base", + "name": "Nemotron-4-Mini-Hindi-4B-Base", + "developer": "jebish7", + "scores": { + "IFEval": 0.2285, + "BBH": 0.3924, + "MATH Level 5": 0.0272, + "GPQA": 0.2836, + "MUSR": 0.4249, + "MMLU-PRO": 0.2503 + } + }, + { + "model_id": "jebish7/Nemotron-4-Mini-Hindi-4B-Instruct", + "name": "Nemotron-4-Mini-Hindi-4B-Instruct", + "developer": "jebish7", + "scores": { + "IFEval": 0.3345, + "BBH": 0.4041, + "MATH Level 5": 0.0287, + "GPQA": 0.3087, + "MUSR": 0.4153, + "MMLU-PRO": 0.2595 + } + }, + { + "model_id": "jebish7/Nemotron-Mini-4B-Instruct", + "name": "Nemotron-Mini-4B-Instruct", + "developer": "jebish7", + "scores": { + "IFEval": 0.3709, + "BBH": 0.4244, + "MATH Level 5": 0.0325, + "GPQA": 0.276, + "MUSR": 0.4727, + "MMLU-PRO": 0.2783 + } + }, + { + "model_id": "jebish7/aya-expanse-8b", + "name": "aya-expanse-8b", + "developer": "jebish7", + "scores": { + "IFEval": 0.3791, + "BBH": 0.4969, + "MATH Level 5": 0.0816, + "GPQA": 0.2836, + "MUSR": 0.3869, + "MMLU-PRO": 0.3103 + } + }, + { + "model_id": "jebish7/gemma-2-2b-it", + "name": "gemma-2-2b-it", + "developer": "jebish7", + "scores": { + "IFEval": 0.1272, + "BBH": 0.4395, + "MATH Level 5": 0.034, + "GPQA": 0.297, + "MUSR": 0.4244, + "MMLU-PRO": 0.2715 + } + }, + { + "model_id": "jebish7/gemma-2-9b-it", + "name": "gemma-2-9b-it", + "developer": "jebish7", + "scores": { + "IFEval": 0.1557, + "BBH": 0.5949, + "MATH Level 5": 0.0846, + "GPQA": 0.3473, + "MUSR": 0.4554, + "MMLU-PRO": 0.4143 + } + }, + { + "model_id": "jebish7/qwen2.5-0.5B-IHA-Hin", + "name": "qwen2.5-0.5B-IHA-Hin", + "developer": "jebish7", + "scores": { + "IFEval": 0.1416, + "BBH": 0.2989, + "MATH Level 5": 0.0, + "GPQA": 0.2525, + "MUSR": 0.3475, + "MMLU-PRO": 0.1094 + } + }, + { + "model_id": "jeffmeloy/Qwen-7B-nerd-uncensored-v1.0", + "name": "Qwen-7B-nerd-uncensored-v1.0", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.6136, + "BBH": 0.5421, + "MATH Level 5": 0.287, + "GPQA": 0.328, + "MUSR": 0.4793, + "MMLU-PRO": 0.4363 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-minperplexity-2", + "name": "Qwen2.5-7B-minperplexity-2", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.5097, + "BBH": 0.5524, + "MATH Level 5": 0.3014, + "GPQA": 0.3112, + "MUSR": 0.4625, + "MMLU-PRO": 0.4346 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v0.9", + "name": "Qwen2.5-7B-nerd-uncensored-v0.9", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.6048, + "BBH": 0.547, + "MATH Level 5": 0.2946, + "GPQA": 0.323, + "MUSR": 0.482, + "MMLU-PRO": 0.4363 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.0", + "name": "Qwen2.5-7B-nerd-uncensored-v1.0", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.7695, + "BBH": 0.5418, + "MATH Level 5": 0.4713, + "GPQA": 0.2903, + "MUSR": 0.4551, + "MMLU-PRO": 0.4254 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.1", + "name": "Qwen2.5-7B-nerd-uncensored-v1.1", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.6626, + "BBH": 0.4864, + "MATH Level 5": 0.1329, + "GPQA": 0.2869, + "MUSR": 0.3843, + "MMLU-PRO": 0.385 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.2", + "name": "Qwen2.5-7B-nerd-uncensored-v1.2", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.4965, + "BBH": 0.4946, + "MATH Level 5": 0.1208, + "GPQA": 0.3037, + "MUSR": 0.4172, + "MMLU-PRO": 0.3969 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.3", + "name": "Qwen2.5-7B-nerd-uncensored-v1.3", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.4995, + "BBH": 0.5026, + "MATH Level 5": 0.1231, + "GPQA": 0.3129, + "MUSR": 0.4187, + "MMLU-PRO": 0.4016 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.4", + "name": "Qwen2.5-7B-nerd-uncensored-v1.4", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.6079, + "BBH": 0.5467, + "MATH Level 5": 0.281, + "GPQA": 0.3238, + "MUSR": 0.4714, + "MMLU-PRO": 0.4419 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.5", + "name": "Qwen2.5-7B-nerd-uncensored-v1.5", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.565, + "BBH": 0.5523, + "MATH Level 5": 0.2757, + "GPQA": 0.3272, + "MUSR": 0.4982, + "MMLU-PRO": 0.4448 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.7", + "name": "Qwen2.5-7B-nerd-uncensored-v1.7", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.4202, + "BBH": 0.5392, + "MATH Level 5": 0.2915, + "GPQA": 0.3238, + "MUSR": 0.4848, + "MMLU-PRO": 0.428 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.8", + "name": "Qwen2.5-7B-nerd-uncensored-v1.8", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.6256, + "BBH": 0.5447, + "MATH Level 5": 0.2704, + "GPQA": 0.3238, + "MUSR": 0.4767, + "MMLU-PRO": 0.4343 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-olm-v1.0", + "name": "Qwen2.5-7B-olm-v1.0", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.5331, + "BBH": 0.566, + "MATH Level 5": 0.2863, + "GPQA": 0.3205, + "MUSR": 0.4278, + "MMLU-PRO": 0.4566 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-olm-v1.1", + "name": "Qwen2.5-7B-olm-v1.1", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.4329, + "BBH": 0.5478, + "MATH Level 5": 0.3829, + "GPQA": 0.3087, + "MUSR": 0.4808, + "MMLU-PRO": 0.4354 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-olm-v1.2", + "name": "Qwen2.5-7B-olm-v1.2", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.4203, + "BBH": 0.5533, + "MATH Level 5": 0.2847, + "GPQA": 0.3171, + "MUSR": 0.4688, + "MMLU-PRO": 0.4387 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-olm-v1.3", + "name": "Qwen2.5-7B-olm-v1.3", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.4219, + "BBH": 0.5532, + "MATH Level 5": 0.3104, + "GPQA": 0.3213, + "MUSR": 0.4701, + "MMLU-PRO": 0.447 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-olm-v1.4", + "name": "Qwen2.5-7B-olm-v1.4", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.4545, + "BBH": 0.5582, + "MATH Level 5": 0.2923, + "GPQA": 0.3121, + "MUSR": 0.4622, + "MMLU-PRO": 0.4457 + } + }, + { + "model_id": "jeffmeloy/Qwen2.5-7B-olm-v1.5", + "name": "Qwen2.5-7B-olm-v1.5", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.4547, + "BBH": 0.5544, + "MATH Level 5": 0.2817, + "GPQA": 0.3398, + "MUSR": 0.4539, + "MMLU-PRO": 0.4399 + } + }, + { + "model_id": "jeffmeloy/jeffmeloy_Qwen2.5-7B-minperplexity-1", + "name": "jeffmeloy_Qwen2.5-7B-minperplexity-1", + "developer": "jeffmeloy", + "scores": { + "IFEval": 0.3757, + "BBH": 0.5582, + "MATH Level 5": 0.2915, + "GPQA": 0.3322, + "MUSR": 0.429, + "MMLU-PRO": 0.4368 + } + }, + { + "model_id": "jeonsworld/CarbonVillain-en-10.7B-v4", + "name": "CarbonVillain-en-10.7B-v4", + "developer": "jeonsworld", + "scores": { + "IFEval": 0.4579, + "BBH": 0.5168, + "MATH Level 5": 0.0468, + "GPQA": 0.3062, + "MUSR": 0.3965, + "MMLU-PRO": 0.3142 + } + }, + { + "model_id": "jiangxinyang-shanda/Homer-LLama3-8B", + "name": "Homer-LLama3-8B", + "developer": "jiangxinyang-shanda", + "scores": { + "IFEval": 0.3992, + "BBH": 0.5173, + "MATH Level 5": 0.0861, + "GPQA": 0.297, + "MUSR": 0.4056, + "MMLU-PRO": 0.3139 + } + }, + { + "model_id": "jieliu/Storm-7B", + "name": "Storm-7B", + "developer": "jieliu", + "scores": { + "IFEval": 0.3424, + "BBH": 0.5187, + "MATH Level 5": 0.0612, + "GPQA": 0.3079, + "MUSR": 0.4429, + "MMLU-PRO": 0.3119 + } + }, + { + "model_id": "jiviai/medX_v2", + "name": "medX_v2", + "developer": "jiviai", + "scores": { + "IFEval": 0.3743, + "BBH": 0.4509, + "MATH Level 5": 0.0544, + "GPQA": 0.323, + "MUSR": 0.3498, + "MMLU-PRO": 0.3428 + } + }, + { + "model_id": "jlzhou/Qwen2.5-3B-Infinity-Instruct-0625", + "name": "Qwen2.5-3B-Infinity-Instruct-0625", + "developer": "jlzhou", + "scores": { + "IFEval": 0.3558, + "BBH": 0.4774, + "MATH Level 5": 0.1367, + "GPQA": 0.2693, + "MUSR": 0.3981, + "MMLU-PRO": 0.3199 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01", + "developer": "johnsutor", + "scores": { + "IFEval": 0.4271, + "BBH": 0.5036, + "MATH Level 5": 0.0453, + "GPQA": 0.3221, + "MUSR": 0.4638, + "MMLU-PRO": 0.3739 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1", + "developer": "johnsutor", + "scores": { + "IFEval": 0.4253, + "BBH": 0.5019, + "MATH Level 5": 0.0967, + "GPQA": 0.3012, + "MUSR": 0.415, + "MMLU-PRO": 0.3724 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01", + "developer": "johnsutor", + "scores": { + "IFEval": 0.3377, + "BBH": 0.4917, + "MATH Level 5": 0.0106, + "GPQA": 0.3121, + "MUSR": 0.5018, + "MMLU-PRO": 0.3533 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1", + "developer": "johnsutor", + "scores": { + "IFEval": 0.4274, + "BBH": 0.5126, + "MATH Level 5": 0.0808, + "GPQA": 0.3087, + "MUSR": 0.4226, + "MMLU-PRO": 0.3739 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01", + "developer": "johnsutor", + "scores": { + "IFEval": 0.3204, + "BBH": 0.4884, + "MATH Level 5": 0.0038, + "GPQA": 0.302, + "MUSR": 0.5098, + "MMLU-PRO": 0.3344 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1", + "developer": "johnsutor", + "scores": { + "IFEval": 0.4396, + "BBH": 0.514, + "MATH Level 5": 0.0801, + "GPQA": 0.307, + "MUSR": 0.4398, + "MMLU-PRO": 0.3696 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01", + "developer": "johnsutor", + "scores": { + "IFEval": 0.2814, + "BBH": 0.4854, + "MATH Level 5": 0.0023, + "GPQA": 0.2903, + "MUSR": 0.5163, + "MMLU-PRO": 0.3295 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1", + "developer": "johnsutor", + "scores": { + "IFEval": 0.4302, + "BBH": 0.5157, + "MATH Level 5": 0.0627, + "GPQA": 0.3079, + "MUSR": 0.4332, + "MMLU-PRO": 0.3663 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01", + "developer": "johnsutor", + "scores": { + "IFEval": 0.279, + "BBH": 0.4861, + "MATH Level 5": 0.0015, + "GPQA": 0.2945, + "MUSR": 0.515, + "MMLU-PRO": 0.3305 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1", + "developer": "johnsutor", + "scores": { + "IFEval": 0.4223, + "BBH": 0.5154, + "MATH Level 5": 0.074, + "GPQA": 0.3079, + "MUSR": 0.4384, + "MMLU-PRO": 0.365 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01", + "developer": "johnsutor", + "scores": { + "IFEval": 0.4359, + "BBH": 0.5041, + "MATH Level 5": 0.0483, + "GPQA": 0.3104, + "MUSR": 0.4532, + "MMLU-PRO": 0.3762 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1", + "developer": "johnsutor", + "scores": { + "IFEval": 0.4202, + "BBH": 0.5011, + "MATH Level 5": 0.0982, + "GPQA": 0.3003, + "MUSR": 0.415, + "MMLU-PRO": 0.3699 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01", + "developer": "johnsutor", + "scores": { + "IFEval": 0.3518, + "BBH": 0.4999, + "MATH Level 5": 0.0234, + "GPQA": 0.3062, + "MUSR": 0.4871, + "MMLU-PRO": 0.3611 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1", + "developer": "johnsutor", + "scores": { + "IFEval": 0.4204, + "BBH": 0.5107, + "MATH Level 5": 0.0876, + "GPQA": 0.3045, + "MUSR": 0.4279, + "MMLU-PRO": 0.371 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01", + "developer": "johnsutor", + "scores": { + "IFEval": 0.3454, + "BBH": 0.4984, + "MATH Level 5": 0.0219, + "GPQA": 0.297, + "MUSR": 0.4911, + "MMLU-PRO": 0.3531 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1", + "developer": "johnsutor", + "scores": { + "IFEval": 0.4092, + "BBH": 0.5137, + "MATH Level 5": 0.0808, + "GPQA": 0.2953, + "MUSR": 0.4357, + "MMLU-PRO": 0.3669 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01", + "developer": "johnsutor", + "scores": { + "IFEval": 0.2904, + "BBH": 0.4967, + "MATH Level 5": 0.0144, + "GPQA": 0.2995, + "MUSR": 0.4991, + "MMLU-PRO": 0.349 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1", + "developer": "johnsutor", + "scores": { + "IFEval": 0.4199, + "BBH": 0.5147, + "MATH Level 5": 0.0808, + "GPQA": 0.2987, + "MUSR": 0.4358, + "MMLU-PRO": 0.3615 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01", + "developer": "johnsutor", + "scores": { + "IFEval": 0.2913, + "BBH": 0.4918, + "MATH Level 5": 0.0106, + "GPQA": 0.3003, + "MUSR": 0.4977, + "MMLU-PRO": 0.3454 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1", + "developer": "johnsutor", + "scores": { + "IFEval": 0.4162, + "BBH": 0.5139, + "MATH Level 5": 0.0778, + "GPQA": 0.297, + "MUSR": 0.4317, + "MMLU-PRO": 0.3625 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_dare_linear", + "name": "Llama-3-8B-Instruct_dare_linear", + "developer": "johnsutor", + "scores": { + "IFEval": 0.2145, + "BBH": 0.4283, + "MATH Level 5": 0.0, + "GPQA": 0.2961, + "MUSR": 0.4979, + "MMLU-PRO": 0.2414 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.1", + "name": "Llama-3-8B-Instruct_dare_ties-density-0.1", + "developer": "johnsutor", + "scores": { + "IFEval": 0.1891, + "BBH": 0.4119, + "MATH Level 5": 0.0008, + "GPQA": 0.2718, + "MUSR": 0.4658, + "MMLU-PRO": 0.2265 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.3", + "name": "Llama-3-8B-Instruct_dare_ties-density-0.3", + "developer": "johnsutor", + "scores": { + "IFEval": 0.2113, + "BBH": 0.4559, + "MATH Level 5": 0.0015, + "GPQA": 0.297, + "MUSR": 0.5069, + "MMLU-PRO": 0.304 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.7", + "name": "Llama-3-8B-Instruct_dare_ties-density-0.7", + "developer": "johnsutor", + "scores": { + "IFEval": 0.2034, + "BBH": 0.4723, + "MATH Level 5": 0.003, + "GPQA": 0.3037, + "MUSR": 0.511, + "MMLU-PRO": 0.3148 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.9", + "name": "Llama-3-8B-Instruct_dare_ties-density-0.9", + "developer": "johnsutor", + "scores": { + "IFEval": 0.2161, + "BBH": 0.4664, + "MATH Level 5": 0.0015, + "GPQA": 0.3079, + "MUSR": 0.523, + "MMLU-PRO": 0.3143 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_linear", + "name": "Llama-3-8B-Instruct_linear", + "developer": "johnsutor", + "scores": { + "IFEval": 0.4308, + "BBH": 0.5031, + "MATH Level 5": 0.1005, + "GPQA": 0.2953, + "MUSR": 0.4097, + "MMLU-PRO": 0.3712 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.1", + "name": "Llama-3-8B-Instruct_ties-density-0.1", + "developer": "johnsutor", + "scores": { + "IFEval": 0.4116, + "BBH": 0.5021, + "MATH Level 5": 0.0793, + "GPQA": 0.2886, + "MUSR": 0.4174, + "MMLU-PRO": 0.36 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.3", + "name": "Llama-3-8B-Instruct_ties-density-0.3", + "developer": "johnsutor", + "scores": { + "IFEval": 0.3626, + "BBH": 0.4906, + "MATH Level 5": 0.0672, + "GPQA": 0.2961, + "MUSR": 0.4025, + "MMLU-PRO": 0.3321 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.5", + "name": "Llama-3-8B-Instruct_ties-density-0.5", + "developer": "johnsutor", + "scores": { + "IFEval": 0.3797, + "BBH": 0.4793, + "MATH Level 5": 0.0612, + "GPQA": 0.3045, + "MUSR": 0.388, + "MMLU-PRO": 0.3175 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.7", + "name": "Llama-3-8B-Instruct_ties-density-0.7", + "developer": "johnsutor", + "scores": { + "IFEval": 0.3681, + "BBH": 0.4738, + "MATH Level 5": 0.0672, + "GPQA": 0.3096, + "MUSR": 0.3881, + "MMLU-PRO": 0.3152 + } + }, + { + "model_id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.9", + "name": "Llama-3-8B-Instruct_ties-density-0.9", + "developer": "johnsutor", + "scores": { + "IFEval": 0.3858, + "BBH": 0.4735, + "MATH Level 5": 0.0619, + "GPQA": 0.2995, + "MUSR": 0.388, + "MMLU-PRO": 0.3182 + } + }, + { + "model_id": "jpacifico/Chocolatine-14B-Instruct-4k-DPO", + "name": "Chocolatine-14B-Instruct-4k-DPO", + "developer": "jpacifico", + "scores": { + "IFEval": 0.4689, + "BBH": 0.63, + "MATH Level 5": 0.1782, + "GPQA": 0.3414, + "MUSR": 0.4439, + "MMLU-PRO": 0.4764 + } + }, + { + "model_id": "jpacifico/Chocolatine-14B-Instruct-DPO-v1.2", + "name": "Chocolatine-14B-Instruct-DPO-v1.2", + "developer": "jpacifico", + "scores": { + "IFEval": 0.6852, + "BBH": 0.6438, + "MATH Level 5": 0.2092, + "GPQA": 0.3255, + "MUSR": 0.4268, + "MMLU-PRO": 0.4697 + } + }, + { + "model_id": "jpacifico/Chocolatine-14B-Instruct-DPO-v1.3", + "name": "Chocolatine-14B-Instruct-DPO-v1.3", + "developer": "jpacifico", + "scores": { + "IFEval": 0.704, + "BBH": 0.6846, + "MATH Level 5": 0.5619, + "GPQA": 0.3414, + "MUSR": 0.4234, + "MMLU-PRO": 0.5374 + } + }, + { + "model_id": "jpacifico/Chocolatine-2-14B-Instruct-DPO-v2.0b1", + "name": "Chocolatine-2-14B-Instruct-DPO-v2.0b1", + "developer": "jpacifico", + "scores": { + "IFEval": 0.1033, + "BBH": 0.6696, + "MATH Level 5": 0.2757, + "GPQA": 0.3758, + "MUSR": 0.4467, + "MMLU-PRO": 0.5124 + } + }, + { + "model_id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0", + "name": "Chocolatine-2-14B-Instruct-v2.0", + "developer": "jpacifico", + "scores": { + "IFEval": 0.0885, + "BBH": 0.677, + "MATH Level 5": 0.4804, + "GPQA": 0.3876, + "MUSR": 0.5021, + "MMLU-PRO": 0.5302 + } + }, + { + "model_id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0.1", + "name": "Chocolatine-2-14B-Instruct-v2.0.1", + "developer": "jpacifico", + "scores": { + "IFEval": 0.0742, + "BBH": 0.6736, + "MATH Level 5": 0.4796, + "GPQA": 0.3918, + "MUSR": 0.5008, + "MMLU-PRO": 0.5299 + } + }, + { + "model_id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0.3", + "name": "Chocolatine-2-14B-Instruct-v2.0.3", + "developer": "jpacifico", + "scores": { + "IFEval": 0.7037, + "BBH": 0.6548, + "MATH Level 5": 0.4207, + "GPQA": 0.3792, + "MUSR": 0.4768, + "MMLU-PRO": 0.5374 + } + }, + { + "model_id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0b2", + "name": "Chocolatine-2-14B-Instruct-v2.0b2", + "developer": "jpacifico", + "scores": { + "IFEval": 0.7241, + "BBH": 0.6476, + "MATH Level 5": 0.395, + "GPQA": 0.3834, + "MUSR": 0.4808, + "MMLU-PRO": 0.5369 + } + }, + { + "model_id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0b3", + "name": "Chocolatine-2-14B-Instruct-v2.0b3", + "developer": "jpacifico", + "scores": { + "IFEval": 0.7323, + "BBH": 0.6469, + "MATH Level 5": 0.4109, + "GPQA": 0.3792, + "MUSR": 0.4781, + "MMLU-PRO": 0.5337 + } + }, + { + "model_id": "jpacifico/Chocolatine-3B-Instruct-DPO-Revised", + "name": "Chocolatine-3B-Instruct-DPO-Revised", + "developer": "jpacifico", + "scores": { + "IFEval": 0.5623, + "BBH": 0.554, + "MATH Level 5": 0.1805, + "GPQA": 0.3221, + "MUSR": 0.4453, + "MMLU-PRO": 0.3989 + } + }, + { + "model_id": "jpacifico/Chocolatine-3B-Instruct-DPO-v1.0", + "name": "Chocolatine-3B-Instruct-DPO-v1.0", + "developer": "jpacifico", + "scores": { + "IFEval": 0.3737, + "BBH": 0.5471, + "MATH Level 5": 0.1782, + "GPQA": 0.3154, + "MUSR": 0.4755, + "MMLU-PRO": 0.3937 + } + }, + { + "model_id": "jpacifico/Chocolatine-3B-Instruct-DPO-v1.2", + "name": "Chocolatine-3B-Instruct-DPO-v1.2", + "developer": "jpacifico", + "scores": { + "IFEval": 0.5455, + "BBH": 0.5487, + "MATH Level 5": 0.2047, + "GPQA": 0.3389, + "MUSR": 0.4154, + "MMLU-PRO": 0.3877 + } + }, + { + "model_id": "jpacifico/Distilucie-7B-Math-Instruct-DPO-v0.1", + "name": "Distilucie-7B-Math-Instruct-DPO-v0.1", + "developer": "jpacifico", + "scores": { + "IFEval": 0.3048, + "BBH": 0.3835, + "MATH Level 5": 0.0257, + "GPQA": 0.2995, + "MUSR": 0.3644, + "MMLU-PRO": 0.1809 + } + }, + { + "model_id": "jpacifico/Lucie-7B-Instruct-DPO-v1.1", + "name": "Lucie-7B-Instruct-DPO-v1.1", + "developer": "jpacifico", + "scores": { + "IFEval": 0.3121, + "BBH": 0.3781, + "MATH Level 5": 0.0234, + "GPQA": 0.2878, + "MUSR": 0.4016, + "MMLU-PRO": 0.1838 + } + }, + { + "model_id": "jpacifico/Lucie-7B-Instruct-DPO-v1.1.3", + "name": "Lucie-7B-Instruct-DPO-v1.1.3", + "developer": "jpacifico", + "scores": { + "IFEval": 0.3045, + "BBH": 0.3819, + "MATH Level 5": 0.0242, + "GPQA": 0.2861, + "MUSR": 0.3818, + "MMLU-PRO": 0.1764 + } + }, + { + "model_id": "jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.0", + "name": "Lucie-7B-Instruct-Merged-Model_Stock-v1.0", + "developer": "jpacifico", + "scores": { + "IFEval": 0.3234, + "BBH": 0.3802, + "MATH Level 5": 0.0242, + "GPQA": 0.2886, + "MUSR": 0.3844, + "MMLU-PRO": 0.1871 + } + }, + { + "model_id": "jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.1", + "name": "Lucie-7B-Instruct-Merged-Model_Stock-v1.1", + "developer": "jpacifico", + "scores": { + "IFEval": 0.3014, + "BBH": 0.3808, + "MATH Level 5": 0.0279, + "GPQA": 0.2827, + "MUSR": 0.375, + "MMLU-PRO": 0.1862 + } + }, + { + "model_id": "jpacifico/Lucie-Boosted-7B-Instruct", + "name": "Lucie-Boosted-7B-Instruct", + "developer": "jpacifico", + "scores": { + "IFEval": 0.2566, + "BBH": 0.3465, + "MATH Level 5": 0.0128, + "GPQA": 0.2668, + "MUSR": 0.3699, + "MMLU-PRO": 0.163 + } + }, + { + "model_id": "jsfs11/L3-8B-Stheno-slerp", + "name": "L3-8B-Stheno-slerp", + "developer": "jsfs11", + "scores": { + "IFEval": 0.6752, + "BBH": 0.5326, + "MATH Level 5": 0.0989, + "GPQA": 0.2852, + "MUSR": 0.3725, + "MMLU-PRO": 0.3649 + } + }, + { + "model_id": "jsfs11/MixtureofMerges-MoE-4x7b-v4", + "name": "MixtureofMerges-MoE-4x7b-v4", + "developer": "jsfs11", + "scores": { + "IFEval": 0.403, + "BBH": 0.5169, + "MATH Level 5": 0.0634, + "GPQA": 0.2861, + "MUSR": 0.4386, + "MMLU-PRO": 0.3032 + } + }, + { + "model_id": "jsfs11/MixtureofMerges-MoE-4x7b-v5", + "name": "MixtureofMerges-MoE-4x7b-v5", + "developer": "jsfs11", + "scores": { + "IFEval": 0.4199, + "BBH": 0.5198, + "MATH Level 5": 0.0755, + "GPQA": 0.2844, + "MUSR": 0.4305, + "MMLU-PRO": 0.3098 + } + }, + { + "model_id": "kaist-ai/janus-7b", + "name": "janus-7b", + "developer": "kaist-ai", + "scores": { + "IFEval": 0.3775, + "BBH": 0.4694, + "MATH Level 5": 0.0408, + "GPQA": 0.2727, + "MUSR": 0.4401, + "MMLU-PRO": 0.2874 + } + }, + { + "model_id": "kaist-ai/janus-dpo-7b", + "name": "janus-dpo-7b", + "developer": "kaist-ai", + "scores": { + "IFEval": 0.4003, + "BBH": 0.4773, + "MATH Level 5": 0.0415, + "GPQA": 0.2819, + "MUSR": 0.4387, + "MMLU-PRO": 0.2976 + } + }, + { + "model_id": "kaist-ai/janus-rm-7b", + "name": "janus-rm-7b", + "developer": "kaist-ai", + "scores": { + "IFEval": 0.1778, + "BBH": 0.3056, + "MATH Level 5": 0.0, + "GPQA": 0.2517, + "MUSR": 0.3883, + "MMLU-PRO": 0.1126 + } + }, + { + "model_id": "kaist-ai/mistral-orpo-capybara-7k", + "name": "mistral-orpo-capybara-7k", + "developer": "kaist-ai", + "scores": { + "IFEval": 0.5367, + "BBH": 0.4489, + "MATH Level 5": 0.0393, + "GPQA": 0.2861, + "MUSR": 0.3964, + "MMLU-PRO": 0.2971 + } + }, + { + "model_id": "kavonalds/BunderMaxx-0710", + "name": "BunderMaxx-0710", + "developer": "kavonalds", + "scores": { + "IFEval": 0.2701, + "BBH": 0.5566, + "MATH Level 5": 0.068, + "GPQA": 0.2802, + "MUSR": 0.3682, + "MMLU-PRO": 0.1449 + } + }, + { + "model_id": "kavonalds/BunderMaxx-1010", + "name": "BunderMaxx-1010", + "developer": "kavonalds", + "scores": { + "IFEval": 0.2981, + "BBH": 0.702, + "MATH Level 5": 0.105, + "GPQA": 0.2609, + "MUSR": 0.3484, + "MMLU-PRO": 0.1224 + } + }, + { + "model_id": "kavonalds/Lancer-1-1b-Instruct", + "name": "Lancer-1-1b-Instruct", + "developer": "kavonalds", + "scores": { + "IFEval": 0.5546, + "BBH": 0.3253, + "MATH Level 5": 0.0393, + "GPQA": 0.2617, + "MUSR": 0.3144, + "MMLU-PRO": 0.1568 + } + }, + { + "model_id": "kayfour/T3Q-Qwen2.5-7B-it-KOR-Safe", + "name": "T3Q-Qwen2.5-7B-it-KOR-Safe", + "developer": "kayfour", + "scores": { + "IFEval": 0.6081, + "BBH": 0.555, + "MATH Level 5": 0.3761, + "GPQA": 0.3213, + "MUSR": 0.4277, + "MMLU-PRO": 0.4464 + } + }, + { + "model_id": "keeeeenw/MicroLlama", + "name": "MicroLlama", + "developer": "keeeeenw", + "scores": { + "IFEval": 0.1985, + "BBH": 0.3007, + "MATH Level 5": 0.0113, + "GPQA": 0.2609, + "MUSR": 0.3698, + "MMLU-PRO": 0.1138 + } + }, + { + "model_id": "kekmodel/StopCarbon-10.7B-v5", + "name": "StopCarbon-10.7B-v5", + "developer": "kekmodel", + "scores": { + "IFEval": 0.4728, + "BBH": 0.5178, + "MATH Level 5": 0.0559, + "GPQA": 0.3062, + "MUSR": 0.4019, + "MMLU-PRO": 0.3157 + } + }, + { + "model_id": "kevin009/llamaRAGdrama", + "name": "llamaRAGdrama", + "developer": "kevin009", + "scores": { + "IFEval": 0.2598, + "BBH": 0.4007, + "MATH Level 5": 0.0431, + "GPQA": 0.2643, + "MUSR": 0.4316, + "MMLU-PRO": 0.2724 + } + }, + { + "model_id": "khoantap/cheap-moe-merge", + "name": "cheap-moe-merge", + "developer": "khoantap", + "scores": { + "IFEval": 0.4557, + "BBH": 0.5131, + "MATH Level 5": 0.0921, + "GPQA": 0.2953, + "MUSR": 0.4103, + "MMLU-PRO": 0.3339 + } + }, + { + "model_id": "khoantap/llama-3-8b-stock-merge", + "name": "llama-3-8b-stock-merge", + "developer": "khoantap", + "scores": { + "IFEval": 0.4812, + "BBH": 0.5162, + "MATH Level 5": 0.1616, + "GPQA": 0.318, + "MUSR": 0.3946, + "MMLU-PRO": 0.38 + } + }, + { + "model_id": "khoantap/llama-breadcrumbs-ties-merge", + "name": "llama-breadcrumbs-ties-merge", + "developer": "khoantap", + "scores": { + "IFEval": 0.2205, + "BBH": 0.5416, + "MATH Level 5": 0.1125, + "GPQA": 0.2659, + "MUSR": 0.4434, + "MMLU-PRO": 0.3172 + } + }, + { + "model_id": "khoantap/llama-evolve-ties-best-merge", + "name": "llama-evolve-ties-best-merge", + "developer": "khoantap", + "scores": { + "IFEval": 0.6744, + "BBH": 0.5414, + "MATH Level 5": 0.1563, + "GPQA": 0.3171, + "MUSR": 0.3946, + "MMLU-PRO": 0.386 + } + }, + { + "model_id": "khoantap/llama-linear-0.5-0.5-1-merge", + "name": "llama-linear-0.5-0.5-1-merge", + "developer": "khoantap", + "scores": { + "IFEval": 0.4812, + "BBH": 0.5643, + "MATH Level 5": 0.2054, + "GPQA": 0.307, + "MUSR": 0.4143, + "MMLU-PRO": 0.3833 + } + }, + { + "model_id": "khoantap/llama-linear-0.5-1-0.5-merge", + "name": "llama-linear-0.5-1-0.5-merge", + "developer": "khoantap", + "scores": { + "IFEval": 0.5032, + "BBH": 0.5951, + "MATH Level 5": 0.148, + "GPQA": 0.2936, + "MUSR": 0.4172, + "MMLU-PRO": 0.369 + } + }, + { + "model_id": "khoantap/llama-linear-1-0.5-0.5-merge", + "name": "llama-linear-1-0.5-0.5-merge", + "developer": "khoantap", + "scores": { + "IFEval": 0.4515, + "BBH": 0.5526, + "MATH Level 5": 0.2477, + "GPQA": 0.2928, + "MUSR": 0.4118, + "MMLU-PRO": 0.3635 + } + }, + { + "model_id": "khoantap/llama-slerp-merge", + "name": "llama-slerp-merge", + "developer": "khoantap", + "scores": { + "IFEval": 0.498, + "BBH": 0.5783, + "MATH Level 5": 0.0831, + "GPQA": 0.3029, + "MUSR": 0.4053, + "MMLU-PRO": 0.3678 + } + }, + { + "model_id": "khoantap/moe-out-merge", + "name": "moe-out-merge", + "developer": "khoantap", + "scores": { + "IFEval": 0.4505, + "BBH": 0.5151, + "MATH Level 5": 0.0929, + "GPQA": 0.2886, + "MUSR": 0.4063, + "MMLU-PRO": 0.3348 + } + }, + { + "model_id": "khulaifi95/Llama-3.1-8B-Reason-Blend-888k", + "name": "Llama-3.1-8B-Reason-Blend-888k", + "developer": "khulaifi95", + "scores": { + "IFEval": 0.5832, + "BBH": 0.479, + "MATH Level 5": 0.1156, + "GPQA": 0.2794, + "MUSR": 0.3379, + "MMLU-PRO": 0.31 + } + }, + { + "model_id": "kms7530/chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1", + "name": "chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1", + "developer": "kms7530", + "scores": { + "IFEval": 0.5455, + "BBH": 0.4289, + "MATH Level 5": 0.0619, + "GPQA": 0.2701, + "MUSR": 0.3821, + "MMLU-PRO": 0.2798 + } + }, + { + "model_id": "kms7530/chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath", + "name": "chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath", + "developer": "kms7530", + "scores": { + "IFEval": 0.4863, + "BBH": 0.4987, + "MATH Level 5": 0.108, + "GPQA": 0.3104, + "MUSR": 0.3983, + "MMLU-PRO": 0.3481 + } + }, + { + "model_id": "kms7530/chemeng_qwen-math-7b_24_1_100_1", + "name": "chemeng_qwen-math-7b_24_1_100_1", + "developer": "kms7530", + "scores": { + "IFEval": 0.2111, + "BBH": 0.3578, + "MATH Level 5": 0.2243, + "GPQA": 0.2441, + "MUSR": 0.3687, + "MMLU-PRO": 0.2158 + } + }, + { + "model_id": "kms7530/chemeng_qwen-math-7b_24_1_100_1_nonmath", + "name": "chemeng_qwen-math-7b_24_1_100_1_nonmath", + "developer": "kms7530", + "scores": { + "IFEval": 0.2584, + "BBH": 0.3893, + "MATH Level 5": 0.3097, + "GPQA": 0.2903, + "MUSR": 0.4087, + "MMLU-PRO": 0.2452 + } + }, + { + "model_id": "kno10/ende-chat-0.0.5", + "name": "ende-chat-0.0.5", + "developer": "kno10", + "scores": { + "IFEval": 0.3404, + "BBH": 0.3604, + "MATH Level 5": 0.0204, + "GPQA": 0.2651, + "MUSR": 0.3938, + "MMLU-PRO": 0.179 + } + }, + { + "model_id": "kno10/ende-chat-0.0.7", + "name": "ende-chat-0.0.7", + "developer": "kno10", + "scores": { + "IFEval": 0.4401, + "BBH": 0.3792, + "MATH Level 5": 0.0174, + "GPQA": 0.281, + "MUSR": 0.3861, + "MMLU-PRO": 0.1966 + } + }, + { + "model_id": "kyutai/helium-1-preview-2b", + "name": "helium-1-preview-2b", + "developer": "kyutai", + "scores": { + "IFEval": 0.2614, + "BBH": 0.3638, + "MATH Level 5": 0.0136, + "GPQA": 0.2785, + "MUSR": 0.355, + "MMLU-PRO": 0.1873 + } + }, + { + "model_id": "kz919/QwQ-0.5B-Distilled-SFT", + "name": "QwQ-0.5B-Distilled-SFT", + "developer": "kz919", + "scores": { + "IFEval": 0.3077, + "BBH": 0.3256, + "MATH Level 5": 0.074, + "GPQA": 0.2609, + "MUSR": 0.3409, + "MMLU-PRO": 0.1587 + } + }, + { + "model_id": "ladydaina/ECE-FDF", + "name": "ECE-FDF", + "developer": "ladydaina", + "scores": { + "IFEval": 0.3728, + "BBH": 0.515, + "MATH Level 5": 0.0816, + "GPQA": 0.2827, + "MUSR": 0.4504, + "MMLU-PRO": 0.3007 + } + }, + { + "model_id": "laislemke/LLaMA-2-vicuna-7b-slerp", + "name": "LLaMA-2-vicuna-7b-slerp", + "developer": "laislemke", + "scores": { + "IFEval": 0.2932, + "BBH": 0.2986, + "MATH Level 5": 0.0113, + "GPQA": 0.2735, + "MUSR": 0.3833, + "MMLU-PRO": 0.1342 + } + }, + { + "model_id": "lalainy/ECE-PRYMMAL-0.5B-FT-V5-MUSR", + "name": "ECE-PRYMMAL-0.5B-FT-V5-MUSR", + "developer": "lalainy", + "scores": { + "IFEval": 0.2138, + "BBH": 0.3269, + "MATH Level 5": 0.0453, + "GPQA": 0.2743, + "MUSR": 0.3262, + "MMLU-PRO": 0.1533 + } + }, + { + "model_id": "lalainy/ECE-PRYMMAL-0.5B-SLERP-V4", + "name": "ECE-PRYMMAL-0.5B-SLERP-V4", + "developer": "lalainy", + "scores": { + "IFEval": 0.1564, + "BBH": 0.2894, + "MATH Level 5": 0.0, + "GPQA": 0.2626, + "MUSR": 0.3789, + "MMLU-PRO": 0.1169 + } + }, + { + "model_id": "lalainy/ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1", + "name": "ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1", + "developer": "lalainy", + "scores": { + "IFEval": 0.1437, + "BBH": 0.3032, + "MATH Level 5": 0.0008, + "GPQA": 0.2349, + "MUSR": 0.3646, + "MMLU-PRO": 0.1121 + } + }, + { + "model_id": "lalainy/ECE-PRYMMAL-YL-1B-SLERP-V3", + "name": "ECE-PRYMMAL-YL-1B-SLERP-V3", + "developer": "lalainy", + "scores": { + "IFEval": 0.325, + "BBH": 0.4225, + "MATH Level 5": 0.0974, + "GPQA": 0.2945, + "MUSR": 0.4213, + "MMLU-PRO": 0.2931 + } + }, + { + "model_id": "lalainy/ECE-PRYMMAL-YL-1B-SLERP-V4", + "name": "ECE-PRYMMAL-YL-1B-SLERP-V4", + "developer": "lalainy", + "scores": { + "IFEval": 0.3324, + "BBH": 0.4171, + "MATH Level 5": 0.1005, + "GPQA": 0.2861, + "MUSR": 0.4306, + "MMLU-PRO": 0.2893 + } + }, + { + "model_id": "lalainy/ECE-PRYMMAL-YL-6B-SLERP-V1", + "name": "ECE-PRYMMAL-YL-6B-SLERP-V1", + "developer": "lalainy", + "scores": { + "IFEval": 0.3264, + "BBH": 0.4629, + "MATH Level 5": 0.1269, + "GPQA": 0.2886, + "MUSR": 0.4864, + "MMLU-PRO": 0.3214 + } + }, + { + "model_id": "lalainy/ECE-PRYMMAL-YL-6B-SLERP-V2", + "name": "ECE-PRYMMAL-YL-6B-SLERP-V2", + "developer": "lalainy", + "scores": { + "IFEval": 0.3249, + "BBH": 0.4629, + "MATH Level 5": 0.1269, + "GPQA": 0.2886, + "MUSR": 0.4864, + "MMLU-PRO": 0.3214 + } + }, + { + "model_id": "langgptai/Qwen-las-v0.1", + "name": "Qwen-las-v0.1", + "developer": "langgptai", + "scores": { + "IFEval": 0.3301, + "BBH": 0.3893, + "MATH Level 5": 0.037, + "GPQA": 0.2466, + "MUSR": 0.3701, + "MMLU-PRO": 0.2325 + } + }, + { + "model_id": "langgptai/qwen1.5-7b-chat-sa-v0.1", + "name": "qwen1.5-7b-chat-sa-v0.1", + "developer": "langgptai", + "scores": { + "IFEval": 0.4268, + "BBH": 0.4325, + "MATH Level 5": 0.0302, + "GPQA": 0.3121, + "MUSR": 0.3551, + "MMLU-PRO": 0.2993 + } + }, + { + "model_id": "lars1234/Mistral-Small-24B-Instruct-2501-writer", + "name": "Mistral-Small-24B-Instruct-2501-writer", + "developer": "lars1234", + "scores": { + "IFEval": 0.6565, + "BBH": 0.6733, + "MATH Level 5": 0.3557, + "GPQA": 0.3893, + "MUSR": 0.4645, + "MMLU-PRO": 0.5448 + } + }, + { + "model_id": "leafspark/Llama-3.1-8B-MultiReflection-Instruct", + "name": "Llama-3.1-8B-MultiReflection-Instruct", + "developer": "leafspark", + "scores": { + "IFEval": 0.7125, + "BBH": 0.5009, + "MATH Level 5": 0.1707, + "GPQA": 0.2928, + "MUSR": 0.3682, + "MMLU-PRO": 0.3724 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-9B", + "name": "Gemma-2-Ataraxy-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.3009, + "BBH": 0.5931, + "MATH Level 5": 0.0853, + "GPQA": 0.3347, + "MUSR": 0.4424, + "MMLU-PRO": 0.4226 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-Advanced-9B", + "name": "Gemma-2-Ataraxy-Advanced-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.5516, + "BBH": 0.5889, + "MATH Level 5": 0.1979, + "GPQA": 0.3356, + "MUSR": 0.3761, + "MMLU-PRO": 0.4244 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-Remix-9B", + "name": "Gemma-2-Ataraxy-Remix-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.7083, + "BBH": 0.5892, + "MATH Level 5": 0.2017, + "GPQA": 0.3389, + "MUSR": 0.4372, + "MMLU-PRO": 0.4239 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-v2-9B", + "name": "Gemma-2-Ataraxy-v2-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.2136, + "BBH": 0.5766, + "MATH Level 5": 0.0846, + "GPQA": 0.3423, + "MUSR": 0.3484, + "MMLU-PRO": 0.4221 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-v2a-9B", + "name": "Gemma-2-Ataraxy-v2a-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.1595, + "BBH": 0.5182, + "MATH Level 5": 0.0612, + "GPQA": 0.3398, + "MUSR": 0.3165, + "MMLU-PRO": 0.3515 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-v2f-9B", + "name": "Gemma-2-Ataraxy-v2f-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.3791, + "BBH": 0.5193, + "MATH Level 5": 0.1163, + "GPQA": 0.3389, + "MUSR": 0.3231, + "MMLU-PRO": 0.3503 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-v3-Advanced-9B", + "name": "Gemma-2-Ataraxy-v3-Advanced-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.6602, + "BBH": 0.5935, + "MATH Level 5": 0.1873, + "GPQA": 0.3364, + "MUSR": 0.445, + "MMLU-PRO": 0.4196 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-v3b-9B", + "name": "Gemma-2-Ataraxy-v3b-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.6809, + "BBH": 0.5908, + "MATH Level 5": 0.2153, + "GPQA": 0.3331, + "MUSR": 0.4489, + "MMLU-PRO": 0.4205 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-v3i-9B", + "name": "Gemma-2-Ataraxy-v3i-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.4203, + "BBH": 0.5626, + "MATH Level 5": 0.1533, + "GPQA": 0.328, + "MUSR": 0.3181, + "MMLU-PRO": 0.4166 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-v3j-9B", + "name": "Gemma-2-Ataraxy-v3j-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.4169, + "BBH": 0.5632, + "MATH Level 5": 0.1692, + "GPQA": 0.328, + "MUSR": 0.318, + "MMLU-PRO": 0.4134 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-v4-Advanced-9B", + "name": "Gemma-2-Ataraxy-v4-Advanced-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.7015, + "BBH": 0.6024, + "MATH Level 5": 0.2153, + "GPQA": 0.3389, + "MUSR": 0.4581, + "MMLU-PRO": 0.4367 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-v4a-Advanced-9B", + "name": "Gemma-2-Ataraxy-v4a-Advanced-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.7135, + "BBH": 0.5988, + "MATH Level 5": 0.2115, + "GPQA": 0.344, + "MUSR": 0.4489, + "MMLU-PRO": 0.4309 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-v4b-9B", + "name": "Gemma-2-Ataraxy-v4b-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.6878, + "BBH": 0.6039, + "MATH Level 5": 0.2334, + "GPQA": 0.3406, + "MUSR": 0.4555, + "MMLU-PRO": 0.4357 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-v4c-9B", + "name": "Gemma-2-Ataraxy-v4c-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.6945, + "BBH": 0.6084, + "MATH Level 5": 0.2266, + "GPQA": 0.3339, + "MUSR": 0.4528, + "MMLU-PRO": 0.4395 + } + }, + { + "model_id": "lemon07r/Gemma-2-Ataraxy-v4d-9B", + "name": "Gemma-2-Ataraxy-v4d-9B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.725, + "BBH": 0.6054, + "MATH Level 5": 0.2334, + "GPQA": 0.3473, + "MUSR": 0.4541, + "MMLU-PRO": 0.4346 + } + }, + { + "model_id": "lemon07r/Llama-3-RedMagic4-8B", + "name": "Llama-3-RedMagic4-8B", + "developer": "lemon07r", + "scores": { + "IFEval": 0.4864, + "BBH": 0.4256, + "MATH Level 5": 0.0899, + "GPQA": 0.2903, + "MUSR": 0.3766, + "MMLU-PRO": 0.3676 + } + }, + { + "model_id": "lemon07r/llama-3-NeuralMahou-8b", + "name": "llama-3-NeuralMahou-8b", + "developer": "lemon07r", + "scores": { + "IFEval": 0.4901, + "BBH": 0.4184, + "MATH Level 5": 0.102, + "GPQA": 0.2886, + "MUSR": 0.3873, + "MMLU-PRO": 0.369 + } + }, + { + "model_id": "lesubra/ECE-EIFFEL-3B", + "name": "ECE-EIFFEL-3B", + "developer": "lesubra", + "scores": { + "IFEval": 0.3469, + "BBH": 0.5102, + "MATH Level 5": 0.1216, + "GPQA": 0.3314, + "MUSR": 0.4362, + "MMLU-PRO": 0.3821 + } + }, + { + "model_id": "lesubra/ECE-EIFFEL-3Bv2", + "name": "ECE-EIFFEL-3Bv2", + "developer": "lesubra", + "scores": { + "IFEval": 0.3013, + "BBH": 0.5424, + "MATH Level 5": 0.1186, + "GPQA": 0.3356, + "MUSR": 0.4443, + "MMLU-PRO": 0.3999 + } + }, + { + "model_id": "lesubra/ECE-EIFFEL-3Bv3", + "name": "ECE-EIFFEL-3Bv3", + "developer": "lesubra", + "scores": { + "IFEval": 0.3786, + "BBH": 0.5469, + "MATH Level 5": 0.1669, + "GPQA": 0.3297, + "MUSR": 0.4675, + "MMLU-PRO": 0.3975 + } + }, + { + "model_id": "lesubra/ECE-PRYMMAL-3B-SLERP-V1", + "name": "ECE-PRYMMAL-3B-SLERP-V1", + "developer": "lesubra", + "scores": { + "IFEval": 0.2933, + "BBH": 0.5341, + "MATH Level 5": 0.1662, + "GPQA": 0.3171, + "MUSR": 0.4595, + "MMLU-PRO": 0.39 + } + }, + { + "model_id": "lesubra/ECE-PRYMMAL-3B-SLERP-V2", + "name": "ECE-PRYMMAL-3B-SLERP-V2", + "developer": "lesubra", + "scores": { + "IFEval": 0.2933, + "BBH": 0.5341, + "MATH Level 5": 0.1662, + "GPQA": 0.3171, + "MUSR": 0.4595, + "MMLU-PRO": 0.39 + } + }, + { + "model_id": "lesubra/ECE-PRYMMAL-3B-SLERP_2-V1", + "name": "ECE-PRYMMAL-3B-SLERP_2-V1", + "developer": "lesubra", + "scores": { + "IFEval": 0.3649, + "BBH": 0.5411, + "MATH Level 5": 0.1677, + "GPQA": 0.3213, + "MUSR": 0.4661, + "MMLU-PRO": 0.399 + } + }, + { + "model_id": "lesubra/ECE-PRYMMAL-3B-SLERP_2-V2", + "name": "ECE-PRYMMAL-3B-SLERP_2-V2", + "developer": "lesubra", + "scores": { + "IFEval": 0.3664, + "BBH": 0.5411, + "MATH Level 5": 0.1677, + "GPQA": 0.3213, + "MUSR": 0.4661, + "MMLU-PRO": 0.399 + } + }, + { + "model_id": "lesubra/merge-test", + "name": "merge-test", + "developer": "lesubra", + "scores": { + "IFEval": 0.5383, + "BBH": 0.524, + "MATH Level 5": 0.1208, + "GPQA": 0.3221, + "MUSR": 0.4419, + "MMLU-PRO": 0.3874 + } + }, + { + "model_id": "lightblue/suzume-llama-3-8B-multilingual", + "name": "suzume-llama-3-8B-multilingual", + "developer": "lightblue", + "scores": { + "IFEval": 0.6678, + "BBH": 0.495, + "MATH Level 5": 0.0944, + "GPQA": 0.2836, + "MUSR": 0.3977, + "MMLU-PRO": 0.3383 + } + }, + { + "model_id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-full", + "name": "suzume-llama-3-8B-multilingual-orpo-borda-full", + "developer": "lightblue", + "scores": { + "IFEval": 0.5817, + "BBH": 0.4714, + "MATH Level 5": 0.0763, + "GPQA": 0.2592, + "MUSR": 0.3222, + "MMLU-PRO": 0.331 + } + }, + { + "model_id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-half", + "name": "suzume-llama-3-8B-multilingual-orpo-borda-half", + "developer": "lightblue", + "scores": { + "IFEval": 0.6249, + "BBH": 0.4707, + "MATH Level 5": 0.0906, + "GPQA": 0.245, + "MUSR": 0.3516, + "MMLU-PRO": 0.3614 + } + }, + { + "model_id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top25", + "name": "suzume-llama-3-8B-multilingual-orpo-borda-top25", + "developer": "lightblue", + "scores": { + "IFEval": 0.6637, + "BBH": 0.4865, + "MATH Level 5": 0.1042, + "GPQA": 0.2727, + "MUSR": 0.3566, + "MMLU-PRO": 0.3684 + } + }, + { + "model_id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top75", + "name": "suzume-llama-3-8B-multilingual-orpo-borda-top75", + "developer": "lightblue", + "scores": { + "IFEval": 0.6687, + "BBH": 0.4833, + "MATH Level 5": 0.0785, + "GPQA": 0.2727, + "MUSR": 0.3817, + "MMLU-PRO": 0.3769 + } + }, + { + "model_id": "lkoenig/BBAI_145_", + "name": "BBAI_145_", + "developer": "lkoenig", + "scores": { + "IFEval": 0.445, + "BBH": 0.5567, + "MATH Level 5": 0.361, + "GPQA": 0.3163, + "MUSR": 0.4382, + "MMLU-PRO": 0.449 + } + }, + { + "model_id": "lkoenig/BBAI_200_Gemma", + "name": "BBAI_200_Gemma", + "developer": "lkoenig", + "scores": { + "IFEval": 0.0705, + "BBH": 0.3449, + "MATH Level 5": 0.0, + "GPQA": 0.2668, + "MUSR": 0.3631, + "MMLU-PRO": 0.1679 + } + }, + { + "model_id": "lkoenig/BBAI_212_QwenLawLo", + "name": "BBAI_212_QwenLawLo", + "developer": "lkoenig", + "scores": { + "IFEval": 0.4566, + "BBH": 0.5574, + "MATH Level 5": 0.3603, + "GPQA": 0.3163, + "MUSR": 0.437, + "MMLU-PRO": 0.4489 + } + }, + { + "model_id": "lkoenig/BBAI_212_Qwencore", + "name": "BBAI_212_Qwencore", + "developer": "lkoenig", + "scores": { + "IFEval": 0.4384, + "BBH": 0.5569, + "MATH Level 5": 0.3489, + "GPQA": 0.3163, + "MUSR": 0.4343, + "MMLU-PRO": 0.449 + } + }, + { + "model_id": "lkoenig/BBAI_230_Xiaqwen", + "name": "BBAI_230_Xiaqwen", + "developer": "lkoenig", + "scores": { + "IFEval": 0.4649, + "BBH": 0.5578, + "MATH Level 5": 0.3663, + "GPQA": 0.3138, + "MUSR": 0.4422, + "MMLU-PRO": 0.4481 + } + }, + { + "model_id": "lkoenig/BBAI_375_QwenDyancabs", + "name": "BBAI_375_QwenDyancabs", + "developer": "lkoenig", + "scores": { + "IFEval": 0.4566, + "BBH": 0.5571, + "MATH Level 5": 0.3776, + "GPQA": 0.3129, + "MUSR": 0.4462, + "MMLU-PRO": 0.4476 + } + }, + { + "model_id": "lkoenig/BBAI_456_QwenKoen", + "name": "BBAI_456_QwenKoen", + "developer": "lkoenig", + "scores": { + "IFEval": 0.4529, + "BBH": 0.5553, + "MATH Level 5": 0.3686, + "GPQA": 0.3129, + "MUSR": 0.4395, + "MMLU-PRO": 0.4469 + } + }, + { + "model_id": "lkoenig/BBAI_7B_KoenQwenDyan", + "name": "BBAI_7B_KoenQwenDyan", + "developer": "lkoenig", + "scores": { + "IFEval": 0.5807, + "BBH": 0.5537, + "MATH Level 5": 0.3739, + "GPQA": 0.318, + "MUSR": 0.4369, + "MMLU-PRO": 0.446 + } + }, + { + "model_id": "lkoenig/BBAI_7B_Qwen2.5koen", + "name": "BBAI_7B_Qwen2.5koen", + "developer": "lkoenig", + "scores": { + "IFEval": 0.46, + "BBH": 0.5544, + "MATH Level 5": 0.3656, + "GPQA": 0.3129, + "MUSR": 0.4369, + "MMLU-PRO": 0.4485 + } + }, + { + "model_id": "lkoenig/BBAI_7B_QwenDyanKoenLo", + "name": "BBAI_7B_QwenDyanKoenLo", + "developer": "lkoenig", + "scores": { + "IFEval": 0.4663, + "BBH": 0.5562, + "MATH Level 5": 0.364, + "GPQA": 0.3188, + "MUSR": 0.4343, + "MMLU-PRO": 0.4465 + } + }, + { + "model_id": "lkoenig/BBAI_7B_QwenDyancabsLAW", + "name": "BBAI_7B_QwenDyancabsLAW", + "developer": "lkoenig", + "scores": { + "IFEval": 0.555, + "BBH": 0.5579, + "MATH Level 5": 0.3678, + "GPQA": 0.3188, + "MUSR": 0.4461, + "MMLU-PRO": 0.4471 + } + }, + { + "model_id": "llmat/Mistral-v0.3-7B-ORPO", + "name": "Mistral-v0.3-7B-ORPO", + "developer": "llmat", + "scores": { + "IFEval": 0.364, + "BBH": 0.4005, + "MATH Level 5": 0.0015, + "GPQA": 0.2693, + "MUSR": 0.3529, + "MMLU-PRO": 0.2301 + } + }, + { + "model_id": "llnYou/ECE-PRYMMAL-YL-1B-SLERP-V5", + "name": "ECE-PRYMMAL-YL-1B-SLERP-V5", + "developer": "llnYou", + "scores": { + "IFEval": 0.3313, + "BBH": 0.4233, + "MATH Level 5": 0.111, + "GPQA": 0.2861, + "MUSR": 0.3868, + "MMLU-PRO": 0.2931 + } + }, + { + "model_id": "llnYou/ECE-PRYMMAL-YL-1B-SLERP-V6", + "name": "ECE-PRYMMAL-YL-1B-SLERP-V6", + "developer": "llnYou", + "scores": { + "IFEval": 0.1388, + "BBH": 0.3944, + "MATH Level 5": 0.0023, + "GPQA": 0.2903, + "MUSR": 0.3928, + "MMLU-PRO": 0.235 + } + }, + { + "model_id": "llnYou/ECE-PRYMMAL-YL-3B-SLERP-V1", + "name": "ECE-PRYMMAL-YL-3B-SLERP-V1", + "developer": "llnYou", + "scores": { + "IFEval": 0.2346, + "BBH": 0.4018, + "MATH Level 5": 0.0091, + "GPQA": 0.2936, + "MUSR": 0.3364, + "MMLU-PRO": 0.285 + } + }, + { + "model_id": "llnYou/ECE-PRYMMAL-YL-3B-SLERP-V2", + "name": "ECE-PRYMMAL-YL-3B-SLERP-V2", + "developer": "llnYou", + "scores": { + "IFEval": 0.2309, + "BBH": 0.399, + "MATH Level 5": 0.0128, + "GPQA": 0.2768, + "MUSR": 0.3588, + "MMLU-PRO": 0.29 + } + }, + { + "model_id": "llnYou/ECE-PRYMMAL-YL-3B-SLERP-V3", + "name": "ECE-PRYMMAL-YL-3B-SLERP-V3", + "developer": "llnYou", + "scores": { + "IFEval": 0.3581, + "BBH": 0.5473, + "MATH Level 5": 0.1299, + "GPQA": 0.3045, + "MUSR": 0.4361, + "MMLU-PRO": 0.4043 + } + }, + { + "model_id": "lmsys/vicuna-13b-v1.3", + "name": "vicuna-13b-v1.3", + "developer": "lmsys", + "scores": { + "IFEval": 0.3344, + "BBH": 0.3384, + "MATH Level 5": 0.0144, + "GPQA": 0.2676, + "MUSR": 0.3727, + "MMLU-PRO": 0.2243 + } + }, + { + "model_id": "lmsys/vicuna-7b-v1.3", + "name": "vicuna-7b-v1.3", + "developer": "lmsys", + "scores": { + "IFEval": 0.2909, + "BBH": 0.3298, + "MATH Level 5": 0.0128, + "GPQA": 0.2424, + "MUSR": 0.3793, + "MMLU-PRO": 0.1838 + } + }, + { + "model_id": "lmsys/vicuna-7b-v1.5", + "name": "vicuna-7b-v1.5", + "developer": "lmsys", + "scores": { + "IFEval": 0.2352, + "BBH": 0.3947, + "MATH Level 5": 0.0136, + "GPQA": 0.2584, + "MUSR": 0.4231, + "MMLU-PRO": 0.2147 + } + }, + { + "model_id": "lodrick-the-lafted/llama-3.1-8b-instruct-ortho-v7", + "name": "llama-3.1-8b-instruct-ortho-v7", + "developer": "lodrick-the-lafted", + "scores": { + "IFEval": 0.3515, + "BBH": 0.3907, + "MATH Level 5": 0.0272, + "GPQA": 0.2727, + "MUSR": 0.3616, + "MMLU-PRO": 0.1974 + } + }, + { + "model_id": "lordjia/Llama-3-Cantonese-8B-Instruct", + "name": "Llama-3-Cantonese-8B-Instruct", + "developer": "lordjia", + "scores": { + "IFEval": 0.6669, + "BBH": 0.4814, + "MATH Level 5": 0.0891, + "GPQA": 0.2936, + "MUSR": 0.4046, + "MMLU-PRO": 0.3515 + } + }, + { + "model_id": "lordjia/Qwen2-Cantonese-7B-Instruct", + "name": "Qwen2-Cantonese-7B-Instruct", + "developer": "lordjia", + "scores": { + "IFEval": 0.5435, + "BBH": 0.5215, + "MATH Level 5": 0.256, + "GPQA": 0.2953, + "MUSR": 0.4004, + "MMLU-PRO": 0.3843 + } + }, + { + "model_id": "lt-asset/nova-1.3b", + "name": "nova-1.3b", + "developer": "lt-asset", + "scores": { + "IFEval": 0.1214, + "BBH": 0.317, + "MATH Level 5": 0.0121, + "GPQA": 0.2492, + "MUSR": 0.3698, + "MMLU-PRO": 0.1142 + } + }, + { + "model_id": "lunahr/thea-3b-50r-u1", + "name": "thea-3b-50r-u1", + "developer": "lunahr", + "scores": { + "IFEval": 0.603, + "BBH": 0.4105, + "MATH Level 5": 0.1042, + "GPQA": 0.2836, + "MUSR": 0.3182, + "MMLU-PRO": 0.2808 + } + }, + { + "model_id": "lunahr/thea-v2-3b-50r", + "name": "thea-v2-3b-50r", + "developer": "lunahr", + "scores": { + "IFEval": 0.3704, + "BBH": 0.4194, + "MATH Level 5": 0.0242, + "GPQA": 0.2609, + "MUSR": 0.3222, + "MMLU-PRO": 0.2409 + } + }, + { + "model_id": "m42-health/Llama3-Med42-70B", + "name": "Llama3-Med42-70B", + "developer": "m42-health", + "scores": { + "IFEval": 0.6291, + "BBH": 0.6688, + "MATH Level 5": 0.2258, + "GPQA": 0.3473, + "MUSR": 0.4629, + "MMLU-PRO": 0.4963 + } + }, + { + "model_id": "macadeliccc/Samantha-Qwen-2-7B", + "name": "Samantha-Qwen-2-7B", + "developer": "macadeliccc", + "scores": { + "IFEval": 0.4377, + "BBH": 0.5082, + "MATH Level 5": 0.2115, + "GPQA": 0.2727, + "MUSR": 0.4799, + "MMLU-PRO": 0.3779 + } + }, + { + "model_id": "macadeliccc/magistrate-3.2-3b-base", + "name": "magistrate-3.2-3b-base", + "developer": "macadeliccc", + "scores": { + "IFEval": 0.1159, + "BBH": 0.3343, + "MATH Level 5": 0.0113, + "GPQA": 0.2609, + "MUSR": 0.3976, + "MMLU-PRO": 0.1689 + } + }, + { + "model_id": "macadeliccc/magistrate-3.2-3b-it", + "name": "magistrate-3.2-3b-it", + "developer": "macadeliccc", + "scores": { + "IFEval": 0.2292, + "BBH": 0.3257, + "MATH Level 5": 0.0196, + "GPQA": 0.2475, + "MUSR": 0.3763, + "MMLU-PRO": 0.1592 + } + }, + { + "model_id": "magnifi/Phi3_intent_v56_3_w_unknown_5_lr_0.002", + "name": "Phi3_intent_v56_3_w_unknown_5_lr_0.002", + "developer": "magnifi", + "scores": { + "IFEval": 0.2018, + "BBH": 0.3282, + "MATH Level 5": 0.0, + "GPQA": 0.2643, + "MUSR": 0.4123, + "MMLU-PRO": 0.1472 + } + }, + { + "model_id": "maldv/Awqward2.5-32B-Instruct", + "name": "Awqward2.5-32B-Instruct", + "developer": "maldv", + "scores": { + "IFEval": 0.8255, + "BBH": 0.6974, + "MATH Level 5": 0.6231, + "GPQA": 0.3406, + "MUSR": 0.4275, + "MMLU-PRO": 0.5723 + } + }, + { + "model_id": "maldv/Lytta2.5-32B-Instruct", + "name": "Lytta2.5-32B-Instruct", + "developer": "maldv", + "scores": { + "IFEval": 0.2508, + "BBH": 0.56, + "MATH Level 5": 0.3444, + "GPQA": 0.2668, + "MUSR": 0.3769, + "MMLU-PRO": 0.5048 + } + }, + { + "model_id": "maldv/Qwentile2.5-32B-Instruct", + "name": "Qwentile2.5-32B-Instruct", + "developer": "maldv", + "scores": { + "IFEval": 0.7393, + "BBH": 0.6963, + "MATH Level 5": 0.5219, + "GPQA": 0.3842, + "MUSR": 0.4682, + "MMLU-PRO": 0.5879 + } + }, + { + "model_id": "maldv/badger-kappa-llama-3-8b", + "name": "badger-kappa-llama-3-8b", + "developer": "maldv", + "scores": { + "IFEval": 0.4695, + "BBH": 0.5085, + "MATH Level 5": 0.0861, + "GPQA": 0.3029, + "MUSR": 0.3765, + "MMLU-PRO": 0.3695 + } + }, + { + "model_id": "maldv/badger-lambda-llama-3-8b", + "name": "badger-lambda-llama-3-8b", + "developer": "maldv", + "scores": { + "IFEval": 0.4861, + "BBH": 0.4963, + "MATH Level 5": 0.0944, + "GPQA": 0.2819, + "MUSR": 0.3754, + "MMLU-PRO": 0.3767 + } + }, + { + "model_id": "maldv/badger-mu-llama-3-8b", + "name": "badger-mu-llama-3-8b", + "developer": "maldv", + "scores": { + "IFEval": 0.4919, + "BBH": 0.5143, + "MATH Level 5": 0.0559, + "GPQA": 0.2592, + "MUSR": 0.3555, + "MMLU-PRO": 0.3674 + } + }, + { + "model_id": "maldv/badger-writer-llama-3-8b", + "name": "badger-writer-llama-3-8b", + "developer": "maldv", + "scores": { + "IFEval": 0.5303, + "BBH": 0.4864, + "MATH Level 5": 0.0755, + "GPQA": 0.2894, + "MUSR": 0.3581, + "MMLU-PRO": 0.376 + } + }, + { + "model_id": "marcuscedricridia/Cheng-1", + "name": "Cheng-1", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7789, + "BBH": 0.5525, + "MATH Level 5": 0.4894, + "GPQA": 0.2961, + "MUSR": 0.4073, + "MMLU-PRO": 0.4349 + } + }, + { + "model_id": "marcuscedricridia/Cheng-2", + "name": "Cheng-2", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.8337, + "BBH": 0.6499, + "MATH Level 5": 0.5438, + "GPQA": 0.3456, + "MUSR": 0.4193, + "MMLU-PRO": 0.5013 + } + }, + { + "model_id": "marcuscedricridia/Cheng-2-v1.1", + "name": "Cheng-2-v1.1", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.827, + "BBH": 0.651, + "MATH Level 5": 0.5393, + "GPQA": 0.3431, + "MUSR": 0.4167, + "MMLU-PRO": 0.5076 + } + }, + { + "model_id": "marcuscedricridia/Hush-Qwen2.5-7B-MST", + "name": "Hush-Qwen2.5-7B-MST", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7488, + "BBH": 0.5458, + "MATH Level 5": 0.4245, + "GPQA": 0.3037, + "MUSR": 0.3914, + "MMLU-PRO": 0.4163 + } + }, + { + "model_id": "marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.1", + "name": "Hush-Qwen2.5-7B-MST-v1.1", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7445, + "BBH": 0.5559, + "MATH Level 5": 0.4653, + "GPQA": 0.3062, + "MUSR": 0.4073, + "MMLU-PRO": 0.4299 + } + }, + { + "model_id": "marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.3", + "name": "Hush-Qwen2.5-7B-MST-v1.3", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7043, + "BBH": 0.5516, + "MATH Level 5": 0.4758, + "GPQA": 0.3146, + "MUSR": 0.4311, + "MMLU-PRO": 0.444 + } + }, + { + "model_id": "marcuscedricridia/Hush-Qwen2.5-7B-Preview", + "name": "Hush-Qwen2.5-7B-Preview", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7962, + "BBH": 0.5431, + "MATH Level 5": 0.3754, + "GPQA": 0.3112, + "MUSR": 0.4298, + "MMLU-PRO": 0.4364 + } + }, + { + "model_id": "marcuscedricridia/Hush-Qwen2.5-7B-RP-v1.4-1M", + "name": "Hush-Qwen2.5-7B-RP-v1.4-1M", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7728, + "BBH": 0.5295, + "MATH Level 5": 0.3369, + "GPQA": 0.2987, + "MUSR": 0.4433, + "MMLU-PRO": 0.4135 + } + }, + { + "model_id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.1", + "name": "Hush-Qwen2.5-7B-v1.1", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7889, + "BBH": 0.5384, + "MATH Level 5": 0.4381, + "GPQA": 0.3163, + "MUSR": 0.4179, + "MMLU-PRO": 0.4227 + } + }, + { + "model_id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.2", + "name": "Hush-Qwen2.5-7B-v1.2", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7865, + "BBH": 0.5403, + "MATH Level 5": 0.4403, + "GPQA": 0.3146, + "MUSR": 0.4219, + "MMLU-PRO": 0.4197 + } + }, + { + "model_id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.3", + "name": "Hush-Qwen2.5-7B-v1.3", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7856, + "BBH": 0.5327, + "MATH Level 5": 0.3323, + "GPQA": 0.3121, + "MUSR": 0.4246, + "MMLU-PRO": 0.4345 + } + }, + { + "model_id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.4", + "name": "Hush-Qwen2.5-7B-v1.4", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7835, + "BBH": 0.5423, + "MATH Level 5": 0.426, + "GPQA": 0.3112, + "MUSR": 0.4232, + "MMLU-PRO": 0.4195 + } + }, + { + "model_id": "marcuscedricridia/Qwen2.5-7B-Preview", + "name": "Qwen2.5-7B-Preview", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7679, + "BBH": 0.536, + "MATH Level 5": 0.3444, + "GPQA": 0.3238, + "MUSR": 0.414, + "MMLU-PRO": 0.4258 + } + }, + { + "model_id": "marcuscedricridia/Yell-Qwen2.5-7B-Preview", + "name": "Yell-Qwen2.5-7B-Preview", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.5839, + "BBH": 0.5371, + "MATH Level 5": 0.1926, + "GPQA": 0.281, + "MUSR": 0.4046, + "MMLU-PRO": 0.3798 + } + }, + { + "model_id": "marcuscedricridia/Yell-Qwen2.5-7B-Preview-v1.1", + "name": "Yell-Qwen2.5-7B-Preview-v1.1", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.5757, + "BBH": 0.5348, + "MATH Level 5": 0.1896, + "GPQA": 0.2861, + "MUSR": 0.4059, + "MMLU-PRO": 0.3831 + } + }, + { + "model_id": "marcuscedricridia/absolute-o1-7b", + "name": "absolute-o1-7b", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7516, + "BBH": 0.5469, + "MATH Level 5": 0.5083, + "GPQA": 0.3196, + "MUSR": 0.4114, + "MMLU-PRO": 0.4413 + } + }, + { + "model_id": "marcuscedricridia/cursa-o1-7b", + "name": "cursa-o1-7b", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7628, + "BBH": 0.5466, + "MATH Level 5": 0.4955, + "GPQA": 0.307, + "MUSR": 0.4301, + "MMLU-PRO": 0.4392 + } + }, + { + "model_id": "marcuscedricridia/cursa-o1-7b-2-28-2025", + "name": "cursa-o1-7b-2-28-2025", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7467, + "BBH": 0.5384, + "MATH Level 5": 0.4811, + "GPQA": 0.307, + "MUSR": 0.4273, + "MMLU-PRO": 0.4365 + } + }, + { + "model_id": "marcuscedricridia/cursa-o1-7b-v1.1", + "name": "cursa-o1-7b-v1.1", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7528, + "BBH": 0.5493, + "MATH Level 5": 0.4985, + "GPQA": 0.307, + "MUSR": 0.4259, + "MMLU-PRO": 0.4392 + } + }, + { + "model_id": "marcuscedricridia/cursa-o1-7b-v1.2-normalize-false", + "name": "cursa-o1-7b-v1.2-normalize-false", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7616, + "BBH": 0.5492, + "MATH Level 5": 0.4992, + "GPQA": 0.307, + "MUSR": 0.4273, + "MMLU-PRO": 0.4436 + } + }, + { + "model_id": "marcuscedricridia/cursor-o1-7b", + "name": "cursor-o1-7b", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.4107, + "BBH": 0.5007, + "MATH Level 5": 0.1412, + "GPQA": 0.281, + "MUSR": 0.4101, + "MMLU-PRO": 0.3251 + } + }, + { + "model_id": "marcuscedricridia/cursorr-o1.2-7b", + "name": "cursorr-o1.2-7b", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.166, + "BBH": 0.3068, + "MATH Level 5": 0.0, + "GPQA": 0.2542, + "MUSR": 0.3538, + "MMLU-PRO": 0.108 + } + }, + { + "model_id": "marcuscedricridia/etr1o-explicit-v1.1", + "name": "etr1o-explicit-v1.1", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.288, + "BBH": 0.3132, + "MATH Level 5": 0.0045, + "GPQA": 0.2777, + "MUSR": 0.4111, + "MMLU-PRO": 0.1195 + } + }, + { + "model_id": "marcuscedricridia/etr1o-explicit-v1.2", + "name": "etr1o-explicit-v1.2", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.1504, + "BBH": 0.295, + "MATH Level 5": 0.0, + "GPQA": 0.2609, + "MUSR": 0.4031, + "MMLU-PRO": 0.1126 + } + }, + { + "model_id": "marcuscedricridia/etr1o-v1.1", + "name": "etr1o-v1.1", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.1597, + "BBH": 0.31, + "MATH Level 5": 0.0, + "GPQA": 0.2567, + "MUSR": 0.4017, + "MMLU-PRO": 0.1157 + } + }, + { + "model_id": "marcuscedricridia/etr1o-v1.2", + "name": "etr1o-v1.2", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7287, + "BBH": 0.6349, + "MATH Level 5": 0.3588, + "GPQA": 0.3758, + "MUSR": 0.4714, + "MMLU-PRO": 0.5316 + } + }, + { + "model_id": "marcuscedricridia/fan-o1-7b", + "name": "fan-o1-7b", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.4456, + "BBH": 0.4849, + "MATH Level 5": 0.1616, + "GPQA": 0.2844, + "MUSR": 0.3834, + "MMLU-PRO": 0.3274 + } + }, + { + "model_id": "marcuscedricridia/olmner-7b", + "name": "olmner-7b", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7254, + "BBH": 0.5472, + "MATH Level 5": 0.463, + "GPQA": 0.3079, + "MUSR": 0.438, + "MMLU-PRO": 0.4309 + } + }, + { + "model_id": "marcuscedricridia/olmner-della-7b", + "name": "olmner-della-7b", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7637, + "BBH": 0.5491, + "MATH Level 5": 0.4962, + "GPQA": 0.3012, + "MUSR": 0.4208, + "MMLU-PRO": 0.4386 + } + }, + { + "model_id": "marcuscedricridia/olmner-o1-7b", + "name": "olmner-o1-7b", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7528, + "BBH": 0.5481, + "MATH Level 5": 0.4924, + "GPQA": 0.3012, + "MUSR": 0.4299, + "MMLU-PRO": 0.4386 + } + }, + { + "model_id": "marcuscedricridia/olmner-sbr-7b", + "name": "olmner-sbr-7b", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.76, + "BBH": 0.5462, + "MATH Level 5": 0.4947, + "GPQA": 0.3087, + "MUSR": 0.4154, + "MMLU-PRO": 0.4412 + } + }, + { + "model_id": "marcuscedricridia/post-cursa-o1", + "name": "post-cursa-o1", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7628, + "BBH": 0.548, + "MATH Level 5": 0.4872, + "GPQA": 0.3096, + "MUSR": 0.4351, + "MMLU-PRO": 0.4361 + } + }, + { + "model_id": "marcuscedricridia/pre-cursa-o1", + "name": "pre-cursa-o1", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7409, + "BBH": 0.5462, + "MATH Level 5": 0.5038, + "GPQA": 0.3096, + "MUSR": 0.426, + "MMLU-PRO": 0.4424 + } + }, + { + "model_id": "marcuscedricridia/pre-cursa-o1-v1.2", + "name": "pre-cursa-o1-v1.2", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7549, + "BBH": 0.5487, + "MATH Level 5": 0.5068, + "GPQA": 0.3129, + "MUSR": 0.4272, + "MMLU-PRO": 0.4402 + } + }, + { + "model_id": "marcuscedricridia/pre-cursa-o1-v1.3", + "name": "pre-cursa-o1-v1.3", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7507, + "BBH": 0.5455, + "MATH Level 5": 0.5076, + "GPQA": 0.3129, + "MUSR": 0.4271, + "MMLU-PRO": 0.442 + } + }, + { + "model_id": "marcuscedricridia/pre-cursa-o1-v1.4", + "name": "pre-cursa-o1-v1.4", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7488, + "BBH": 0.5493, + "MATH Level 5": 0.4834, + "GPQA": 0.3054, + "MUSR": 0.4285, + "MMLU-PRO": 0.4436 + } + }, + { + "model_id": "marcuscedricridia/pre-cursa-o1-v1.6", + "name": "pre-cursa-o1-v1.6", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7528, + "BBH": 0.5473, + "MATH Level 5": 0.5, + "GPQA": 0.3205, + "MUSR": 0.4234, + "MMLU-PRO": 0.4413 + } + }, + { + "model_id": "marcuscedricridia/r1o-et", + "name": "r1o-et", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.3597, + "BBH": 0.4209, + "MATH Level 5": 0.0793, + "GPQA": 0.2727, + "MUSR": 0.3579, + "MMLU-PRO": 0.258 + } + }, + { + "model_id": "marcuscedricridia/sbr-o1-7b", + "name": "sbr-o1-7b", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.7455, + "BBH": 0.5479, + "MATH Level 5": 0.4985, + "GPQA": 0.3104, + "MUSR": 0.4404, + "MMLU-PRO": 0.4355 + } + }, + { + "model_id": "marcuscedricridia/stray-r1o-et", + "name": "stray-r1o-et", + "developer": "marcuscedricridia", + "scores": { + "IFEval": 0.1562, + "BBH": 0.2967, + "MATH Level 5": 0.0045, + "GPQA": 0.2617, + "MUSR": 0.4086, + "MMLU-PRO": 0.1094 + } + }, + { + "model_id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3", + "name": "ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3", + "developer": "matouLeLoup", + "scores": { + "IFEval": 0.1873, + "BBH": 0.3239, + "MATH Level 5": 0.0264, + "GPQA": 0.2609, + "MUSR": 0.3752, + "MMLU-PRO": 0.172 + } + }, + { + "model_id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis", + "name": "ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis", + "developer": "matouLeLoup", + "scores": { + "IFEval": 0.1873, + "BBH": 0.3239, + "MATH Level 5": 0.0264, + "GPQA": 0.2609, + "MUSR": 0.3752, + "MMLU-PRO": 0.172 + } + }, + { + "model_id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis", + "name": "ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis", + "developer": "matouLeLoup", + "scores": { + "IFEval": 0.1873, + "BBH": 0.3239, + "MATH Level 5": 0.0264, + "GPQA": 0.2609, + "MUSR": 0.3752, + "MMLU-PRO": 0.172 + } + }, + { + "model_id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis", + "name": "ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis", + "developer": "matouLeLoup", + "scores": { + "IFEval": 0.1882, + "BBH": 0.3233, + "MATH Level 5": 0.0272, + "GPQA": 0.2634, + "MUSR": 0.3685, + "MMLU-PRO": 0.172 + } + }, + { + "model_id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", + "name": "ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", + "developer": "matouLeLoup", + "scores": { + "IFEval": 0.1652, + "BBH": 0.3024, + "MATH Level 5": 0.0189, + "GPQA": 0.2567, + "MUSR": 0.4273, + "MMLU-PRO": 0.1116 + } + }, + { + "model_id": "mattshumer/Reflection-Llama-3.1-70B", + "name": "Reflection-Llama-3.1-70B", + "developer": "mattshumer", + "scores": { + "IFEval": 0.0045, + "BBH": 0.645, + "MATH Level 5": 0.2145, + "GPQA": 0.3633, + "MUSR": 0.4577, + "MMLU-PRO": 0.4955 + } + }, + { + "model_id": "mattshumer/ref_70_e3", + "name": "ref_70_e3", + "developer": "mattshumer", + "scores": { + "IFEval": 0.6294, + "BBH": 0.6501, + "MATH Level 5": 0.2795, + "GPQA": 0.3356, + "MUSR": 0.4328, + "MMLU-PRO": 0.5303 + } + }, + { + "model_id": "maywell/Qwen2-7B-Multilingual-RP", + "name": "Qwen2-7B-Multilingual-RP", + "developer": "maywell", + "scores": { + "IFEval": 0.4347, + "BBH": 0.5062, + "MATH Level 5": 0.2243, + "GPQA": 0.297, + "MUSR": 0.3696, + "MMLU-PRO": 0.3859 + } + }, + { + "model_id": "meditsolutions/Llama-3.1-MedIT-SUN-8B", + "name": "Llama-3.1-MedIT-SUN-8B", + "developer": "meditsolutions", + "scores": { + "IFEval": 0.7837, + "BBH": 0.5187, + "MATH Level 5": 0.2092, + "GPQA": 0.3087, + "MUSR": 0.4056, + "MMLU-PRO": 0.3916 + } + }, + { + "model_id": "meditsolutions/Llama-3.2-SUN-1B-Instruct", + "name": "Llama-3.2-SUN-1B-Instruct", + "developer": "meditsolutions", + "scores": { + "IFEval": 0.6413, + "BBH": 0.3474, + "MATH Level 5": 0.071, + "GPQA": 0.2424, + "MUSR": 0.3514, + "MMLU-PRO": 0.1781 + } + }, + { + "model_id": "meditsolutions/Llama-3.2-SUN-1B-chat", + "name": "Llama-3.2-SUN-1B-chat", + "developer": "meditsolutions", + "scores": { + "IFEval": 0.5482, + "BBH": 0.3514, + "MATH Level 5": 0.0642, + "GPQA": 0.2617, + "MUSR": 0.3249, + "MMLU-PRO": 0.1838 + } + }, + { + "model_id": "meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-26000", + "name": "Llama-3.2-SUN-2.4B-checkpoint-26000", + "developer": "meditsolutions", + "scores": { + "IFEval": 0.2814, + "BBH": 0.3018, + "MATH Level 5": 0.0181, + "GPQA": 0.2777, + "MUSR": 0.4103, + "MMLU-PRO": 0.1345 + } + }, + { + "model_id": "meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-34800", + "name": "Llama-3.2-SUN-2.4B-checkpoint-34800", + "developer": "meditsolutions", + "scores": { + "IFEval": 0.2501, + "BBH": 0.3161, + "MATH Level 5": 0.0106, + "GPQA": 0.2861, + "MUSR": 0.4022, + "MMLU-PRO": 0.1357 + } + }, + { + "model_id": "meditsolutions/Llama-3.2-SUN-2.4B-v1.0.0", + "name": "Llama-3.2-SUN-2.4B-v1.0.0", + "developer": "meditsolutions", + "scores": { + "IFEval": 0.5637, + "BBH": 0.3391, + "MATH Level 5": 0.0627, + "GPQA": 0.2576, + "MUSR": 0.3209, + "MMLU-PRO": 0.1543 + } + }, + { + "model_id": "meditsolutions/Llama-3.2-SUN-2.5B-chat", + "name": "Llama-3.2-SUN-2.5B-chat", + "developer": "meditsolutions", + "scores": { + "IFEval": 0.5604, + "BBH": 0.3575, + "MATH Level 5": 0.071, + "GPQA": 0.2592, + "MUSR": 0.3155, + "MMLU-PRO": 0.1813 + } + }, + { + "model_id": "meditsolutions/Llama-3.2-SUN-HDIC-1B-Instruct", + "name": "Llama-3.2-SUN-HDIC-1B-Instruct", + "developer": "meditsolutions", + "scores": { + "IFEval": 0.6827, + "BBH": 0.3508, + "MATH Level 5": 0.0619, + "GPQA": 0.2366, + "MUSR": 0.3594, + "MMLU-PRO": 0.1687 + } + }, + { + "model_id": "meditsolutions/MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune", + "name": "MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune", + "developer": "meditsolutions", + "scores": { + "IFEval": 0.3655, + "BBH": 0.4035, + "MATH Level 5": 0.0264, + "GPQA": 0.3029, + "MUSR": 0.4253, + "MMLU-PRO": 0.219 + } + }, + { + "model_id": "meditsolutions/MSH-v1-Bielik-v2.3-Instruct-MedIT-merge", + "name": "MSH-v1-Bielik-v2.3-Instruct-MedIT-merge", + "developer": "meditsolutions", + "scores": { + "IFEval": 0.5814, + "BBH": 0.5672, + "MATH Level 5": 0.2077, + "GPQA": 0.3456, + "MUSR": 0.4385, + "MMLU-PRO": 0.35 + } + }, + { + "model_id": "meditsolutions/MedIT-Mesh-3B-Instruct", + "name": "MedIT-Mesh-3B-Instruct", + "developer": "meditsolutions", + "scores": { + "IFEval": 0.5814, + "BBH": 0.5576, + "MATH Level 5": 0.2032, + "GPQA": 0.3238, + "MUSR": 0.4048, + "MMLU-PRO": 0.4012 + } + }, + { + "model_id": "meditsolutions/SmolLM2-MedIT-Upscale-2B", + "name": "SmolLM2-MedIT-Upscale-2B", + "developer": "meditsolutions", + "scores": { + "IFEval": 0.6429, + "BBH": 0.3551, + "MATH Level 5": 0.0559, + "GPQA": 0.2643, + "MUSR": 0.3314, + "MMLU-PRO": 0.1971 + } + }, + { + "model_id": "meetkai/functionary-small-v3.1", + "name": "functionary-small-v3.1", + "developer": "meetkai", + "scores": { + "IFEval": 0.6275, + "BBH": 0.4982, + "MATH Level 5": 0.1571, + "GPQA": 0.2886, + "MUSR": 0.3834, + "MMLU-PRO": 0.3349 + } + }, + { + "model_id": "meraGPT/mera-mix-4x7B", + "name": "mera-mix-4x7B", + "developer": "meraGPT", + "scores": { + "IFEval": 0.4832, + "BBH": 0.4019, + "MATH Level 5": 0.0536, + "GPQA": 0.3045, + "MUSR": 0.4057, + "MMLU-PRO": 0.2748 + } + }, + { + "model_id": "mergekit-community/JAJUKA-WEWILLNEVERFORGETYOU-3B", + "name": "JAJUKA-WEWILLNEVERFORGETYOU-3B", + "developer": "mergekit-community", + "scores": { + "IFEval": 0.4941, + "BBH": 0.437, + "MATH Level 5": 0.1246, + "GPQA": 0.2928, + "MUSR": 0.3656, + "MMLU-PRO": 0.3033 + } + }, + { + "model_id": "mergekit-community/SuperQwen-2.5-1.5B", + "name": "SuperQwen-2.5-1.5B", + "developer": "mergekit-community", + "scores": { + "IFEval": 0.1336, + "BBH": 0.2907, + "MATH Level 5": 0.0196, + "GPQA": 0.2542, + "MUSR": 0.3355, + "MMLU-PRO": 0.1075 + } + }, + { + "model_id": "mergekit-community/VirtuosoSmall-InstructModelStock", + "name": "VirtuosoSmall-InstructModelStock", + "developer": "mergekit-community", + "scores": { + "IFEval": 0.5238, + "BBH": 0.6518, + "MATH Level 5": 0.4094, + "GPQA": 0.3826, + "MUSR": 0.4756, + "MMLU-PRO": 0.5421 + } + }, + { + "model_id": "mergekit-community/diabolic6045_ELN-AOC-CAIN", + "name": "diabolic6045_ELN-AOC-CAIN", + "developer": "mergekit-community", + "scores": { + "IFEval": 0.0862, + "BBH": 0.3126, + "MATH Level 5": 0.0121, + "GPQA": 0.2634, + "MUSR": 0.3658, + "MMLU-PRO": 0.1191 + } + }, + { + "model_id": "mergekit-community/mergekit-dare_ties-ajgjgea", + "name": "mergekit-dare_ties-ajgjgea", + "developer": "mergekit-community", + "scores": { + "IFEval": 0.5263, + "BBH": 0.3495, + "MATH Level 5": 0.0642, + "GPQA": 0.2643, + "MUSR": 0.3289, + "MMLU-PRO": 0.1744 + } + }, + { + "model_id": "mergekit-community/mergekit-della-zgowfmf", + "name": "mergekit-della-zgowfmf", + "developer": "mergekit-community", + "scores": { + "IFEval": 0.4828, + "BBH": 0.6591, + "MATH Level 5": 0.3618, + "GPQA": 0.3901, + "MUSR": 0.4834, + "MMLU-PRO": 0.5415 + } + }, + { + "model_id": "mergekit-community/mergekit-model_stock-azgztvm", + "name": "mergekit-model_stock-azgztvm", + "developer": "mergekit-community", + "scores": { + "IFEval": 0.5062, + "BBH": 0.6543, + "MATH Level 5": 0.4373, + "GPQA": 0.3817, + "MUSR": 0.473, + "MMLU-PRO": 0.5406 + } + }, + { + "model_id": "mergekit-community/mergekit-slerp-fmrazcr", + "name": "mergekit-slerp-fmrazcr", + "developer": "mergekit-community", + "scores": { + "IFEval": 0.4174, + "BBH": 0.5342, + "MATH Level 5": 0.1193, + "GPQA": 0.3112, + "MUSR": 0.4105, + "MMLU-PRO": 0.3777 + } + }, + { + "model_id": "mergekit-community/mergekit-ties-rraxdhv", + "name": "mergekit-ties-rraxdhv", + "developer": "mergekit-community", + "scores": { + "IFEval": 0.1123, + "BBH": 0.5184, + "MATH Level 5": 0.04, + "GPQA": 0.3079, + "MUSR": 0.4202, + "MMLU-PRO": 0.391 + } + }, + { + "model_id": "mergekit-community/mergekit-ties-ykqemwr", + "name": "mergekit-ties-ykqemwr", + "developer": "mergekit-community", + "scores": { + "IFEval": 0.36, + "BBH": 0.5455, + "MATH Level 5": 0.1224, + "GPQA": 0.3221, + "MUSR": 0.4198, + "MMLU-PRO": 0.3734 + } + }, + { + "model_id": "mergekit-community/sexeh_time_testing", + "name": "sexeh_time_testing", + "developer": "mergekit-community", + "scores": { + "IFEval": 0.7329, + "BBH": 0.5241, + "MATH Level 5": 0.0899, + "GPQA": 0.2911, + "MUSR": 0.3619, + "MMLU-PRO": 0.3667 + } + }, + { + "model_id": "meta-llama/Llama-2-13b-chat-hf", + "name": "Llama-2-13b-chat-hf", + "developer": "meta-llama", + "scores": { + "IFEval": 0.3985, + "BBH": 0.3343, + "MATH Level 5": 0.0136, + "GPQA": 0.2315, + "MUSR": 0.4007, + "MMLU-PRO": 0.1923 + } + }, + { + "model_id": "meta-llama/Llama-2-13b-hf", + "name": "Llama-2-13b-hf", + "developer": "meta-llama", + "scores": { + "IFEval": 0.2482, + "BBH": 0.4126, + "MATH Level 5": 0.0151, + "GPQA": 0.281, + "MUSR": 0.3538, + "MMLU-PRO": 0.2378 + } + }, + { + "model_id": "meta-llama/Llama-2-70b-chat-hf", + "name": "Llama-2-70b-chat-hf", + "developer": "meta-llama", + "scores": { + "IFEval": 0.4958, + "BBH": 0.3042, + "MATH Level 5": 0.0295, + "GPQA": 0.2643, + "MUSR": 0.3687, + "MMLU-PRO": 0.2433 + } + }, + { + "model_id": "meta-llama/Llama-2-70b-hf", + "name": "Llama-2-70b-hf", + "developer": "meta-llama", + "scores": { + "IFEval": 0.2407, + "BBH": 0.5473, + "MATH Level 5": 0.0325, + "GPQA": 0.3029, + "MUSR": 0.4124, + "MMLU-PRO": 0.3718 + } + }, + { + "model_id": "meta-llama/Llama-2-7b-chat-hf", + "name": "Llama-2-7b-chat-hf", + "developer": "meta-llama", + "scores": { + "IFEval": 0.3986, + "BBH": 0.3114, + "MATH Level 5": 0.0196, + "GPQA": 0.2534, + "MUSR": 0.3676, + "MMLU-PRO": 0.1688 + } + }, + { + "model_id": "meta-llama/Llama-2-7b-hf", + "name": "Llama-2-7b-hf", + "developer": "meta-llama", + "scores": { + "IFEval": 0.2519, + "BBH": 0.3496, + "MATH Level 5": 0.0174, + "GPQA": 0.2668, + "MUSR": 0.3701, + "MMLU-PRO": 0.1861 + } + }, + { + "model_id": "meta-llama/Llama-3.1-70B", + "name": "Llama-3.1-70B", + "developer": "meta-llama", + "scores": { + "IFEval": 0.1684, + "BBH": 0.626, + "MATH Level 5": 0.1843, + "GPQA": 0.3876, + "MUSR": 0.4572, + "MMLU-PRO": 0.4654 + } + }, + { + "model_id": "meta-llama/Llama-3.1-70B-Instruct", + "name": "Llama-3.1-70B-Instruct", + "developer": "meta-llama", + "scores": { + "IFEval": 0.8669, + "BBH": 0.6917, + "MATH Level 5": 0.3807, + "GPQA": 0.3565, + "MUSR": 0.4581, + "MMLU-PRO": 0.5309 + } + }, + { + "model_id": "meta-llama/Llama-3.1-8B", + "name": "Llama-3.1-8B", + "developer": "meta-llama", + "scores": { + "IFEval": 0.1246, + "BBH": 0.466, + "MATH Level 5": 0.0657, + "GPQA": 0.3104, + "MUSR": 0.3812, + "MMLU-PRO": 0.3288 + } + }, + { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "name": "Llama-3.1-8B-Instruct", + "developer": "meta-llama", + "scores": { + "IFEval": 0.4922, + "BBH": 0.5087, + "MATH Level 5": 0.1556, + "GPQA": 0.3154, + "MUSR": 0.3972, + "MMLU-PRO": 0.3798 + } + }, + { + "model_id": "meta-llama/Llama-3.2-1B", + "name": "Llama-3.2-1B", + "developer": "meta-llama", + "scores": { + "IFEval": 0.1478, + "BBH": 0.3115, + "MATH Level 5": 0.0121, + "GPQA": 0.2282, + "MUSR": 0.3447, + "MMLU-PRO": 0.1203 + } + }, + { + "model_id": "meta-llama/Llama-3.2-1B-Instruct", + "name": "Llama-3.2-1B-Instruct", + "developer": "meta-llama", + "scores": { + "IFEval": 0.5698, + "BBH": 0.3497, + "MATH Level 5": 0.0702, + "GPQA": 0.2752, + "MUSR": 0.3329, + "MMLU-PRO": 0.1682 + } + }, + { + "model_id": "meta-llama/Llama-3.2-3B", + "name": "Llama-3.2-3B", + "developer": "meta-llama", + "scores": { + "IFEval": 0.1337, + "BBH": 0.3905, + "MATH Level 5": 0.0189, + "GPQA": 0.2676, + "MUSR": 0.3577, + "MMLU-PRO": 0.2488 + } + }, + { + "model_id": "meta-llama/Llama-3.2-3B-Instruct", + "name": "Llama-3.2-3B-Instruct", + "developer": "meta-llama", + "scores": { + "IFEval": 0.7393, + "BBH": 0.461, + "MATH Level 5": 0.1767, + "GPQA": 0.2785, + "MUSR": 0.3529, + "MMLU-PRO": 0.3195 + } + }, + { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "name": "Llama-3.3-70B-Instruct", + "developer": "meta-llama", + "scores": { + "IFEval": 0.8998, + "BBH": 0.6919, + "MATH Level 5": 0.4834, + "GPQA": 0.3289, + "MUSR": 0.4461, + "MMLU-PRO": 0.5332 + } + }, + { + "model_id": "meta-llama/Meta-Llama-3-70B", + "name": "Meta-Llama-3-70B", + "developer": "meta-llama", + "scores": { + "IFEval": 0.1603, + "BBH": 0.6461, + "MATH Level 5": 0.1858, + "GPQA": 0.3977, + "MUSR": 0.4518, + "MMLU-PRO": 0.4709 + } + }, + { + "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", + "name": "Meta-Llama-3-70B-Instruct", + "developer": "meta-llama", + "scores": { + "IFEval": 0.8099, + "BBH": 0.6547, + "MATH Level 5": 0.2447, + "GPQA": 0.2869, + "MUSR": 0.4154, + "MMLU-PRO": 0.5207 + } + }, + { + "model_id": "meta-llama/Meta-Llama-3-8B", + "name": "Meta-Llama-3-8B", + "developer": "meta-llama", + "scores": { + "IFEval": 0.1455, + "BBH": 0.4598, + "MATH Level 5": 0.0453, + "GPQA": 0.3054, + "MUSR": 0.3614, + "MMLU-PRO": 0.321 + } + }, + { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "name": "Meta-Llama-3-8B-Instruct", + "developer": "meta-llama", + "scores": { + "IFEval": 0.7408, + "BBH": 0.4989, + "MATH Level 5": 0.0869, + "GPQA": 0.2592, + "MUSR": 0.3568, + "MMLU-PRO": 0.3664 + } + }, + { + "model_id": "mhl1/Qwen2.5-0.5B-cinstruct-stage1", + "name": "Qwen2.5-0.5B-cinstruct-stage1", + "developer": "mhl1", + "scores": { + "IFEval": 0.1482, + "BBH": 0.3256, + "MATH Level 5": 0.0128, + "GPQA": 0.2651, + "MUSR": 0.35, + "MMLU-PRO": 0.1139 + } + }, + { + "model_id": "microsoft/DialoGPT-medium", + "name": "DialoGPT-medium", + "developer": "microsoft", + "scores": { + "IFEval": 0.1479, + "BBH": 0.3014, + "MATH Level 5": 0.0, + "GPQA": 0.2542, + "MUSR": 0.4287, + "MMLU-PRO": 0.1119 + } + }, + { + "model_id": "microsoft/Orca-2-13b", + "name": "Orca-2-13b", + "developer": "microsoft", + "scores": { + "IFEval": 0.3128, + "BBH": 0.4884, + "MATH Level 5": 0.0317, + "GPQA": 0.2802, + "MUSR": 0.513, + "MMLU-PRO": 0.2749 + } + }, + { + "model_id": "microsoft/Orca-2-7b", + "name": "Orca-2-7b", + "developer": "microsoft", + "scores": { + "IFEval": 0.2183, + "BBH": 0.4452, + "MATH Level 5": 0.0196, + "GPQA": 0.2609, + "MUSR": 0.5026, + "MMLU-PRO": 0.2319 + } + }, + { + "model_id": "microsoft/Phi-3-medium-128k-instruct", + "name": "Phi-3-medium-128k-instruct", + "developer": "microsoft", + "scores": { + "IFEval": 0.604, + "BBH": 0.6382, + "MATH Level 5": 0.1918, + "GPQA": 0.3364, + "MUSR": 0.4129, + "MMLU-PRO": 0.4712 + } + }, + { + "model_id": "microsoft/Phi-3-medium-4k-instruct", + "name": "Phi-3-medium-4k-instruct", + "developer": "microsoft", + "scores": { + "IFEval": 0.6423, + "BBH": 0.6412, + "MATH Level 5": 0.1956, + "GPQA": 0.3364, + "MUSR": 0.4258, + "MMLU-PRO": 0.4676 + } + }, + { + "model_id": "microsoft/Phi-3-mini-128k-instruct", + "name": "Phi-3-mini-128k-instruct", + "developer": "microsoft", + "scores": { + "IFEval": 0.5976, + "BBH": 0.5575, + "MATH Level 5": 0.1405, + "GPQA": 0.318, + "MUSR": 0.3937, + "MMLU-PRO": 0.3734 + } + }, + { + "model_id": "microsoft/Phi-3-mini-4k-instruct", + "name": "Phi-3-mini-4k-instruct", + "developer": "microsoft", + "scores": { + "IFEval": 0.5477, + "BBH": 0.5491, + "MATH Level 5": 0.1639, + "GPQA": 0.3322, + "MUSR": 0.4284, + "MMLU-PRO": 0.4022 + } + }, + { + "model_id": "microsoft/Phi-3-small-128k-instruct", + "name": "Phi-3-small-128k-instruct", + "developer": "microsoft", + "scores": { + "IFEval": 0.6368, + "BBH": 0.6202, + "MATH Level 5": 0.2026, + "GPQA": 0.3171, + "MUSR": 0.4378, + "MMLU-PRO": 0.4491 + } + }, + { + "model_id": "microsoft/Phi-3-small-8k-instruct", + "name": "Phi-3-small-8k-instruct", + "developer": "microsoft", + "scores": { + "IFEval": 0.6497, + "BBH": 0.6208, + "MATH Level 5": 0.1887, + "GPQA": 0.3121, + "MUSR": 0.4558, + "MMLU-PRO": 0.4506 + } + }, + { + "model_id": "microsoft/Phi-3.5-MoE-instruct", + "name": "Phi-3.5-MoE-instruct", + "developer": "microsoft", + "scores": { + "IFEval": 0.6925, + "BBH": 0.6408, + "MATH Level 5": 0.3119, + "GPQA": 0.3557, + "MUSR": 0.4565, + "MMLU-PRO": 0.4658 + } + }, + { + "model_id": "microsoft/Phi-3.5-mini-instruct", + "name": "Phi-3.5-mini-instruct", + "developer": "microsoft", + "scores": { + "IFEval": 0.5775, + "BBH": 0.5518, + "MATH Level 5": 0.1964, + "GPQA": 0.3398, + "MUSR": 0.4021, + "MMLU-PRO": 0.3962 + } + }, + { + "model_id": "microsoft/Phi-4-mini-instruct", + "name": "Phi-4-mini-instruct", + "developer": "microsoft", + "scores": { + "IFEval": 0.7378, + "BBH": 0.5689, + "MATH Level 5": 0.1699, + "GPQA": 0.3096, + "MUSR": 0.3873, + "MMLU-PRO": 0.3932 + } + }, + { + "model_id": "microsoft/phi-1", + "name": "phi-1", + "developer": "microsoft", + "scores": { + "IFEval": 0.2068, + "BBH": 0.3139, + "MATH Level 5": 0.0098, + "GPQA": 0.2651, + "MUSR": 0.3525, + "MMLU-PRO": 0.1162 + } + }, + { + "model_id": "microsoft/phi-1_5", + "name": "phi-1_5", + "developer": "microsoft", + "scores": { + "IFEval": 0.2033, + "BBH": 0.336, + "MATH Level 5": 0.0181, + "GPQA": 0.2676, + "MUSR": 0.3404, + "MMLU-PRO": 0.1691 + } + }, + { + "model_id": "microsoft/phi-2", + "name": "Phi-2", + "developer": "microsoft", + "scores": { + "IFEval": 0.2739, + "BBH": 0.4881, + "MATH Level 5": 0.0295, + "GPQA": 0.2718, + "MUSR": 0.4099, + "MMLU-PRO": 0.2628 + } + }, + { + "model_id": "microsoft/phi-4", + "name": "phi-4", + "developer": "microsoft", + "scores": { + "IFEval": 0.0488, + "BBH": 0.6703, + "MATH Level 5": 0.2787, + "GPQA": 0.401, + "MUSR": 0.5034, + "MMLU-PRO": 0.5295 + } + }, + { + "model_id": "migtissera/Llama-3-70B-Synthia-v3.5", + "name": "Llama-3-70B-Synthia-v3.5", + "developer": "migtissera", + "scores": { + "IFEval": 0.6076, + "BBH": 0.6489, + "MATH Level 5": 0.2115, + "GPQA": 0.3876, + "MUSR": 0.4922, + "MMLU-PRO": 0.4658 + } + }, + { + "model_id": "migtissera/Llama-3-8B-Synthia-v3.5", + "name": "Llama-3-8B-Synthia-v3.5", + "developer": "migtissera", + "scores": { + "IFEval": 0.507, + "BBH": 0.4888, + "MATH Level 5": 0.0657, + "GPQA": 0.2718, + "MUSR": 0.4044, + "MMLU-PRO": 0.303 + } + }, + { + "model_id": "migtissera/Tess-3-7B-SFT", + "name": "Tess-3-7B-SFT", + "developer": "migtissera", + "scores": { + "IFEval": 0.3946, + "BBH": 0.4607, + "MATH Level 5": 0.04, + "GPQA": 0.271, + "MUSR": 0.4113, + "MMLU-PRO": 0.3034 + } + }, + { + "model_id": "migtissera/Tess-3-Mistral-Nemo-12B", + "name": "Tess-3-Mistral-Nemo-12B", + "developer": "migtissera", + "scores": { + "IFEval": 0.3355, + "BBH": 0.4899, + "MATH Level 5": 0.0574, + "GPQA": 0.2508, + "MUSR": 0.4458, + "MMLU-PRO": 0.2565 + } + }, + { + "model_id": "migtissera/Tess-v2.5-Phi-3-medium-128k-14B", + "name": "Tess-v2.5-Phi-3-medium-128k-14B", + "developer": "migtissera", + "scores": { + "IFEval": 0.4539, + "BBH": 0.6207, + "MATH Level 5": 0.0506, + "GPQA": 0.3079, + "MUSR": 0.4113, + "MMLU-PRO": 0.3732 + } + }, + { + "model_id": "migtissera/Tess-v2.5.2-Qwen2-72B", + "name": "Tess-v2.5.2-Qwen2-72B", + "developer": "migtissera", + "scores": { + "IFEval": 0.4494, + "BBH": 0.6647, + "MATH Level 5": 0.2938, + "GPQA": 0.3507, + "MUSR": 0.4188, + "MMLU-PRO": 0.5561 + } + }, + { + "model_id": "migtissera/Trinity-2-Codestral-22B", + "name": "Trinity-2-Codestral-22B", + "developer": "migtissera", + "scores": { + "IFEval": 0.4202, + "BBH": 0.5593, + "MATH Level 5": 0.0967, + "GPQA": 0.3146, + "MUSR": 0.4111, + "MMLU-PRO": 0.3308 + } + }, + { + "model_id": "migtissera/Trinity-2-Codestral-22B-v0.2", + "name": "Trinity-2-Codestral-22B-v0.2", + "developer": "migtissera", + "scores": { + "IFEval": 0.4345, + "BBH": 0.5686, + "MATH Level 5": 0.0838, + "GPQA": 0.3003, + "MUSR": 0.4045, + "MMLU-PRO": 0.334 + } + }, + { + "model_id": "mindw96/DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3", + "name": "DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3", + "developer": "mindw96", + "scores": { + "IFEval": 0.1388, + "BBH": 0.3068, + "MATH Level 5": 0.0083, + "GPQA": 0.2508, + "MUSR": 0.3792, + "MMLU-PRO": 0.1106 + } + }, + { + "model_id": "minghaowu/Qwen1.5-1.8B-OpenHermes-2.5", + "name": "Qwen1.5-1.8B-OpenHermes-2.5", + "developer": "minghaowu", + "scores": { + "IFEval": 0.2778, + "BBH": 0.3375, + "MATH Level 5": 0.0242, + "GPQA": 0.2836, + "MUSR": 0.3529, + "MMLU-PRO": 0.1792 + } + }, + { + "model_id": "ministral/Ministral-3b-instruct", + "name": "Ministral-3b-instruct", + "developer": "ministral", + "scores": { + "IFEval": 0.1358, + "BBH": 0.3192, + "MATH Level 5": 0.0083, + "GPQA": 0.2517, + "MUSR": 0.3382, + "MMLU-PRO": 0.1093 + } + }, + { + "model_id": "mistral-community/Mistral-7B-v0.2", + "name": "Mistral-7B-v0.2", + "developer": "mistral-community", + "scores": { + "IFEval": 0.2266, + "BBH": 0.451, + "MATH Level 5": 0.0302, + "GPQA": 0.2919, + "MUSR": 0.4032, + "MMLU-PRO": 0.2953 + } + }, + { + "model_id": "mistral-community/Mixtral-8x22B-v0.1", + "name": "Mixtral-8x22B-v0.1", + "developer": "mistral-community", + "scores": { + "IFEval": 0.3167, + "BBH": 0.38, + "MATH Level 5": 0.1543, + "GPQA": 0.33, + "MUSR": 0.3533, + "MMLU-PRO": 0.36 + } + }, + { + "model_id": "mistral-community/mixtral-8x22B-v0.3", + "name": "mixtral-8x22B-v0.3", + "developer": "mistral-community", + "scores": { + "IFEval": 0.2583, + "BBH": 0.625, + "MATH Level 5": 0.1835, + "GPQA": 0.3775, + "MUSR": 0.4037, + "MMLU-PRO": 0.4639 + } + }, + { + "model_id": "mistralai/Codestral-22B-v0.1", + "name": "Codestral-22B-v0.1", + "developer": "mistralai", + "scores": { + "IFEval": 0.5772, + "BBH": 0.5139, + "MATH Level 5": 0.1005, + "GPQA": 0.2987, + "MUSR": 0.4187, + "MMLU-PRO": 0.3156 + } + }, + { + "model_id": "mistralai/Ministral-8B-Instruct-2410", + "name": "Ministral-8B-Instruct-2410", + "developer": "mistralai", + "scores": { + "IFEval": 0.5896, + "BBH": 0.4762, + "MATH Level 5": 0.1956, + "GPQA": 0.2844, + "MUSR": 0.4138, + "MMLU-PRO": 0.3291 + } + }, + { + "model_id": "mistralai/Mistral-7B-Instruct-v0.1", + "name": "Mistral-7B-Instruct-v0.1", + "developer": "mistralai", + "scores": { + "IFEval": 0.4487, + "BBH": 0.3355, + "MATH Level 5": 0.0227, + "GPQA": 0.25, + "MUSR": 0.3848, + "MMLU-PRO": 0.2414 + } + }, + { + "model_id": "mistralai/Mistral-7B-Instruct-v0.2", + "name": "Mistral-7B-Instruct-v0.2", + "developer": "mistralai", + "scores": { + "IFEval": 0.5496, + "BBH": 0.446, + "MATH Level 5": 0.0302, + "GPQA": 0.276, + "MUSR": 0.3966, + "MMLU-PRO": 0.2717 + } + }, + { + "model_id": "mistralai/Mistral-7B-Instruct-v0.3", + "name": "Mistral-7B-Instruct-v0.3", + "developer": "mistralai", + "scores": { + "IFEval": 0.5465, + "BBH": 0.4722, + "MATH Level 5": 0.0385, + "GPQA": 0.2794, + "MUSR": 0.3739, + "MMLU-PRO": 0.3075 + } + }, + { + "model_id": "mistralai/Mistral-7B-v0.1", + "name": "Mistral-7B-v0.1", + "developer": "mistralai", + "scores": { + "IFEval": 0.2386, + "BBH": 0.4419, + "MATH Level 5": 0.0295, + "GPQA": 0.2919, + "MUSR": 0.4139, + "MMLU-PRO": 0.3013 + } + }, + { + "model_id": "mistralai/Mistral-7B-v0.3", + "name": "Mistral-7B-v0.3", + "developer": "mistralai", + "scores": { + "IFEval": 0.2266, + "BBH": 0.4517, + "MATH Level 5": 0.0302, + "GPQA": 0.2919, + "MUSR": 0.4032, + "MMLU-PRO": 0.2953 + } + }, + { + "model_id": "mistralai/Mistral-Large-Instruct-2411", + "name": "Mistral-Large-Instruct-2411", + "developer": "mistralai", + "scores": { + "IFEval": 0.8401, + "BBH": 0.6747, + "MATH Level 5": 0.4955, + "GPQA": 0.4371, + "MUSR": 0.454, + "MMLU-PRO": 0.5562 + } + }, + { + "model_id": "mistralai/Mistral-Nemo-Base-2407", + "name": "Mistral-Nemo-Base-2407", + "developer": "mistralai", + "scores": { + "IFEval": 0.163, + "BBH": 0.5035, + "MATH Level 5": 0.0597, + "GPQA": 0.2936, + "MUSR": 0.3921, + "MMLU-PRO": 0.3472 + } + }, + { + "model_id": "mistralai/Mistral-Nemo-Instruct-2407", + "name": "Mistral-Nemo-Instruct-2407", + "developer": "mistralai", + "scores": { + "IFEval": 0.638, + "BBH": 0.5037, + "MATH Level 5": 0.1269, + "GPQA": 0.2903, + "MUSR": 0.39, + "MMLU-PRO": 0.3517 + } + }, + { + "model_id": "mistralai/Mistral-Small-24B-Base-2501", + "name": "Mistral-Small-24B-Base-2501", + "developer": "mistralai", + "scores": { + "IFEval": 0.1672, + "BBH": 0.6442, + "MATH Level 5": 0.1971, + "GPQA": 0.3876, + "MUSR": 0.4237, + "MMLU-PRO": 0.5406 + } + }, + { + "model_id": "mistralai/Mistral-Small-Instruct-2409", + "name": "Mistral-Small-Instruct-2409", + "developer": "mistralai", + "scores": { + "IFEval": 0.667, + "BBH": 0.5213, + "MATH Level 5": 0.1435, + "GPQA": 0.3238, + "MUSR": 0.3632, + "MMLU-PRO": 0.396 + } + }, + { + "model_id": "mistralai/Mixtral-8x22B-Instruct-v0.1", + "name": "Mixtral-8x22B-Instruct-v0.1", + "developer": "mistralai", + "scores": { + "IFEval": 0.7184, + "BBH": 0.6125, + "MATH Level 5": 0.1873, + "GPQA": 0.3733, + "MUSR": 0.4311, + "MMLU-PRO": 0.4483 + } + }, + { + "model_id": "mistralai/Mixtral-8x22B-v0.1", + "name": "Mixtral-8x22B-v0.1", + "developer": "mistralai", + "scores": { + "IFEval": 0.2583, + "BBH": 0.624, + "MATH Level 5": 0.1835, + "GPQA": 0.3758, + "MUSR": 0.4037, + "MMLU-PRO": 0.4639 + } + }, + { + "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "name": "Mixtral-8x7B-Instruct-v0.1", + "developer": "mistralai", + "scores": { + "IFEval": 0.5599, + "BBH": 0.4962, + "MATH Level 5": 0.0914, + "GPQA": 0.3029, + "MUSR": 0.4203, + "MMLU-PRO": 0.3692 + } + }, + { + "model_id": "mistralai/Mixtral-8x7B-v0.1", + "name": "Mixtral-8x7B-v0.1", + "developer": "mistralai", + "scores": { + "IFEval": 0.2415, + "BBH": 0.5087, + "MATH Level 5": 0.102, + "GPQA": 0.3138, + "MUSR": 0.4321, + "MMLU-PRO": 0.385 + } + }, + { + "model_id": "mixtao/MixTAO-7Bx2-MoE-v8.1", + "name": "MixTAO-7Bx2-MoE-v8.1", + "developer": "mixtao", + "scores": { + "IFEval": 0.4162, + "BBH": 0.5189, + "MATH Level 5": 0.0906, + "GPQA": 0.2844, + "MUSR": 0.4463, + "MMLU-PRO": 0.3123 + } + }, + { + "model_id": "mkurman/llama-3.2-MEDIT-3B-o1", + "name": "llama-3.2-MEDIT-3B-o1", + "developer": "mkurman", + "scores": { + "IFEval": 0.4382, + "BBH": 0.44, + "MATH Level 5": 0.1307, + "GPQA": 0.2659, + "MUSR": 0.3565, + "MMLU-PRO": 0.2741 + } + }, + { + "model_id": "mkurman/phi-4-MedIT-11B-exp-1", + "name": "phi-4-MedIT-11B-exp-1", + "developer": "mkurman", + "scores": { + "IFEval": 0.5948, + "BBH": 0.5414, + "MATH Level 5": 0.0899, + "GPQA": 0.3012, + "MUSR": 0.3848, + "MMLU-PRO": 0.3825 + } + }, + { + "model_id": "mkurman/phi4-MedIT-10B-o1", + "name": "phi4-MedIT-10B-o1", + "developer": "mkurman", + "scores": { + "IFEval": 0.3463, + "BBH": 0.5198, + "MATH Level 5": 0.1148, + "GPQA": 0.2458, + "MUSR": 0.3968, + "MMLU-PRO": 0.3507 + } + }, + { + "model_id": "mkxu/llama-3-8b-instruct-fpo", + "name": "llama-3-8b-instruct-fpo", + "developer": "mkxu", + "scores": { + "IFEval": 0.679, + "BBH": 0.4959, + "MATH Level 5": 0.0733, + "GPQA": 0.2777, + "MUSR": 0.3658, + "MMLU-PRO": 0.3605 + } + }, + { + "model_id": "mkxu/llama-3-8b-po1", + "name": "llama-3-8b-po1", + "developer": "mkxu", + "scores": { + "IFEval": 0.4081, + "BBH": 0.4976, + "MATH Level 5": 0.0702, + "GPQA": 0.297, + "MUSR": 0.3804, + "MMLU-PRO": 0.3562 + } + }, + { + "model_id": "mlabonne/AlphaMonarch-7B", + "name": "AlphaMonarch-7B", + "developer": "mlabonne", + "scores": { + "IFEval": 0.4939, + "BBH": 0.4626, + "MATH Level 5": 0.0408, + "GPQA": 0.2701, + "MUSR": 0.4121, + "MMLU-PRO": 0.2473 + } + }, + { + "model_id": "mlabonne/Beyonder-4x7B-v3", + "name": "Beyonder-4x7B-v3", + "developer": "mlabonne", + "scores": { + "IFEval": 0.5608, + "BBH": 0.4671, + "MATH Level 5": 0.0536, + "GPQA": 0.2852, + "MUSR": 0.4045, + "MMLU-PRO": 0.2512 + } + }, + { + "model_id": "mlabonne/BigQwen2.5-52B-Instruct", + "name": "BigQwen2.5-52B-Instruct", + "developer": "mlabonne", + "scores": { + "IFEval": 0.7913, + "BBH": 0.7121, + "MATH Level 5": 0.5476, + "GPQA": 0.302, + "MUSR": 0.4113, + "MMLU-PRO": 0.5519 + } + }, + { + "model_id": "mlabonne/BigQwen2.5-Echo-47B-Instruct", + "name": "BigQwen2.5-Echo-47B-Instruct", + "developer": "mlabonne", + "scores": { + "IFEval": 0.7357, + "BBH": 0.6125, + "MATH Level 5": 0.4381, + "GPQA": 0.3146, + "MUSR": 0.4125, + "MMLU-PRO": 0.4734 + } + }, + { + "model_id": "mlabonne/ChimeraLlama-3-8B-v2", + "name": "ChimeraLlama-3-8B-v2", + "developer": "mlabonne", + "scores": { + "IFEval": 0.4469, + "BBH": 0.5046, + "MATH Level 5": 0.0906, + "GPQA": 0.2852, + "MUSR": 0.3791, + "MMLU-PRO": 0.3569 + } + }, + { + "model_id": "mlabonne/ChimeraLlama-3-8B-v3", + "name": "ChimeraLlama-3-8B-v3", + "developer": "mlabonne", + "scores": { + "IFEval": 0.4408, + "BBH": 0.4978, + "MATH Level 5": 0.0884, + "GPQA": 0.2919, + "MUSR": 0.4004, + "MMLU-PRO": 0.3669 + } + }, + { + "model_id": "mlabonne/Daredevil-8B", + "name": "Daredevil-8B", + "developer": "mlabonne", + "scores": { + "IFEval": 0.4548, + "BBH": 0.5194, + "MATH Level 5": 0.1065, + "GPQA": 0.3079, + "MUSR": 0.3939, + "MMLU-PRO": 0.3831 + } + }, + { + "model_id": "mlabonne/Daredevil-8B-abliterated", + "name": "Daredevil-8B-abliterated", + "developer": "mlabonne", + "scores": { + "IFEval": 0.4426, + "BBH": 0.4254, + "MATH Level 5": 0.0944, + "GPQA": 0.2903, + "MUSR": 0.407, + "MMLU-PRO": 0.3701 + } + }, + { + "model_id": "mlabonne/Hermes-3-Llama-3.1-70B-lorablated", + "name": "Hermes-3-Llama-3.1-70B-lorablated", + "developer": "mlabonne", + "scores": { + "IFEval": 0.3424, + "BBH": 0.6693, + "MATH Level 5": 0.2243, + "GPQA": 0.3658, + "MUSR": 0.5029, + "MMLU-PRO": 0.4679 + } + }, + { + "model_id": "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated", + "name": "Meta-Llama-3.1-8B-Instruct-abliterated", + "developer": "mlabonne", + "scores": { + "IFEval": 0.7329, + "BBH": 0.4874, + "MATH Level 5": 0.0687, + "GPQA": 0.2567, + "MUSR": 0.3649, + "MMLU-PRO": 0.3503 + } + }, + { + "model_id": "mlabonne/NeuralBeagle14-7B", + "name": "NeuralBeagle14-7B", + "developer": "mlabonne", + "scores": { + "IFEval": 0.4935, + "BBH": 0.4628, + "MATH Level 5": 0.0521, + "GPQA": 0.2819, + "MUSR": 0.4319, + "MMLU-PRO": 0.2601 + } + }, + { + "model_id": "mlabonne/NeuralDaredevil-8B-abliterated", + "name": "NeuralDaredevil-8B-abliterated", + "developer": "mlabonne", + "scores": { + "IFEval": 0.7561, + "BBH": 0.5111, + "MATH Level 5": 0.0906, + "GPQA": 0.3062, + "MUSR": 0.4019, + "MMLU-PRO": 0.3841 + } + }, + { + "model_id": "mlabonne/OrpoLlama-3-8B", + "name": "OrpoLlama-3-8B", + "developer": "mlabonne", + "scores": { + "IFEval": 0.3653, + "BBH": 0.4424, + "MATH Level 5": 0.0559, + "GPQA": 0.2794, + "MUSR": 0.3579, + "MMLU-PRO": 0.2705 + } + }, + { + "model_id": "mlabonne/phixtral-2x2_8", + "name": "phixtral-2x2_8", + "developer": "mlabonne", + "scores": { + "IFEval": 0.3431, + "BBH": 0.4889, + "MATH Level 5": 0.0355, + "GPQA": 0.2651, + "MUSR": 0.3644, + "MMLU-PRO": 0.2551 + } + }, + { + "model_id": "mlx-community/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32", + "name": "Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32", + "developer": "mlx-community", + "scores": { + "IFEval": 0.3369, + "BBH": 0.3292, + "MATH Level 5": 0.0846, + "GPQA": 0.2576, + "MUSR": 0.3249, + "MMLU-PRO": 0.1638 + } + }, + { + "model_id": "mlx-community/Mistral-Small-24B-Instruct-2501-bf16", + "name": "Mistral-Small-24B-Instruct-2501-bf16", + "developer": "mlx-community", + "scores": { + "IFEval": 0.6283, + "BBH": 0.6713, + "MATH Level 5": 0.3225, + "GPQA": 0.3951, + "MUSR": 0.4618, + "MMLU-PRO": 0.5395 + } + }, + { + "model_id": "mmnga/Llama-3-70B-japanese-suzume-vector-v0.1", + "name": "Llama-3-70B-japanese-suzume-vector-v0.1", + "developer": "mmnga", + "scores": { + "IFEval": 0.4649, + "BBH": 0.6542, + "MATH Level 5": 0.2326, + "GPQA": 0.2861, + "MUSR": 0.4141, + "MMLU-PRO": 0.5224 + } + }, + { + "model_id": "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Llama3-8B-v1.1", + "name": "DeepSeek-R1-ReDistill-Llama3-8B-v1.1", + "developer": "mobiuslabsgmbh", + "scores": { + "IFEval": 0.3704, + "BBH": 0.3473, + "MATH Level 5": 0.3285, + "GPQA": 0.271, + "MUSR": 0.3396, + "MMLU-PRO": 0.2198 + } + }, + { + "model_id": "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-7B-v1.1", + "name": "DeepSeek-R1-ReDistill-Qwen-7B-v1.1", + "developer": "mobiuslabsgmbh", + "scores": { + "IFEval": 0.3473, + "BBH": 0.3698, + "MATH Level 5": 0.3497, + "GPQA": 0.2651, + "MUSR": 0.4009, + "MMLU-PRO": 0.2326 + } + }, + { + "model_id": "moeru-ai/L3.1-Moe-2x8B-v0.2", + "name": "L3.1-Moe-2x8B-v0.2", + "developer": "moeru-ai", + "scores": { + "IFEval": 0.7348, + "BBH": 0.5256, + "MATH Level 5": 0.1699, + "GPQA": 0.3003, + "MUSR": 0.4199, + "MMLU-PRO": 0.3858 + } + }, + { + "model_id": "moeru-ai/L3.1-Moe-4x8B-v0.1", + "name": "L3.1-Moe-4x8B-v0.1", + "developer": "moeru-ai", + "scores": { + "IFEval": 0.4332, + "BBH": 0.4939, + "MATH Level 5": 0.1299, + "GPQA": 0.2592, + "MUSR": 0.3609, + "MMLU-PRO": 0.3454 + } + }, + { + "model_id": "moeru-ai/L3.1-Moe-4x8B-v0.2", + "name": "L3.1-Moe-4x8B-v0.2", + "developer": "moeru-ai", + "scores": { + "IFEval": 0.5407, + "BBH": 0.4466, + "MATH Level 5": 0.1035, + "GPQA": 0.2668, + "MUSR": 0.3234, + "MMLU-PRO": 0.2763 + } + }, + { + "model_id": "monsterapi/Llama-3_1-8B-Instruct-orca-ORPO", + "name": "Llama-3_1-8B-Instruct-orca-ORPO", + "developer": "monsterapi", + "scores": { + "IFEval": 0.2273, + "BBH": 0.2865, + "MATH Level 5": 0.0, + "GPQA": 0.2492, + "MUSR": 0.3445, + "MMLU-PRO": 0.1168 + } + }, + { + "model_id": "monsterapi/gemma-2-2b-LoRA-MonsterInstruct", + "name": "gemma-2-2b-LoRA-MonsterInstruct", + "developer": "monsterapi", + "scores": { + "IFEval": 0.3903, + "BBH": 0.365, + "MATH Level 5": 0.0506, + "GPQA": 0.2701, + "MUSR": 0.3644, + "MMLU-PRO": 0.1987 + } + }, + { + "model_id": "mosaicml/mpt-7b", + "name": "mpt-7b", + "developer": "mosaicml", + "scores": { + "IFEval": 0.2152, + "BBH": 0.33, + "MATH Level 5": 0.0159, + "GPQA": 0.2601, + "MUSR": 0.3672, + "MMLU-PRO": 0.1206 + } + }, + { + "model_id": "mosama/Qwen2.5-1.5B-Instruct-CoT-Reflection", + "name": "Qwen2.5-1.5B-Instruct-CoT-Reflection", + "developer": "mosama", + "scores": { + "IFEval": 0.287, + "BBH": 0.4109, + "MATH Level 5": 0.0272, + "GPQA": 0.2617, + "MUSR": 0.3212, + "MMLU-PRO": 0.2651 + } + }, + { + "model_id": "mrdayl/OpenCogito", + "name": "OpenCogito", + "developer": "mrdayl", + "scores": { + "IFEval": 0.3934, + "BBH": 0.472, + "MATH Level 5": 0.2183, + "GPQA": 0.3003, + "MUSR": 0.424, + "MMLU-PRO": 0.3452 + } + }, + { + "model_id": "mrdayl/OpenCognito", + "name": "OpenCognito", + "developer": "mrdayl", + "scores": { + "IFEval": 0.4062, + "BBH": 0.4706, + "MATH Level 5": 0.2115, + "GPQA": 0.2978, + "MUSR": 0.4293, + "MMLU-PRO": 0.3443 + } + }, + { + "model_id": "mrdayl/OpenCognito-r1", + "name": "OpenCognito-r1", + "developer": "mrdayl", + "scores": { + "IFEval": 0.4241, + "BBH": 0.4673, + "MATH Level 5": 0.1903, + "GPQA": 0.2995, + "MUSR": 0.4241, + "MMLU-PRO": 0.3475 + } + }, + { + "model_id": "mrdayl/OpenCognito-r2", + "name": "OpenCognito-r2", + "developer": "mrdayl", + "scores": { + "IFEval": 0.3959, + "BBH": 0.4688, + "MATH Level 5": 0.2024, + "GPQA": 0.3062, + "MUSR": 0.4202, + "MMLU-PRO": 0.3462 + } + }, + { + "model_id": "mrdayl/OpenThink", + "name": "OpenThink", + "developer": "mrdayl", + "scores": { + "IFEval": 0.2054, + "BBH": 0.346, + "MATH Level 5": 0.2885, + "GPQA": 0.2827, + "MUSR": 0.3289, + "MMLU-PRO": 0.185 + } + }, + { + "model_id": "mrm8488/phi-4-14B-grpo-gsm8k-3e", + "name": "phi-4-14B-grpo-gsm8k-3e", + "developer": "mrm8488", + "scores": { + "IFEval": 0.6885, + "BBH": 0.6805, + "MATH Level 5": 0.4524, + "GPQA": 0.3356, + "MUSR": 0.3994, + "MMLU-PRO": 0.5268 + } + }, + { + "model_id": "mrm8488/phi-4-14B-grpo-limo", + "name": "phi-4-14B-grpo-limo", + "developer": "mrm8488", + "scores": { + "IFEval": 0.6812, + "BBH": 0.6785, + "MATH Level 5": 0.4569, + "GPQA": 0.3364, + "MUSR": 0.3981, + "MMLU-PRO": 0.5261 + } + }, + { + "model_id": "mukaj/Llama-3.1-Hawkish-8B", + "name": "Llama-3.1-Hawkish-8B", + "developer": "mukaj", + "scores": { + "IFEval": 0.672, + "BBH": 0.4884, + "MATH Level 5": 0.2432, + "GPQA": 0.2903, + "MUSR": 0.3967, + "MMLU-PRO": 0.3331 + } + }, + { + "model_id": "natong19/Mistral-Nemo-Instruct-2407-abliterated", + "name": "Mistral-Nemo-Instruct-2407-abliterated", + "developer": "natong19", + "scores": { + "IFEval": 0.6392, + "BBH": 0.5048, + "MATH Level 5": 0.1322, + "GPQA": 0.2869, + "MUSR": 0.4033, + "MMLU-PRO": 0.3518 + } + }, + { + "model_id": "natong19/Qwen2-7B-Instruct-abliterated", + "name": "Qwen2-7B-Instruct-abliterated", + "developer": "natong19", + "scores": { + "IFEval": 0.5837, + "BBH": 0.5553, + "MATH Level 5": 0.2764, + "GPQA": 0.3012, + "MUSR": 0.4034, + "MMLU-PRO": 0.3842 + } + }, + { + "model_id": "nazimali/Mistral-Nemo-Kurdish", + "name": "Mistral-Nemo-Kurdish", + "developer": "nazimali", + "scores": { + "IFEval": 0.3401, + "BBH": 0.5133, + "MATH Level 5": 0.0959, + "GPQA": 0.3012, + "MUSR": 0.4116, + "MMLU-PRO": 0.3235 + } + }, + { + "model_id": "nazimali/Mistral-Nemo-Kurdish-Instruct", + "name": "Mistral-Nemo-Kurdish-Instruct", + "developer": "nazimali", + "scores": { + "IFEval": 0.4964, + "BBH": 0.4699, + "MATH Level 5": 0.0045, + "GPQA": 0.2827, + "MUSR": 0.3979, + "MMLU-PRO": 0.3063 + } + }, + { + "model_id": "nbeerbower/BigKartoffel-mistral-nemo-20B", + "name": "BigKartoffel-mistral-nemo-20B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.5857, + "BBH": 0.5515, + "MATH Level 5": 0.0264, + "GPQA": 0.2869, + "MUSR": 0.428, + "MMLU-PRO": 0.353 + } + }, + { + "model_id": "nbeerbower/DoppelKartoffel-Mistral-Nemo-23B", + "name": "DoppelKartoffel-Mistral-Nemo-23B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.5191, + "BBH": 0.5218, + "MATH Level 5": 0.031, + "GPQA": 0.2752, + "MUSR": 0.3795, + "MMLU-PRO": 0.308 + } + }, + { + "model_id": "nbeerbower/DoublePotato-Mistral-Nemo-13B", + "name": "DoublePotato-Mistral-Nemo-13B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.6796, + "BBH": 0.5438, + "MATH Level 5": 0.04, + "GPQA": 0.3012, + "MUSR": 0.46, + "MMLU-PRO": 0.3596 + } + }, + { + "model_id": "nbeerbower/Dumpling-Qwen2.5-1.5B", + "name": "Dumpling-Qwen2.5-1.5B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.3699, + "BBH": 0.416, + "MATH Level 5": 0.1171, + "GPQA": 0.2685, + "MUSR": 0.3728, + "MMLU-PRO": 0.2772 + } + }, + { + "model_id": "nbeerbower/Dumpling-Qwen2.5-14B", + "name": "Dumpling-Qwen2.5-14B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.6064, + "BBH": 0.6451, + "MATH Level 5": 0.3097, + "GPQA": 0.3012, + "MUSR": 0.4354, + "MMLU-PRO": 0.517 + } + }, + { + "model_id": "nbeerbower/Dumpling-Qwen2.5-7B-1k-r16", + "name": "Dumpling-Qwen2.5-7B-1k-r16", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.486, + "BBH": 0.5214, + "MATH Level 5": 0.2364, + "GPQA": 0.2701, + "MUSR": 0.423, + "MMLU-PRO": 0.3959 + } + }, + { + "model_id": "nbeerbower/Dumpling-Qwen2.5-7B-1k-r64-2e-5", + "name": "Dumpling-Qwen2.5-7B-1k-r64-2e-5", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.4179, + "BBH": 0.5301, + "MATH Level 5": 0.2115, + "GPQA": 0.2701, + "MUSR": 0.4486, + "MMLU-PRO": 0.4122 + } + }, + { + "model_id": "nbeerbower/EVA-abliterated-TIES-Qwen2.5-1.5B", + "name": "EVA-abliterated-TIES-Qwen2.5-1.5B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.4115, + "BBH": 0.3997, + "MATH Level 5": 0.1375, + "GPQA": 0.2651, + "MUSR": 0.3502, + "MMLU-PRO": 0.2712 + } + }, + { + "model_id": "nbeerbower/EVA-abliterated-TIES-Qwen2.5-14B", + "name": "EVA-abliterated-TIES-Qwen2.5-14B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.7836, + "BBH": 0.6372, + "MATH Level 5": 0.5045, + "GPQA": 0.3549, + "MUSR": 0.4407, + "MMLU-PRO": 0.5211 + } + }, + { + "model_id": "nbeerbower/Flammades-Mistral-Nemo-12B", + "name": "Flammades-Mistral-Nemo-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.3842, + "BBH": 0.53, + "MATH Level 5": 0.0755, + "GPQA": 0.3037, + "MUSR": 0.4806, + "MMLU-PRO": 0.3661 + } + }, + { + "model_id": "nbeerbower/Gemma2-Gutenberg-Doppel-9B", + "name": "Gemma2-Gutenberg-Doppel-9B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.7171, + "BBH": 0.587, + "MATH Level 5": 0.1979, + "GPQA": 0.3297, + "MUSR": 0.4608, + "MMLU-PRO": 0.4127 + } + }, + { + "model_id": "nbeerbower/Gutensuppe-mistral-nemo-12B", + "name": "Gutensuppe-mistral-nemo-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.2916, + "BBH": 0.5487, + "MATH Level 5": 0.1329, + "GPQA": 0.3372, + "MUSR": 0.429, + "MMLU-PRO": 0.368 + } + }, + { + "model_id": "nbeerbower/Hermes2-Gutenberg2-Mistral-7B", + "name": "Hermes2-Gutenberg2-Mistral-7B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.3721, + "BBH": 0.4981, + "MATH Level 5": 0.0574, + "GPQA": 0.2894, + "MUSR": 0.4623, + "MMLU-PRO": 0.2993 + } + }, + { + "model_id": "nbeerbower/Kartoffel-Deepfry-12B", + "name": "Kartoffel-Deepfry-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.5022, + "BBH": 0.5365, + "MATH Level 5": 0.0604, + "GPQA": 0.2961, + "MUSR": 0.4792, + "MMLU-PRO": 0.3582 + } + }, + { + "model_id": "nbeerbower/Llama-3.1-Nemotron-lorablated-70B", + "name": "Llama-3.1-Nemotron-lorablated-70B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.7229, + "BBH": 0.6825, + "MATH Level 5": 0.3338, + "GPQA": 0.3909, + "MUSR": 0.4682, + "MMLU-PRO": 0.5343 + } + }, + { + "model_id": "nbeerbower/Llama3.1-Gutenberg-Doppel-70B", + "name": "Llama3.1-Gutenberg-Doppel-70B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.7092, + "BBH": 0.6661, + "MATH Level 5": 0.2122, + "GPQA": 0.3448, + "MUSR": 0.4897, + "MMLU-PRO": 0.4737 + } + }, + { + "model_id": "nbeerbower/Lyra-Gutenberg-mistral-nemo-12B", + "name": "Lyra-Gutenberg-mistral-nemo-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.3495, + "BBH": 0.5586, + "MATH Level 5": 0.1012, + "GPQA": 0.3339, + "MUSR": 0.4357, + "MMLU-PRO": 0.3628 + } + }, + { + "model_id": "nbeerbower/Lyra4-Gutenberg-12B", + "name": "Lyra4-Gutenberg-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.2212, + "BBH": 0.5387, + "MATH Level 5": 0.1299, + "GPQA": 0.3188, + "MUSR": 0.4038, + "MMLU-PRO": 0.3571 + } + }, + { + "model_id": "nbeerbower/Lyra4-Gutenberg2-12B", + "name": "Lyra4-Gutenberg2-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.2585, + "BBH": 0.5345, + "MATH Level 5": 0.1171, + "GPQA": 0.3129, + "MUSR": 0.3972, + "MMLU-PRO": 0.3565 + } + }, + { + "model_id": "nbeerbower/Mahou-1.5-mistral-nemo-12B-lorablated", + "name": "Mahou-1.5-mistral-nemo-12B-lorablated", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.6825, + "BBH": 0.5496, + "MATH Level 5": 0.0891, + "GPQA": 0.2794, + "MUSR": 0.4522, + "MMLU-PRO": 0.3574 + } + }, + { + "model_id": "nbeerbower/Mistral-Gutenberg-Doppel-7B-FFT", + "name": "Mistral-Gutenberg-Doppel-7B-FFT", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.5717, + "BBH": 0.4076, + "MATH Level 5": 0.0249, + "GPQA": 0.2836, + "MUSR": 0.4059, + "MMLU-PRO": 0.2729 + } + }, + { + "model_id": "nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B", + "name": "Mistral-Nemo-Gutenberg-Doppel-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.3567, + "BBH": 0.5275, + "MATH Level 5": 0.1216, + "GPQA": 0.3163, + "MUSR": 0.4132, + "MMLU-PRO": 0.3579 + } + }, + { + "model_id": "nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B-v2", + "name": "Mistral-Nemo-Gutenberg-Doppel-12B-v2", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.6536, + "BBH": 0.5374, + "MATH Level 5": 0.1156, + "GPQA": 0.271, + "MUSR": 0.4233, + "MMLU-PRO": 0.3546 + } + }, + { + "model_id": "nbeerbower/Mistral-Nemo-Moderne-12B-FFT-experimental", + "name": "Mistral-Nemo-Moderne-12B-FFT-experimental", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.3352, + "BBH": 0.5234, + "MATH Level 5": 0.077, + "GPQA": 0.281, + "MUSR": 0.3715, + "MMLU-PRO": 0.3455 + } + }, + { + "model_id": "nbeerbower/Mistral-Nemo-Prism-12B", + "name": "Mistral-Nemo-Prism-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.6858, + "BBH": 0.5475, + "MATH Level 5": 0.0869, + "GPQA": 0.3079, + "MUSR": 0.4626, + "MMLU-PRO": 0.3581 + } + }, + { + "model_id": "nbeerbower/Mistral-Nemo-Prism-12B-v2", + "name": "Mistral-Nemo-Prism-12B-v2", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.6974, + "BBH": 0.5492, + "MATH Level 5": 0.0891, + "GPQA": 0.3054, + "MUSR": 0.46, + "MMLU-PRO": 0.3567 + } + }, + { + "model_id": "nbeerbower/Mistral-Nemo-Prism-12B-v7", + "name": "Mistral-Nemo-Prism-12B-v7", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.6962, + "BBH": 0.5521, + "MATH Level 5": 0.0869, + "GPQA": 0.2995, + "MUSR": 0.4639, + "MMLU-PRO": 0.359 + } + }, + { + "model_id": "nbeerbower/Mistral-Small-Drummer-22B", + "name": "Mistral-Small-Drummer-22B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.6331, + "BBH": 0.5793, + "MATH Level 5": 0.1888, + "GPQA": 0.3431, + "MUSR": 0.4064, + "MMLU-PRO": 0.4095 + } + }, + { + "model_id": "nbeerbower/Mistral-Small-Gutenberg-Doppel-22B", + "name": "Mistral-Small-Gutenberg-Doppel-22B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.4893, + "BBH": 0.5859, + "MATH Level 5": 0.2183, + "GPQA": 0.3465, + "MUSR": 0.3971, + "MMLU-PRO": 0.4124 + } + }, + { + "model_id": "nbeerbower/Nemo-Loony-12B-experimental", + "name": "Nemo-Loony-12B-experimental", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.3734, + "BBH": 0.3822, + "MATH Level 5": 0.0151, + "GPQA": 0.2701, + "MUSR": 0.3341, + "MMLU-PRO": 0.1589 + } + }, + { + "model_id": "nbeerbower/Nemoties-ChatML-12B", + "name": "Nemoties-ChatML-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.6382, + "BBH": 0.547, + "MATH Level 5": 0.0785, + "GPQA": 0.297, + "MUSR": 0.4509, + "MMLU-PRO": 0.3551 + } + }, + { + "model_id": "nbeerbower/Qwen2.5-Gutenberg-Doppel-14B", + "name": "Qwen2.5-Gutenberg-Doppel-14B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.8091, + "BBH": 0.6382, + "MATH Level 5": 0.5415, + "GPQA": 0.3331, + "MUSR": 0.4101, + "MMLU-PRO": 0.4921 + } + }, + { + "model_id": "nbeerbower/SmolNemo-12B-FFT-experimental", + "name": "SmolNemo-12B-FFT-experimental", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.3348, + "BBH": 0.3336, + "MATH Level 5": 0.0128, + "GPQA": 0.2601, + "MUSR": 0.3847, + "MMLU-PRO": 0.1217 + } + }, + { + "model_id": "nbeerbower/Stella-mistral-nemo-12B-v2", + "name": "Stella-mistral-nemo-12B-v2", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.3274, + "BBH": 0.5484, + "MATH Level 5": 0.1163, + "GPQA": 0.3322, + "MUSR": 0.4304, + "MMLU-PRO": 0.3684 + } + }, + { + "model_id": "nbeerbower/gemma2-gutenberg-27B", + "name": "gemma2-gutenberg-27B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.2947, + "BBH": 0.3797, + "MATH Level 5": 0.0189, + "GPQA": 0.2727, + "MUSR": 0.3727, + "MMLU-PRO": 0.1982 + } + }, + { + "model_id": "nbeerbower/gemma2-gutenberg-9B", + "name": "gemma2-gutenberg-9B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.2796, + "BBH": 0.5951, + "MATH Level 5": 0.0808, + "GPQA": 0.3381, + "MUSR": 0.4595, + "MMLU-PRO": 0.4192 + } + }, + { + "model_id": "nbeerbower/llama-3-gutenberg-8B", + "name": "llama-3-gutenberg-8B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.4372, + "BBH": 0.4994, + "MATH Level 5": 0.0785, + "GPQA": 0.3012, + "MUSR": 0.4073, + "MMLU-PRO": 0.3831 + } + }, + { + "model_id": "nbeerbower/llama3.1-cc-8B", + "name": "llama3.1-cc-8B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.5068, + "BBH": 0.4871, + "MATH Level 5": 0.071, + "GPQA": 0.2852, + "MUSR": 0.3885, + "MMLU-PRO": 0.3347 + } + }, + { + "model_id": "nbeerbower/llama3.1-kartoffeldes-70B", + "name": "llama3.1-kartoffeldes-70B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.823, + "BBH": 0.6894, + "MATH Level 5": 0.3218, + "GPQA": 0.3515, + "MUSR": 0.4646, + "MMLU-PRO": 0.4988 + } + }, + { + "model_id": "nbeerbower/mistral-nemo-bophades-12B", + "name": "mistral-nemo-bophades-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.6794, + "BBH": 0.4988, + "MATH Level 5": 0.1231, + "GPQA": 0.2852, + "MUSR": 0.4178, + "MMLU-PRO": 0.3501 + } + }, + { + "model_id": "nbeerbower/mistral-nemo-bophades3-12B", + "name": "mistral-nemo-bophades3-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.6578, + "BBH": 0.5449, + "MATH Level 5": 0.0846, + "GPQA": 0.3121, + "MUSR": 0.4604, + "MMLU-PRO": 0.3371 + } + }, + { + "model_id": "nbeerbower/mistral-nemo-cc-12B", + "name": "mistral-nemo-cc-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.1435, + "BBH": 0.5399, + "MATH Level 5": 0.0257, + "GPQA": 0.3154, + "MUSR": 0.4424, + "MMLU-PRO": 0.3598 + } + }, + { + "model_id": "nbeerbower/mistral-nemo-gutades-12B", + "name": "mistral-nemo-gutades-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.3425, + "BBH": 0.5407, + "MATH Level 5": 0.1178, + "GPQA": 0.3154, + "MUSR": 0.404, + "MMLU-PRO": 0.3561 + } + }, + { + "model_id": "nbeerbower/mistral-nemo-gutenberg-12B", + "name": "mistral-nemo-gutenberg-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.3504, + "BBH": 0.5281, + "MATH Level 5": 0.1163, + "GPQA": 0.307, + "MUSR": 0.4171, + "MMLU-PRO": 0.3562 + } + }, + { + "model_id": "nbeerbower/mistral-nemo-gutenberg-12B-v2", + "name": "mistral-nemo-gutenberg-12B-v2", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.6203, + "BBH": 0.5397, + "MATH Level 5": 0.1088, + "GPQA": 0.2777, + "MUSR": 0.4287, + "MMLU-PRO": 0.3499 + } + }, + { + "model_id": "nbeerbower/mistral-nemo-gutenberg-12B-v3", + "name": "mistral-nemo-gutenberg-12B-v3", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.2183, + "BBH": 0.5441, + "MATH Level 5": 0.0597, + "GPQA": 0.3146, + "MUSR": 0.445, + "MMLU-PRO": 0.3644 + } + }, + { + "model_id": "nbeerbower/mistral-nemo-gutenberg-12B-v4", + "name": "mistral-nemo-gutenberg-12B-v4", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.2379, + "BBH": 0.5269, + "MATH Level 5": 0.1261, + "GPQA": 0.3163, + "MUSR": 0.4104, + "MMLU-PRO": 0.3575 + } + }, + { + "model_id": "nbeerbower/mistral-nemo-gutenberg2-12B-test", + "name": "mistral-nemo-gutenberg2-12B-test", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.3385, + "BBH": 0.5255, + "MATH Level 5": 0.1163, + "GPQA": 0.3171, + "MUSR": 0.4157, + "MMLU-PRO": 0.3555 + } + }, + { + "model_id": "nbeerbower/mistral-nemo-kartoffel-12B", + "name": "mistral-nemo-kartoffel-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.7032, + "BBH": 0.5484, + "MATH Level 5": 0.0853, + "GPQA": 0.3045, + "MUSR": 0.4653, + "MMLU-PRO": 0.3585 + } + }, + { + "model_id": "nbeerbower/mistral-nemo-narwhal-12B", + "name": "mistral-nemo-narwhal-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.5549, + "BBH": 0.5057, + "MATH Level 5": 0.0582, + "GPQA": 0.271, + "MUSR": 0.3847, + "MMLU-PRO": 0.3483 + } + }, + { + "model_id": "nbeerbower/mistral-nemo-wissenschaft-12B", + "name": "mistral-nemo-wissenschaft-12B", + "developer": "nbeerbower", + "scores": { + "IFEval": 0.652, + "BBH": 0.504, + "MATH Level 5": 0.1216, + "GPQA": 0.2928, + "MUSR": 0.4178, + "MMLU-PRO": 0.3532 + } + }, + { + "model_id": "nbrahme/IndusQ", + "name": "IndusQ", + "developer": "nbrahme", + "scores": { + "IFEval": 0.244, + "BBH": 0.3062, + "MATH Level 5": 0.0008, + "GPQA": 0.2651, + "MUSR": 0.3366, + "MMLU-PRO": 0.112 + } + }, + { + "model_id": "necva/IE-cont-Llama3.1-8B", + "name": "IE-cont-Llama3.1-8B", + "developer": "necva", + "scores": { + "IFEval": 0.2049, + "BBH": 0.2912, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.3575, + "MMLU-PRO": 0.1167 + } + }, + { + "model_id": "necva/replica-IEPile", + "name": "replica-IEPile", + "developer": "necva", + "scores": { + "IFEval": 0.4678, + "BBH": 0.4779, + "MATH Level 5": 0.1239, + "GPQA": 0.3062, + "MUSR": 0.3998, + "MMLU-PRO": 0.3561 + } + }, + { + "model_id": "neopolita/jessi-v0.1-bf16-falcon3-7b-instruct", + "name": "jessi-v0.1-bf16-falcon3-7b-instruct", + "developer": "neopolita", + "scores": { + "IFEval": 0.7527, + "BBH": 0.5516, + "MATH Level 5": 0.3807, + "GPQA": 0.3029, + "MUSR": 0.4825, + "MMLU-PRO": 0.3924 + } + }, + { + "model_id": "neopolita/jessi-v0.1-falcon3-10b-instruct", + "name": "jessi-v0.1-falcon3-10b-instruct", + "developer": "neopolita", + "scores": { + "IFEval": 0.7552, + "BBH": 0.5953, + "MATH Level 5": 0.2002, + "GPQA": 0.3188, + "MUSR": 0.4279, + "MMLU-PRO": 0.4188 + } + }, + { + "model_id": "neopolita/jessi-v0.1-qwen2.5-7b-instruct", + "name": "jessi-v0.1-qwen2.5-7b-instruct", + "developer": "neopolita", + "scores": { + "IFEval": 0.7327, + "BBH": 0.5292, + "MATH Level 5": 0.4086, + "GPQA": 0.297, + "MUSR": 0.3914, + "MMLU-PRO": 0.4228 + } + }, + { + "model_id": "neopolita/jessi-v0.1-virtuoso-small", + "name": "jessi-v0.1-virtuoso-small", + "developer": "neopolita", + "scores": { + "IFEval": 0.7959, + "BBH": 0.6443, + "MATH Level 5": 0.3399, + "GPQA": 0.3305, + "MUSR": 0.4362, + "MMLU-PRO": 0.513 + } + }, + { + "model_id": "neopolita/jessi-v0.2-falcon3-10b-instruct", + "name": "jessi-v0.2-falcon3-10b-instruct", + "developer": "neopolita", + "scores": { + "IFEval": 0.7768, + "BBH": 0.6205, + "MATH Level 5": 0.2122, + "GPQA": 0.3289, + "MUSR": 0.4281, + "MMLU-PRO": 0.4354 + } + }, + { + "model_id": "neopolita/jessi-v0.2-falcon3-7b-instruct", + "name": "jessi-v0.2-falcon3-7b-instruct", + "developer": "neopolita", + "scores": { + "IFEval": 0.5771, + "BBH": 0.5363, + "MATH Level 5": 0.2538, + "GPQA": 0.3171, + "MUSR": 0.4479, + "MMLU-PRO": 0.3905 + } + }, + { + "model_id": "neopolita/jessi-v0.3-falcon3-7b-instruct", + "name": "jessi-v0.3-falcon3-7b-instruct", + "developer": "neopolita", + "scores": { + "IFEval": 0.7509, + "BBH": 0.5388, + "MATH Level 5": 0.1888, + "GPQA": 0.3196, + "MUSR": 0.4692, + "MMLU-PRO": 0.397 + } + }, + { + "model_id": "neopolita/jessi-v0.4-falcon3-7b-instruct", + "name": "jessi-v0.4-falcon3-7b-instruct", + "developer": "neopolita", + "scores": { + "IFEval": 0.7604, + "BBH": 0.5522, + "MATH Level 5": 0.3769, + "GPQA": 0.3029, + "MUSR": 0.4971, + "MMLU-PRO": 0.4004 + } + }, + { + "model_id": "neopolita/jessi-v0.5-falcon3-7b-instruct", + "name": "jessi-v0.5-falcon3-7b-instruct", + "developer": "neopolita", + "scores": { + "IFEval": 0.7412, + "BBH": 0.559, + "MATH Level 5": 0.3739, + "GPQA": 0.3112, + "MUSR": 0.4865, + "MMLU-PRO": 0.3966 + } + }, + { + "model_id": "neopolita/jessi-v0.6-falcon3-7b-instruct", + "name": "jessi-v0.6-falcon3-7b-instruct", + "developer": "neopolita", + "scores": { + "IFEval": 0.7402, + "BBH": 0.5509, + "MATH Level 5": 0.3565, + "GPQA": 0.3003, + "MUSR": 0.4904, + "MMLU-PRO": 0.3957 + } + }, + { + "model_id": "neopolita/loki-v0.1-virtuoso", + "name": "loki-v0.1-virtuoso", + "developer": "neopolita", + "scores": { + "IFEval": 0.7819, + "BBH": 0.6467, + "MATH Level 5": 0.3391, + "GPQA": 0.3507, + "MUSR": 0.4375, + "MMLU-PRO": 0.5129 + } + }, + { + "model_id": "netcat420/DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b", + "name": "DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b", + "developer": "netcat420", + "scores": { + "IFEval": 0.115, + "BBH": 0.2877, + "MATH Level 5": 0.0015, + "GPQA": 0.2643, + "MUSR": 0.3724, + "MMLU-PRO": 0.109 + } + }, + { + "model_id": "netcat420/DeepSeek-R1-MFANN-TIES-unretrained-7b", + "name": "DeepSeek-R1-MFANN-TIES-unretrained-7b", + "developer": "netcat420", + "scores": { + "IFEval": 0.2587, + "BBH": 0.3086, + "MATH Level 5": 0.0121, + "GPQA": 0.255, + "MUSR": 0.3527, + "MMLU-PRO": 0.1145 + } + }, + { + "model_id": "netcat420/Llama3.1-MFANN-8b", + "name": "Llama3.1-MFANN-8b", + "developer": "netcat420", + "scores": { + "IFEval": 0.297, + "BBH": 0.4281, + "MATH Level 5": 0.0295, + "GPQA": 0.2878, + "MUSR": 0.3379, + "MMLU-PRO": 0.2725 + } + }, + { + "model_id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V2", + "name": "MFANN-Llama3.1-Abliterated-SLERP-TIES-V2", + "developer": "netcat420", + "scores": { + "IFEval": 0.421, + "BBH": 0.4924, + "MATH Level 5": 0.0763, + "GPQA": 0.297, + "MUSR": 0.3728, + "MMLU-PRO": 0.3522 + } + }, + { + "model_id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V3", + "name": "MFANN-Llama3.1-Abliterated-SLERP-TIES-V3", + "developer": "netcat420", + "scores": { + "IFEval": 0.4238, + "BBH": 0.4914, + "MATH Level 5": 0.0755, + "GPQA": 0.297, + "MUSR": 0.3741, + "MMLU-PRO": 0.349 + } + }, + { + "model_id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-V4", + "name": "MFANN-Llama3.1-Abliterated-SLERP-V4", + "developer": "netcat420", + "scores": { + "IFEval": 0.4169, + "BBH": 0.4909, + "MATH Level 5": 0.068, + "GPQA": 0.3054, + "MUSR": 0.3821, + "MMLU-PRO": 0.3516 + } + }, + { + "model_id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-V5", + "name": "MFANN-Llama3.1-Abliterated-SLERP-V5", + "developer": "netcat420", + "scores": { + "IFEval": 0.4329, + "BBH": 0.4952, + "MATH Level 5": 0.0816, + "GPQA": 0.2936, + "MUSR": 0.3781, + "MMLU-PRO": 0.3445 + } + }, + { + "model_id": "netcat420/MFANN-Llama3.1-Abliterated-Slerp-TIES", + "name": "MFANN-Llama3.1-Abliterated-Slerp-TIES", + "developer": "netcat420", + "scores": { + "IFEval": 0.4293, + "BBH": 0.4968, + "MATH Level 5": 0.0665, + "GPQA": 0.2919, + "MUSR": 0.3687, + "MMLU-PRO": 0.3531 + } + }, + { + "model_id": "netcat420/MFANN-Llama3.1-Abliterated-Slerp-V3.2", + "name": "MFANN-Llama3.1-Abliterated-Slerp-V3.2", + "developer": "netcat420", + "scores": { + "IFEval": 0.4128, + "BBH": 0.4978, + "MATH Level 5": 0.0702, + "GPQA": 0.2878, + "MUSR": 0.3754, + "MMLU-PRO": 0.3527 + } + }, + { + "model_id": "netcat420/MFANN-SFT", + "name": "MFANN-SFT", + "developer": "netcat420", + "scores": { + "IFEval": 0.3682, + "BBH": 0.4852, + "MATH Level 5": 0.0597, + "GPQA": 0.3163, + "MUSR": 0.3725, + "MMLU-PRO": 0.3336 + } + }, + { + "model_id": "netcat420/MFANN-abliterated-phi2-merge-unretrained", + "name": "MFANN-abliterated-phi2-merge-unretrained", + "developer": "netcat420", + "scores": { + "IFEval": 0.3005, + "BBH": 0.4104, + "MATH Level 5": 0.0287, + "GPQA": 0.2609, + "MUSR": 0.3183, + "MMLU-PRO": 0.1478 + } + }, + { + "model_id": "netcat420/MFANN-llama3.1-Abliterated-SLERP", + "name": "MFANN-llama3.1-Abliterated-SLERP", + "developer": "netcat420", + "scores": { + "IFEval": 0.2591, + "BBH": 0.4574, + "MATH Level 5": 0.0483, + "GPQA": 0.2735, + "MUSR": 0.3809, + "MMLU-PRO": 0.2928 + } + }, + { + "model_id": "netcat420/MFANN-llama3.1-abliterated-SLERP-v3", + "name": "MFANN-llama3.1-abliterated-SLERP-v3", + "developer": "netcat420", + "scores": { + "IFEval": 0.3799, + "BBH": 0.4931, + "MATH Level 5": 0.0642, + "GPQA": 0.2911, + "MUSR": 0.366, + "MMLU-PRO": 0.3531 + } + }, + { + "model_id": "netcat420/MFANN-llama3.1-abliterated-SLERP-v3.1", + "name": "MFANN-llama3.1-abliterated-SLERP-v3.1", + "developer": "netcat420", + "scores": { + "IFEval": 0.4202, + "BBH": 0.4921, + "MATH Level 5": 0.0695, + "GPQA": 0.2928, + "MUSR": 0.3686, + "MMLU-PRO": 0.3543 + } + }, + { + "model_id": "netcat420/MFANN-llama3.1-abliterated-v2", + "name": "MFANN-llama3.1-abliterated-v2", + "developer": "netcat420", + "scores": { + "IFEval": 0.4429, + "BBH": 0.4941, + "MATH Level 5": 0.074, + "GPQA": 0.2928, + "MUSR": 0.3845, + "MMLU-PRO": 0.3491 + } + }, + { + "model_id": "netcat420/MFANN-phigments-slerp-V2", + "name": "MFANN-phigments-slerp-V2", + "developer": "netcat420", + "scores": { + "IFEval": 0.3232, + "BBH": 0.4827, + "MATH Level 5": 0.0317, + "GPQA": 0.2727, + "MUSR": 0.4037, + "MMLU-PRO": 0.2717 + } + }, + { + "model_id": "netcat420/MFANN-phigments-slerp-V3.2", + "name": "MFANN-phigments-slerp-V3.2", + "developer": "netcat420", + "scores": { + "IFEval": 0.3524, + "BBH": 0.4809, + "MATH Level 5": 0.0332, + "GPQA": 0.2836, + "MUSR": 0.3708, + "MMLU-PRO": 0.2705 + } + }, + { + "model_id": "netcat420/MFANN-phigments-slerp-V3.3", + "name": "MFANN-phigments-slerp-V3.3", + "developer": "netcat420", + "scores": { + "IFEval": 0.3691, + "BBH": 0.4895, + "MATH Level 5": 0.0332, + "GPQA": 0.2752, + "MUSR": 0.3892, + "MMLU-PRO": 0.2803 + } + }, + { + "model_id": "netcat420/MFANN3b", + "name": "MFANN3b", + "developer": "netcat420", + "scores": { + "IFEval": 0.2524, + "BBH": 0.4433, + "MATH Level 5": 0.0219, + "GPQA": 0.2919, + "MUSR": 0.3606, + "MMLU-PRO": 0.2306 + } + }, + { + "model_id": "netcat420/MFANN3bv0.15", + "name": "MFANN3bv0.15", + "developer": "netcat420", + "scores": { + "IFEval": 0.2012, + "BBH": 0.4539, + "MATH Level 5": 0.0264, + "GPQA": 0.2517, + "MUSR": 0.3958, + "MMLU-PRO": 0.2468 + } + }, + { + "model_id": "netcat420/MFANN3bv0.18", + "name": "MFANN3bv0.18", + "developer": "netcat420", + "scores": { + "IFEval": 0.2206, + "BBH": 0.4514, + "MATH Level 5": 0.0249, + "GPQA": 0.2576, + "MUSR": 0.4024, + "MMLU-PRO": 0.25 + } + }, + { + "model_id": "netcat420/MFANN3bv0.19", + "name": "MFANN3bv0.19", + "developer": "netcat420", + "scores": { + "IFEval": 0.2258, + "BBH": 0.4516, + "MATH Level 5": 0.0227, + "GPQA": 0.2576, + "MUSR": 0.4024, + "MMLU-PRO": 0.252 + } + }, + { + "model_id": "netcat420/MFANN3bv0.20", + "name": "MFANN3bv0.20", + "developer": "netcat420", + "scores": { + "IFEval": 0.2193, + "BBH": 0.4493, + "MATH Level 5": 0.0264, + "GPQA": 0.2592, + "MUSR": 0.4077, + "MMLU-PRO": 0.25 + } + }, + { + "model_id": "netcat420/MFANN3bv0.21", + "name": "MFANN3bv0.21", + "developer": "netcat420", + "scores": { + "IFEval": 0.1909, + "BBH": 0.447, + "MATH Level 5": 0.0317, + "GPQA": 0.2643, + "MUSR": 0.3759, + "MMLU-PRO": 0.2393 + } + }, + { + "model_id": "netcat420/MFANN3bv0.22", + "name": "MFANN3bv0.22", + "developer": "netcat420", + "scores": { + "IFEval": 0.1979, + "BBH": 0.4485, + "MATH Level 5": 0.0264, + "GPQA": 0.2617, + "MUSR": 0.3521, + "MMLU-PRO": 0.2517 + } + }, + { + "model_id": "netcat420/MFANN3bv0.23", + "name": "MFANN3bv0.23", + "developer": "netcat420", + "scores": { + "IFEval": 0.2048, + "BBH": 0.4495, + "MATH Level 5": 0.0249, + "GPQA": 0.2517, + "MUSR": 0.3427, + "MMLU-PRO": 0.2418 + } + }, + { + "model_id": "netcat420/MFANN3bv0.24", + "name": "MFANN3bv0.24", + "developer": "netcat420", + "scores": { + "IFEval": 0.22, + "BBH": 0.4407, + "MATH Level 5": 0.0279, + "GPQA": 0.2584, + "MUSR": 0.3521, + "MMLU-PRO": 0.2352 + } + }, + { + "model_id": "netcat420/MFANN3bv1.1", + "name": "MFANN3bv1.1", + "developer": "netcat420", + "scores": { + "IFEval": 0.2507, + "BBH": 0.3397, + "MATH Level 5": 0.0204, + "GPQA": 0.2668, + "MUSR": 0.3223, + "MMLU-PRO": 0.1159 + } + }, + { + "model_id": "netcat420/MFANN3bv1.2", + "name": "MFANN3bv1.2", + "developer": "netcat420", + "scores": { + "IFEval": 0.2686, + "BBH": 0.366, + "MATH Level 5": 0.0264, + "GPQA": 0.2634, + "MUSR": 0.3156, + "MMLU-PRO": 0.145 + } + }, + { + "model_id": "netcat420/MFANN3bv1.3", + "name": "MFANN3bv1.3", + "developer": "netcat420", + "scores": { + "IFEval": 0.2547, + "BBH": 0.4456, + "MATH Level 5": 0.0211, + "GPQA": 0.2576, + "MUSR": 0.3299, + "MMLU-PRO": 0.2276 + } + }, + { + "model_id": "netcat420/MFANN3bv1.4", + "name": "MFANN3bv1.4", + "developer": "netcat420", + "scores": { + "IFEval": 0.3524, + "BBH": 0.4809, + "MATH Level 5": 0.037, + "GPQA": 0.2827, + "MUSR": 0.3708, + "MMLU-PRO": 0.2705 + } + }, + { + "model_id": "netcat420/MFANNv0.19", + "name": "MFANNv0.19", + "developer": "netcat420", + "scores": { + "IFEval": 0.3057, + "BBH": 0.4731, + "MATH Level 5": 0.0415, + "GPQA": 0.307, + "MUSR": 0.3527, + "MMLU-PRO": 0.2473 + } + }, + { + "model_id": "netcat420/MFANNv0.20", + "name": "MFANNv0.20", + "developer": "netcat420", + "scores": { + "IFEval": 0.3479, + "BBH": 0.4574, + "MATH Level 5": 0.0498, + "GPQA": 0.2903, + "MUSR": 0.3874, + "MMLU-PRO": 0.3202 + } + }, + { + "model_id": "netcat420/MFANNv0.21", + "name": "MFANNv0.21", + "developer": "netcat420", + "scores": { + "IFEval": 0.3233, + "BBH": 0.4576, + "MATH Level 5": 0.0574, + "GPQA": 0.2785, + "MUSR": 0.3993, + "MMLU-PRO": 0.3031 + } + }, + { + "model_id": "netcat420/MFANNv0.22.1", + "name": "MFANNv0.22.1", + "developer": "netcat420", + "scores": { + "IFEval": 0.3089, + "BBH": 0.4661, + "MATH Level 5": 0.0536, + "GPQA": 0.276, + "MUSR": 0.3753, + "MMLU-PRO": 0.3343 + } + }, + { + "model_id": "netcat420/MFANNv0.23", + "name": "MFANNv0.23", + "developer": "netcat420", + "scores": { + "IFEval": 0.3127, + "BBH": 0.4898, + "MATH Level 5": 0.0498, + "GPQA": 0.2844, + "MUSR": 0.3768, + "MMLU-PRO": 0.3388 + } + }, + { + "model_id": "netcat420/MFANNv0.24", + "name": "MFANNv0.24", + "developer": "netcat420", + "scores": { + "IFEval": 0.3162, + "BBH": 0.479, + "MATH Level 5": 0.0612, + "GPQA": 0.2844, + "MUSR": 0.3754, + "MMLU-PRO": 0.3348 + } + }, + { + "model_id": "netcat420/MFANNv0.25", + "name": "MFANNv0.25", + "developer": "netcat420", + "scores": { + "IFEval": 0.3467, + "BBH": 0.4794, + "MATH Level 5": 0.0582, + "GPQA": 0.2802, + "MUSR": 0.3688, + "MMLU-PRO": 0.3343 + } + }, + { + "model_id": "netcat420/Qwen2.5-7B-nerd-uncensored-v0.9-MFANN", + "name": "Qwen2.5-7B-nerd-uncensored-v0.9-MFANN", + "developer": "netcat420", + "scores": { + "IFEval": 0.5878, + "BBH": 0.5237, + "MATH Level 5": 0.3376, + "GPQA": 0.281, + "MUSR": 0.3926, + "MMLU-PRO": 0.3904 + } + }, + { + "model_id": "netcat420/Qwen2.5-7b-MFANN-slerp", + "name": "Qwen2.5-7b-MFANN-slerp", + "developer": "netcat420", + "scores": { + "IFEval": 0.6532, + "BBH": 0.5089, + "MATH Level 5": 0.287, + "GPQA": 0.2953, + "MUSR": 0.4073, + "MMLU-PRO": 0.3417 + } + }, + { + "model_id": "netcat420/Qwen2.5-7b-nerd-uncensored-MFANN-slerp", + "name": "Qwen2.5-7b-nerd-uncensored-MFANN-slerp", + "developer": "netcat420", + "scores": { + "IFEval": 0.1564, + "BBH": 0.292, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.3792, + "MMLU-PRO": 0.11 + } + }, + { + "model_id": "netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN", + "name": "Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN", + "developer": "netcat420", + "scores": { + "IFEval": 0.5742, + "BBH": 0.5071, + "MATH Level 5": 0.2568, + "GPQA": 0.2928, + "MUSR": 0.4058, + "MMLU-PRO": 0.3157 + } + }, + { + "model_id": "netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained", + "name": "Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained", + "developer": "netcat420", + "scores": { + "IFEval": 0.6486, + "BBH": 0.5066, + "MATH Level 5": 0.2991, + "GPQA": 0.2987, + "MUSR": 0.4152, + "MMLU-PRO": 0.3432 + } + }, + { + "model_id": "netcat420/Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b", + "name": "Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b", + "developer": "netcat420", + "scores": { + "IFEval": 0.2676, + "BBH": 0.3789, + "MATH Level 5": 0.0181, + "GPQA": 0.2324, + "MUSR": 0.3528, + "MMLU-PRO": 0.1677 + } + }, + { + "model_id": "netcat420/Qwen2.5-MFANN-7b", + "name": "Qwen2.5-MFANN-7b", + "developer": "netcat420", + "scores": { + "IFEval": 0.6097, + "BBH": 0.5054, + "MATH Level 5": 0.2787, + "GPQA": 0.2861, + "MUSR": 0.4021, + "MMLU-PRO": 0.3233 + } + }, + { + "model_id": "netcat420/qwen2.5-MFANN-7b-SLERP-V1.2", + "name": "qwen2.5-MFANN-7b-SLERP-V1.2", + "developer": "netcat420", + "scores": { + "IFEval": 0.6606, + "BBH": 0.5111, + "MATH Level 5": 0.287, + "GPQA": 0.297, + "MUSR": 0.4259, + "MMLU-PRO": 0.3438 + } + }, + { + "model_id": "netcat420/qwen2.5-MFANN-7b-SLERPv1.1", + "name": "qwen2.5-MFANN-7b-SLERPv1.1", + "developer": "netcat420", + "scores": { + "IFEval": 0.6555, + "BBH": 0.5075, + "MATH Level 5": 0.2968, + "GPQA": 0.2903, + "MUSR": 0.4126, + "MMLU-PRO": 0.3448 + } + }, + { + "model_id": "netcat420/qwen2.5-MFANN-7b-v1.1", + "name": "qwen2.5-MFANN-7b-v1.1", + "developer": "netcat420", + "scores": { + "IFEval": 0.6088, + "BBH": 0.4967, + "MATH Level 5": 0.2825, + "GPQA": 0.276, + "MUSR": 0.4114, + "MMLU-PRO": 0.3248 + } + }, + { + "model_id": "netease-youdao/Confucius-o1-14B", + "name": "Confucius-o1-14B", + "developer": "netease-youdao", + "scores": { + "IFEval": 0.6378, + "BBH": 0.63, + "MATH Level 5": 0.4313, + "GPQA": 0.3649, + "MUSR": 0.4338, + "MMLU-PRO": 0.5265 + } + }, + { + "model_id": "newsbang/Homer-7B-v0.1", + "name": "Homer-7B-v0.1", + "developer": "newsbang", + "scores": { + "IFEval": 0.6109, + "BBH": 0.5601, + "MATH Level 5": 0.386, + "GPQA": 0.3247, + "MUSR": 0.4357, + "MMLU-PRO": 0.4475 + } + }, + { + "model_id": "newsbang/Homer-7B-v0.2", + "name": "Homer-7B-v0.2", + "developer": "newsbang", + "scores": { + "IFEval": 0.7494, + "BBH": 0.5517, + "MATH Level 5": 0.2477, + "GPQA": 0.3322, + "MUSR": 0.4298, + "MMLU-PRO": 0.441 + } + }, + { + "model_id": "newsbang/Homer-v0.3-Qwen2.5-7B", + "name": "Homer-v0.3-Qwen2.5-7B", + "developer": "newsbang", + "scores": { + "IFEval": 0.5154, + "BBH": 0.5481, + "MATH Level 5": 0.3089, + "GPQA": 0.3339, + "MUSR": 0.4744, + "MMLU-PRO": 0.4456 + } + }, + { + "model_id": "newsbang/Homer-v0.4-Qwen2.5-7B", + "name": "Homer-v0.4-Qwen2.5-7B", + "developer": "newsbang", + "scores": { + "IFEval": 0.7999, + "BBH": 0.5533, + "MATH Level 5": 0.2779, + "GPQA": 0.3154, + "MUSR": 0.4311, + "MMLU-PRO": 0.4363 + } + }, + { + "model_id": "newsbang/Homer-v0.5-Qwen2.5-7B", + "name": "Homer-v0.5-Qwen2.5-7B", + "developer": "newsbang", + "scores": { + "IFEval": 0.7881, + "BBH": 0.554, + "MATH Level 5": 0.3724, + "GPQA": 0.3029, + "MUSR": 0.4193, + "MMLU-PRO": 0.4369 + } + }, + { + "model_id": "newsbang/Homer-v1.0-Qwen2.5-72B", + "name": "Homer-v1.0-Qwen2.5-72B", + "developer": "newsbang", + "scores": { + "IFEval": 0.7628, + "BBH": 0.731, + "MATH Level 5": 0.4902, + "GPQA": 0.4161, + "MUSR": 0.4677, + "MMLU-PRO": 0.6145 + } + }, + { + "model_id": "newsbang/Homer-v1.0-Qwen2.5-7B", + "name": "Homer-v1.0-Qwen2.5-7B", + "developer": "newsbang", + "scores": { + "IFEval": 0.6393, + "BBH": 0.5655, + "MATH Level 5": 0.3323, + "GPQA": 0.3221, + "MUSR": 0.4278, + "MMLU-PRO": 0.4535 + } + }, + { + "model_id": "nguyentd/FinancialAdvice-Qwen2.5-7B", + "name": "FinancialAdvice-Qwen2.5-7B", + "developer": "nguyentd", + "scores": { + "IFEval": 0.4496, + "BBH": 0.4731, + "MATH Level 5": 0.1148, + "GPQA": 0.2945, + "MUSR": 0.4025, + "MMLU-PRO": 0.3752 + } + }, + { + "model_id": "ngxson/MiniThinky-1B-Llama-3.2", + "name": "MiniThinky-1B-Llama-3.2", + "developer": "ngxson", + "scores": { + "IFEval": 0.2771, + "BBH": 0.3142, + "MATH Level 5": 0.0574, + "GPQA": 0.2391, + "MUSR": 0.3434, + "MMLU-PRO": 0.1147 + } + }, + { + "model_id": "ngxson/MiniThinky-v2-1B-Llama-3.2", + "name": "MiniThinky-v2-1B-Llama-3.2", + "developer": "ngxson", + "scores": { + "IFEval": 0.2963, + "BBH": 0.3205, + "MATH Level 5": 0.0287, + "GPQA": 0.2399, + "MUSR": 0.3356, + "MMLU-PRO": 0.1116 + } + }, + { + "model_id": "nhyha/N3N_Delirium-v1_1030_0227", + "name": "N3N_Delirium-v1_1030_0227", + "developer": "nhyha", + "scores": { + "IFEval": 0.8023, + "BBH": 0.5891, + "MATH Level 5": 0.2107, + "GPQA": 0.3372, + "MUSR": 0.4098, + "MMLU-PRO": 0.415 + } + }, + { + "model_id": "nhyha/N3N_Llama-3.1-8B-Instruct_1028_0216", + "name": "N3N_Llama-3.1-8B-Instruct_1028_0216", + "developer": "nhyha", + "scores": { + "IFEval": 0.4796, + "BBH": 0.5054, + "MATH Level 5": 0.1707, + "GPQA": 0.3062, + "MUSR": 0.405, + "MMLU-PRO": 0.3638 + } + }, + { + "model_id": "nhyha/N3N_gemma-2-9b-it_20241029_1532", + "name": "N3N_gemma-2-9b-it_20241029_1532", + "developer": "nhyha", + "scores": { + "IFEval": 0.6752, + "BBH": 0.5863, + "MATH Level 5": 0.2122, + "GPQA": 0.3406, + "MUSR": 0.4594, + "MMLU-PRO": 0.4122 + } + }, + { + "model_id": "nhyha/N3N_gemma-2-9b-it_20241110_2026", + "name": "N3N_gemma-2-9b-it_20241110_2026", + "developer": "nhyha", + "scores": { + "IFEval": 0.6283, + "BBH": 0.5867, + "MATH Level 5": 0.1609, + "GPQA": 0.3364, + "MUSR": 0.4073, + "MMLU-PRO": 0.402 + } + }, + { + "model_id": "nhyha/merge_Qwen2.5-7B-Instruct_20241023_0314", + "name": "merge_Qwen2.5-7B-Instruct_20241023_0314", + "developer": "nhyha", + "scores": { + "IFEval": 0.5695, + "BBH": 0.5559, + "MATH Level 5": 0.3542, + "GPQA": 0.3213, + "MUSR": 0.4251, + "MMLU-PRO": 0.4542 + } + }, + { + "model_id": "nidum/Nidum-Limitless-Gemma-2B", + "name": "Nidum-Limitless-Gemma-2B", + "developer": "nidum", + "scores": { + "IFEval": 0.2424, + "BBH": 0.3079, + "MATH Level 5": 0.0136, + "GPQA": 0.2643, + "MUSR": 0.374, + "MMLU-PRO": 0.1174 + } + }, + { + "model_id": "nisten/franqwenstein-35b", + "name": "franqwenstein-35b", + "developer": "nisten", + "scores": { + "IFEval": 0.3799, + "BBH": 0.6647, + "MATH Level 5": 0.3406, + "GPQA": 0.4035, + "MUSR": 0.494, + "MMLU-PRO": 0.5731 + } + }, + { + "model_id": "nisten/tqwendo-36b", + "name": "tqwendo-36b", + "developer": "nisten", + "scores": { + "IFEval": 0.6778, + "BBH": 0.6432, + "MATH Level 5": 0.4154, + "GPQA": 0.3314, + "MUSR": 0.443, + "MMLU-PRO": 0.4381 + } + }, + { + "model_id": "nlpguy/Lion-Lamarck-v.1.0.8", + "name": "Lion-Lamarck-v.1.0.8", + "developer": "nlpguy", + "scores": { + "IFEval": 0.4509, + "BBH": 0.5869, + "MATH Level 5": 0.5544, + "GPQA": 0.3582, + "MUSR": 0.4673, + "MMLU-PRO": 0.4643 + } + }, + { + "model_id": "nlpguy/Lion-Lamarck-v.1.0.9", + "name": "Lion-Lamarck-v.1.0.9", + "developer": "nlpguy", + "scores": { + "IFEval": 0.3409, + "BBH": 0.5918, + "MATH Level 5": 0.5642, + "GPQA": 0.3901, + "MUSR": 0.53, + "MMLU-PRO": 0.4704 + } + }, + { + "model_id": "nlpguy/Lion-Lamarck-v.1.1.0", + "name": "Lion-Lamarck-v.1.1.0", + "developer": "nlpguy", + "scores": { + "IFEval": 0.3658, + "BBH": 0.5962, + "MATH Level 5": 0.5755, + "GPQA": 0.3926, + "MUSR": 0.5325, + "MMLU-PRO": 0.4631 + } + }, + { + "model_id": "nlpguy/Miisce-one", + "name": "Miisce-one", + "developer": "nlpguy", + "scores": { + "IFEval": 0.6066, + "BBH": 0.6505, + "MATH Level 5": 0.4169, + "GPQA": 0.3859, + "MUSR": 0.482, + "MMLU-PRO": 0.5412 + } + }, + { + "model_id": "nlpguy/Mistral-NeMo-Minitron-Upscale-v1", + "name": "Mistral-NeMo-Minitron-Upscale-v1", + "developer": "nlpguy", + "scores": { + "IFEval": 0.1648, + "BBH": 0.4468, + "MATH Level 5": 0.0144, + "GPQA": 0.2802, + "MUSR": 0.3804, + "MMLU-PRO": 0.2537 + } + }, + { + "model_id": "nlpguy/Mistral-NeMo-Minitron-Upscale-v2", + "name": "Mistral-NeMo-Minitron-Upscale-v2", + "developer": "nlpguy", + "scores": { + "IFEval": 0.1573, + "BBH": 0.395, + "MATH Level 5": 0.0128, + "GPQA": 0.2735, + "MUSR": 0.3791, + "MMLU-PRO": 0.1927 + } + }, + { + "model_id": "nlpguy/Mistral-NeMo-Minitron-Upscale-v3", + "name": "Mistral-NeMo-Minitron-Upscale-v3", + "developer": "nlpguy", + "scores": { + "IFEval": 0.1412, + "BBH": 0.3052, + "MATH Level 5": 0.0113, + "GPQA": 0.2592, + "MUSR": 0.4098, + "MMLU-PRO": 0.1171 + } + }, + { + "model_id": "nlpguy/StableProse", + "name": "StableProse", + "developer": "nlpguy", + "scores": { + "IFEval": 0.1972, + "BBH": 0.5117, + "MATH Level 5": 0.065, + "GPQA": 0.3029, + "MUSR": 0.4067, + "MMLU-PRO": 0.3468 + } + }, + { + "model_id": "nlpguy/StarFusion-alpha1", + "name": "StarFusion-alpha1", + "developer": "nlpguy", + "scores": { + "IFEval": 0.566, + "BBH": 0.4429, + "MATH Level 5": 0.0718, + "GPQA": 0.2953, + "MUSR": 0.4081, + "MMLU-PRO": 0.3191 + } + }, + { + "model_id": "noname0202/Llama-3.2-4x3B-Instruct", + "name": "Llama-3.2-4x3B-Instruct", + "developer": "noname0202", + "scores": { + "IFEval": 0.7067, + "BBH": 0.4647, + "MATH Level 5": 0.1586, + "GPQA": 0.2727, + "MUSR": 0.3674, + "MMLU-PRO": 0.3285 + } + }, + { + "model_id": "noname0202/gemma-2-2b-it-ties", + "name": "gemma-2-2b-it-ties", + "developer": "noname0202", + "scores": { + "IFEval": 0.1266, + "BBH": 0.4206, + "MATH Level 5": 0.0242, + "GPQA": 0.2701, + "MUSR": 0.3929, + "MMLU-PRO": 0.2561 + } + }, + { + "model_id": "noname0202/gemma-2-9b-sft-jp-en-zh-v1", + "name": "gemma-2-9b-sft-jp-en-zh-v1", + "developer": "noname0202", + "scores": { + "IFEval": 0.2988, + "BBH": 0.4519, + "MATH Level 5": 0.0891, + "GPQA": 0.307, + "MUSR": 0.408, + "MMLU-PRO": 0.3125 + } + }, + { + "model_id": "noname0202/gemma-2-9b-sft-jp-en-zh-v2", + "name": "gemma-2-9b-sft-jp-en-zh-v2", + "developer": "noname0202", + "scores": { + "IFEval": 0.3993, + "BBH": 0.4515, + "MATH Level 5": 0.1042, + "GPQA": 0.2878, + "MUSR": 0.3612, + "MMLU-PRO": 0.3675 + } + }, + { + "model_id": "noname0202/llama-math-1b-r16-0to512tokens-test", + "name": "llama-math-1b-r16-0to512tokens-test", + "developer": "noname0202", + "scores": { + "IFEval": 0.547, + "BBH": 0.3488, + "MATH Level 5": 0.0816, + "GPQA": 0.2668, + "MUSR": 0.3143, + "MMLU-PRO": 0.1728 + } + }, + { + "model_id": "noname0202/llama-math-1b-r32-0to512tokens-test", + "name": "llama-math-1b-r32-0to512tokens-test", + "developer": "noname0202", + "scores": { + "IFEval": 0.5683, + "BBH": 0.3495, + "MATH Level 5": 0.0906, + "GPQA": 0.2651, + "MUSR": 0.3209, + "MMLU-PRO": 0.176 + } + }, + { + "model_id": "noname0202/llama-math-1b-r32-test", + "name": "llama-math-1b-r32-test", + "developer": "noname0202", + "scores": { + "IFEval": 0.5819, + "BBH": 0.3486, + "MATH Level 5": 0.0725, + "GPQA": 0.2617, + "MUSR": 0.3156, + "MMLU-PRO": 0.1781 + } + }, + { + "model_id": "noname0202/llama-math-1b-r8-512tokens-test", + "name": "llama-math-1b-r8-512tokens-test", + "developer": "noname0202", + "scores": { + "IFEval": 0.5792, + "BBH": 0.3496, + "MATH Level 5": 0.0816, + "GPQA": 0.2685, + "MUSR": 0.3169, + "MMLU-PRO": 0.1753 + } + }, + { + "model_id": "notbdq/Qwen2.5-14B-Instruct-1M-GRPO-Reasoning", + "name": "Qwen2.5-14B-Instruct-1M-GRPO-Reasoning", + "developer": "notbdq", + "scores": { + "IFEval": 0.8414, + "BBH": 0.6198, + "MATH Level 5": 0.5302, + "GPQA": 0.3431, + "MUSR": 0.418, + "MMLU-PRO": 0.485 + } + }, + { + "model_id": "nothingiisreal/L3.1-8B-Celeste-V1.5", + "name": "L3.1-8B-Celeste-V1.5", + "developer": "nothingiisreal", + "scores": { + "IFEval": 0.7327, + "BBH": 0.5012, + "MATH Level 5": 0.1465, + "GPQA": 0.2844, + "MUSR": 0.3749, + "MMLU-PRO": 0.3704 + } + }, + { + "model_id": "nothingiisreal/MN-12B-Starcannon-v2", + "name": "MN-12B-Starcannon-v2", + "developer": "nothingiisreal", + "scores": { + "IFEval": 0.3925, + "BBH": 0.5004, + "MATH Level 5": 0.0597, + "GPQA": 0.2785, + "MUSR": 0.3978, + "MMLU-PRO": 0.3128 + } + }, + { + "model_id": "nothingiisreal/MN-12B-Starcannon-v3", + "name": "MN-12B-Starcannon-v3", + "developer": "nothingiisreal", + "scores": { + "IFEval": 0.3807, + "BBH": 0.5171, + "MATH Level 5": 0.0778, + "GPQA": 0.2735, + "MUSR": 0.4046, + "MMLU-PRO": 0.3265 + } + }, + { + "model_id": "nvidia/AceInstruct-1.5B", + "name": "AceInstruct-1.5B", + "developer": "nvidia", + "scores": { + "IFEval": 0.3948, + "BBH": 0.3932, + "MATH Level 5": 0.3127, + "GPQA": 0.2718, + "MUSR": 0.346, + "MMLU-PRO": 0.2574 + } + }, + { + "model_id": "nvidia/AceInstruct-72B", + "name": "AceInstruct-72B", + "developer": "nvidia", + "scores": { + "IFEval": 0.7119, + "BBH": 0.6139, + "MATH Level 5": 0.6261, + "GPQA": 0.3213, + "MUSR": 0.4206, + "MMLU-PRO": 0.4874 + } + }, + { + "model_id": "nvidia/AceInstruct-7B", + "name": "AceInstruct-7B", + "developer": "nvidia", + "scores": { + "IFEval": 0.5422, + "BBH": 0.5501, + "MATH Level 5": 0.5295, + "GPQA": 0.307, + "MUSR": 0.4255, + "MMLU-PRO": 0.4177 + } + }, + { + "model_id": "nvidia/AceMath-1.5B-Instruct", + "name": "AceMath-1.5B-Instruct", + "developer": "nvidia", + "scores": { + "IFEval": 0.3212, + "BBH": 0.4024, + "MATH Level 5": 0.5287, + "GPQA": 0.2743, + "MUSR": 0.3607, + "MMLU-PRO": 0.2064 + } + }, + { + "model_id": "nvidia/AceMath-72B-Instruct", + "name": "AceMath-72B-Instruct", + "developer": "nvidia", + "scores": { + "IFEval": 0.495, + "BBH": 0.6402, + "MATH Level 5": 0.7145, + "GPQA": 0.271, + "MUSR": 0.4062, + "MMLU-PRO": 0.4411 + } + }, + { + "model_id": "nvidia/AceMath-72B-RM", + "name": "AceMath-72B-RM", + "developer": "nvidia", + "scores": { + "IFEval": 0.1413, + "BBH": 0.2717, + "MATH Level 5": 0.0, + "GPQA": 0.2341, + "MUSR": 0.3351, + "MMLU-PRO": 0.1179 + } + }, + { + "model_id": "nvidia/AceMath-7B-Instruct", + "name": "AceMath-7B-Instruct", + "developer": "nvidia", + "scores": { + "IFEval": 0.4532, + "BBH": 0.4994, + "MATH Level 5": 0.6337, + "GPQA": 0.2919, + "MUSR": 0.4193, + "MMLU-PRO": 0.3383 + } + }, + { + "model_id": "nvidia/AceMath-7B-RM", + "name": "AceMath-7B-RM", + "developer": "nvidia", + "scores": { + "IFEval": 0.1494, + "BBH": 0.2423, + "MATH Level 5": 0.0, + "GPQA": 0.2458, + "MUSR": 0.358, + "MMLU-PRO": 0.1139 + } + }, + { + "model_id": "nvidia/Hymba-1.5B-Base", + "name": "Hymba-1.5B-Base", + "developer": "nvidia", + "scores": { + "IFEval": 0.2295, + "BBH": 0.3256, + "MATH Level 5": 0.0136, + "GPQA": 0.2559, + "MUSR": 0.3566, + "MMLU-PRO": 0.1922 + } + }, + { + "model_id": "nvidia/Hymba-1.5B-Instruct", + "name": "Hymba-1.5B-Instruct", + "developer": "nvidia", + "scores": { + "IFEval": 0.6009, + "BBH": 0.3067, + "MATH Level 5": 0.0272, + "GPQA": 0.2886, + "MUSR": 0.3316, + "MMLU-PRO": 0.204 + } + }, + { + "model_id": "nvidia/Llama-3.1-Minitron-4B-Depth-Base", + "name": "Llama-3.1-Minitron-4B-Depth-Base", + "developer": "nvidia", + "scores": { + "IFEval": 0.1607, + "BBH": 0.4171, + "MATH Level 5": 0.0196, + "GPQA": 0.2634, + "MUSR": 0.4011, + "MMLU-PRO": 0.2798 + } + }, + { + "model_id": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", + "name": "Llama-3.1-Nemotron-70B-Instruct-HF", + "developer": "nvidia", + "scores": { + "IFEval": 0.7381, + "BBH": 0.6316, + "MATH Level 5": 0.4267, + "GPQA": 0.2584, + "MUSR": 0.4328, + "MMLU-PRO": 0.4919 + } + }, + { + "model_id": "nvidia/Minitron-4B-Base", + "name": "Minitron-4B-Base", + "developer": "nvidia", + "scores": { + "IFEval": 0.2218, + "BBH": 0.4084, + "MATH Level 5": 0.0196, + "GPQA": 0.2693, + "MUSR": 0.4134, + "MMLU-PRO": 0.262 + } + }, + { + "model_id": "nvidia/Minitron-8B-Base", + "name": "Minitron-8B-Base", + "developer": "nvidia", + "scores": { + "IFEval": 0.2424, + "BBH": 0.4395, + "MATH Level 5": 0.0257, + "GPQA": 0.2735, + "MUSR": 0.4026, + "MMLU-PRO": 0.3181 + } + }, + { + "model_id": "nvidia/Mistral-NeMo-Minitron-8B-Base", + "name": "Mistral-NeMo-Minitron-8B-Base", + "developer": "nvidia", + "scores": { + "IFEval": 0.1946, + "BBH": 0.5219, + "MATH Level 5": 0.0483, + "GPQA": 0.3255, + "MUSR": 0.4092, + "MMLU-PRO": 0.3796 + } + }, + { + "model_id": "nvidia/Mistral-NeMo-Minitron-8B-Instruct", + "name": "Mistral-NeMo-Minitron-8B-Instruct", + "developer": "nvidia", + "scores": { + "IFEval": 0.5004, + "BBH": 0.5321, + "MATH Level 5": 0.1163, + "GPQA": 0.2878, + "MUSR": 0.3886, + "MMLU-PRO": 0.3991 + } + }, + { + "model_id": "nvidia/Nemotron-Mini-4B-Instruct", + "name": "Nemotron-Mini-4B-Instruct", + "developer": "nvidia", + "scores": { + "IFEval": 0.6669, + "BBH": 0.3865, + "MATH Level 5": 0.0257, + "GPQA": 0.2802, + "MUSR": 0.3767, + "MMLU-PRO": 0.2626 + } + }, + { + "model_id": "nvidia/OpenMath2-Llama3.1-8B", + "name": "OpenMath2-Llama3.1-8B", + "developer": "nvidia", + "scores": { + "IFEval": 0.2331, + "BBH": 0.4096, + "MATH Level 5": 0.2674, + "GPQA": 0.2651, + "MUSR": 0.3436, + "MMLU-PRO": 0.1553 + } + }, + { + "model_id": "nxmwxm/Beast-Soul-new", + "name": "Beast-Soul-new", + "developer": "nxmwxm", + "scores": { + "IFEval": 0.4869, + "BBH": 0.5227, + "MATH Level 5": 0.074, + "GPQA": 0.2819, + "MUSR": 0.4459, + "MMLU-PRO": 0.3102 + } + }, + { + "model_id": "occiglot/occiglot-7b-es-en-instruct", + "name": "occiglot-7b-es-en-instruct", + "developer": "occiglot", + "scores": { + "IFEval": 0.3485, + "BBH": 0.4111, + "MATH Level 5": 0.0242, + "GPQA": 0.2592, + "MUSR": 0.3738, + "MMLU-PRO": 0.2311 + } + }, + { + "model_id": "odyssey-labs/Astral-1-10B", + "name": "Astral-1-10B", + "developer": "odyssey-labs", + "scores": { + "IFEval": 0.3878, + "BBH": 0.4873, + "MATH Level 5": 0.0347, + "GPQA": 0.3054, + "MUSR": 0.428, + "MMLU-PRO": 0.2985 + } + }, + { + "model_id": "olabs-ai/reflection_model", + "name": "reflection_model", + "developer": "olabs-ai", + "scores": { + "IFEval": 0.1599, + "BBH": 0.4713, + "MATH Level 5": 0.0514, + "GPQA": 0.3003, + "MUSR": 0.3508, + "MMLU-PRO": 0.3311 + } + }, + { + "model_id": "ontocord/Llama_3.2_1b-autoredteam_helpfulness-train", + "name": "Llama_3.2_1b-autoredteam_helpfulness-train", + "developer": "ontocord", + "scores": { + "IFEval": 0.2765, + "BBH": 0.3115, + "MATH Level 5": 0.0166, + "GPQA": 0.2592, + "MUSR": 0.3459, + "MMLU-PRO": 0.1132 + } + }, + { + "model_id": "ontocord/RedPajama-3B-v1-AutoRedteam", + "name": "RedPajama-3B-v1-AutoRedteam", + "developer": "ontocord", + "scores": { + "IFEval": 0.1343, + "BBH": 0.3026, + "MATH Level 5": 0.0091, + "GPQA": 0.2424, + "MUSR": 0.3661, + "MMLU-PRO": 0.1108 + } + }, + { + "model_id": "ontocord/RedPajama-3B-v1-AutoRedteam-Harmless-only", + "name": "RedPajama-3B-v1-AutoRedteam-Harmless-only", + "developer": "ontocord", + "scores": { + "IFEval": 0.1525, + "BBH": 0.3124, + "MATH Level 5": 0.006, + "GPQA": 0.2315, + "MUSR": 0.3661, + "MMLU-PRO": 0.11 + } + }, + { + "model_id": "ontocord/RedPajama3b_v1-autoredteam_helpfulness-train", + "name": "RedPajama3b_v1-autoredteam_helpfulness-train", + "developer": "ontocord", + "scores": { + "IFEval": 0.2848, + "BBH": 0.3093, + "MATH Level 5": 0.0068, + "GPQA": 0.2458, + "MUSR": 0.358, + "MMLU-PRO": 0.1107 + } + }, + { + "model_id": "ontocord/merged_0.2_expert_0.8", + "name": "merged_0.2_expert_0.8", + "developer": "ontocord", + "scores": { + "IFEval": 0.1743, + "BBH": 0.3046, + "MATH Level 5": 0.0264, + "GPQA": 0.2617, + "MUSR": 0.3621, + "MMLU-PRO": 0.1111 + } + }, + { + "model_id": "ontocord/merged_0.2_expert_0.8-stack_2x", + "name": "merged_0.2_expert_0.8-stack_2x", + "developer": "ontocord", + "scores": { + "IFEval": 0.1796, + "BBH": 0.3006, + "MATH Level 5": 0.0249, + "GPQA": 0.2626, + "MUSR": 0.3541, + "MMLU-PRO": 0.1103 + } + }, + { + "model_id": "ontocord/merged_0.5_expert_0.5", + "name": "merged_0.5_expert_0.5", + "developer": "ontocord", + "scores": { + "IFEval": 0.1787, + "BBH": 0.3017, + "MATH Level 5": 0.0196, + "GPQA": 0.2643, + "MUSR": 0.3542, + "MMLU-PRO": 0.1108 + } + }, + { + "model_id": "ontocord/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful", + "name": "ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful", + "developer": "ontocord", + "scores": { + "IFEval": 0.1318, + "BBH": 0.3004, + "MATH Level 5": 0.0106, + "GPQA": 0.2676, + "MUSR": 0.3631, + "MMLU-PRO": 0.1142 + } + }, + { + "model_id": "ontocord/ontocord_wide_7b-stacked-stage1", + "name": "ontocord_wide_7b-stacked-stage1", + "developer": "ontocord", + "scores": { + "IFEval": 0.1485, + "BBH": 0.2897, + "MATH Level 5": 0.0091, + "GPQA": 0.2534, + "MUSR": 0.3604, + "MMLU-PRO": 0.1105 + } + }, + { + "model_id": "ontocord/ontocord_wide_7b-stacked-stage1-instruct", + "name": "ontocord_wide_7b-stacked-stage1-instruct", + "developer": "ontocord", + "scores": { + "IFEval": 0.153, + "BBH": 0.2854, + "MATH Level 5": 0.0068, + "GPQA": 0.2466, + "MUSR": 0.3538, + "MMLU-PRO": 0.1117 + } + }, + { + "model_id": "ontocord/starcoder2-29b-ls", + "name": "starcoder2-29b-ls", + "developer": "ontocord", + "scores": { + "IFEval": 0.2149, + "BBH": 0.3735, + "MATH Level 5": 0.0189, + "GPQA": 0.2735, + "MUSR": 0.37, + "MMLU-PRO": 0.1869 + } + }, + { + "model_id": "ontocord/starcoder2_3b-AutoRedteam", + "name": "starcoder2_3b-AutoRedteam", + "developer": "ontocord", + "scores": { + "IFEval": 0.1574, + "BBH": 0.3498, + "MATH Level 5": 0.0106, + "GPQA": 0.2517, + "MUSR": 0.3646, + "MMLU-PRO": 0.1336 + } + }, + { + "model_id": "ontocord/wide_3b-merge_test", + "name": "wide_3b-merge_test", + "developer": "ontocord", + "scores": { + "IFEval": 0.1763, + "BBH": 0.3011, + "MATH Level 5": 0.0, + "GPQA": 0.2399, + "MUSR": 0.342, + "MMLU-PRO": 0.1066 + } + }, + { + "model_id": "ontocord/wide_3b-stage1_shuf_sample1_jsonl-pretrained", + "name": "wide_3b-stage1_shuf_sample1_jsonl-pretrained", + "developer": "ontocord", + "scores": { + "IFEval": 0.1395, + "BBH": 0.3004, + "MATH Level 5": 0.0166, + "GPQA": 0.2659, + "MUSR": 0.3632, + "MMLU-PRO": 0.114 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge", + "name": "wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge", + "developer": "ontocord", + "scores": { + "IFEval": 0.1664, + "BBH": 0.3031, + "MATH Level 5": 0.0113, + "GPQA": 0.2601, + "MUSR": 0.3845, + "MMLU-PRO": 0.1111 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge", + "name": "wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge", + "developer": "ontocord", + "scores": { + "IFEval": 0.1697, + "BBH": 0.2975, + "MATH Level 5": 0.0136, + "GPQA": 0.2601, + "MUSR": 0.3778, + "MMLU-PRO": 0.1125 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue", + "developer": "ontocord", + "scores": { + "IFEval": 0.148, + "BBH": 0.3095, + "MATH Level 5": 0.0204, + "GPQA": 0.2701, + "MUSR": 0.3579, + "MMLU-PRO": 0.1108 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue", + "developer": "ontocord", + "scores": { + "IFEval": 0.1237, + "BBH": 0.306, + "MATH Level 5": 0.0106, + "GPQA": 0.2743, + "MUSR": 0.3673, + "MMLU-PRO": 0.1111 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue", + "developer": "ontocord", + "scores": { + "IFEval": 0.1192, + "BBH": 0.2956, + "MATH Level 5": 0.0068, + "GPQA": 0.2643, + "MUSR": 0.3553, + "MMLU-PRO": 0.1183 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue", + "developer": "ontocord", + "scores": { + "IFEval": 0.1128, + "BBH": 0.3171, + "MATH Level 5": 0.0113, + "GPQA": 0.2685, + "MUSR": 0.346, + "MMLU-PRO": 0.1129 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue", + "developer": "ontocord", + "scores": { + "IFEval": 0.1317, + "BBH": 0.3064, + "MATH Level 5": 0.0091, + "GPQA": 0.2651, + "MUSR": 0.3446, + "MMLU-PRO": 0.1144 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue", + "developer": "ontocord", + "scores": { + "IFEval": 0.1182, + "BBH": 0.3037, + "MATH Level 5": 0.0083, + "GPQA": 0.2659, + "MUSR": 0.3567, + "MMLU-PRO": 0.1162 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue", + "developer": "ontocord", + "scores": { + "IFEval": 0.124, + "BBH": 0.3032, + "MATH Level 5": 0.0076, + "GPQA": 0.2584, + "MUSR": 0.3487, + "MMLU-PRO": 0.1128 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.1-ss1-with_math.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_math.no_issue", + "developer": "ontocord", + "scores": { + "IFEval": 0.1298, + "BBH": 0.3052, + "MATH Level 5": 0.0159, + "GPQA": 0.2601, + "MUSR": 0.3928, + "MMLU-PRO": 0.1147 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue", + "developer": "ontocord", + "scores": { + "IFEval": 0.2049, + "BBH": 0.2912, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.3575, + "MMLU-PRO": 0.1167 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical", + "name": "wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical", + "developer": "ontocord", + "scores": { + "IFEval": 0.1461, + "BBH": 0.2998, + "MATH Level 5": 0.0136, + "GPQA": 0.2643, + "MUSR": 0.3926, + "MMLU-PRO": 0.1141 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_formatted_text", + "name": "wide_3b_sft_stage1.2-ss1-expert_formatted_text", + "developer": "ontocord", + "scores": { + "IFEval": 0.1487, + "BBH": 0.3069, + "MATH Level 5": 0.0121, + "GPQA": 0.2617, + "MUSR": 0.3474, + "MMLU-PRO": 0.1146 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_how-to", + "name": "wide_3b_sft_stage1.2-ss1-expert_how-to", + "developer": "ontocord", + "scores": { + "IFEval": 0.1245, + "BBH": 0.3047, + "MATH Level 5": 0.0144, + "GPQA": 0.2592, + "MUSR": 0.3658, + "MMLU-PRO": 0.1153 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_math", + "name": "wide_3b_sft_stage1.2-ss1-expert_math", + "developer": "ontocord", + "scores": { + "IFEval": 0.1915, + "BBH": 0.306, + "MATH Level 5": 0.0279, + "GPQA": 0.2592, + "MUSR": 0.37, + "MMLU-PRO": 0.1092 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_news", + "name": "wide_3b_sft_stage1.2-ss1-expert_news", + "developer": "ontocord", + "scores": { + "IFEval": 0.1658, + "BBH": 0.2926, + "MATH Level 5": 0.0166, + "GPQA": 0.2676, + "MUSR": 0.3621, + "MMLU-PRO": 0.1111 + } + }, + { + "model_id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_software", + "name": "wide_3b_sft_stage1.2-ss1-expert_software", + "developer": "ontocord", + "scores": { + "IFEval": 0.1734, + "BBH": 0.298, + "MATH Level 5": 0.0159, + "GPQA": 0.2584, + "MUSR": 0.3569, + "MMLU-PRO": 0.114 + } + }, + { + "model_id": "ontocord/wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked", + "name": "wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked", + "developer": "ontocord", + "scores": { + "IFEval": 0.1244, + "BBH": 0.3026, + "MATH Level 5": 0.0144, + "GPQA": 0.2659, + "MUSR": 0.3686, + "MMLU-PRO": 0.1115 + } + }, + { + "model_id": "oobabooga/CodeBooga-34B-v0.1", + "name": "CodeBooga-34B-v0.1", + "developer": "oobabooga", + "scores": { + "IFEval": 0.525, + "BBH": 0.3427, + "MATH Level 5": 0.0393, + "GPQA": 0.2567, + "MUSR": 0.431, + "MMLU-PRO": 0.236 + } + }, + { + "model_id": "oopere/Llama-FinSent-S", + "name": "Llama-FinSent-S", + "developer": "oopere", + "scores": { + "IFEval": 0.2119, + "BBH": 0.3156, + "MATH Level 5": 0.0181, + "GPQA": 0.2567, + "MUSR": 0.3832, + "MMLU-PRO": 0.113 + } + }, + { + "model_id": "oopere/pruned10-llama-3.2-3B", + "name": "pruned10-llama-3.2-3B", + "developer": "oopere", + "scores": { + "IFEval": 0.1776, + "BBH": 0.334, + "MATH Level 5": 0.0196, + "GPQA": 0.2668, + "MUSR": 0.3722, + "MMLU-PRO": 0.164 + } + }, + { + "model_id": "oopere/pruned20-llama-1b", + "name": "pruned20-llama-1b", + "developer": "oopere", + "scores": { + "IFEval": 0.1994, + "BBH": 0.3031, + "MATH Level 5": 0.0106, + "GPQA": 0.25, + "MUSR": 0.3631, + "MMLU-PRO": 0.1123 + } + }, + { + "model_id": "oopere/pruned20-llama-3.2-3b", + "name": "pruned20-llama-3.2-3b", + "developer": "oopere", + "scores": { + "IFEval": 0.1789, + "BBH": 0.3248, + "MATH Level 5": 0.0159, + "GPQA": 0.2659, + "MUSR": 0.3418, + "MMLU-PRO": 0.128 + } + }, + { + "model_id": "oopere/pruned40-llama-1b", + "name": "pruned40-llama-1b", + "developer": "oopere", + "scores": { + "IFEval": 0.2284, + "BBH": 0.2969, + "MATH Level 5": 0.0076, + "GPQA": 0.2433, + "MUSR": 0.4287, + "MMLU-PRO": 0.1082 + } + }, + { + "model_id": "oopere/pruned40-llama-3.2-1B", + "name": "pruned40-llama-3.2-1B", + "developer": "oopere", + "scores": { + "IFEval": 0.2266, + "BBH": 0.2982, + "MATH Level 5": 0.0083, + "GPQA": 0.2542, + "MUSR": 0.4352, + "MMLU-PRO": 0.1115 + } + }, + { + "model_id": "oopere/pruned40-llama-3.2-3b", + "name": "pruned40-llama-3.2-3b", + "developer": "oopere", + "scores": { + "IFEval": 0.2183, + "BBH": 0.3167, + "MATH Level 5": 0.0128, + "GPQA": 0.2299, + "MUSR": 0.3539, + "MMLU-PRO": 0.1177 + } + }, + { + "model_id": "oopere/pruned60-llama-1b", + "name": "pruned60-llama-1b", + "developer": "oopere", + "scores": { + "IFEval": 0.1829, + "BBH": 0.3016, + "MATH Level 5": 0.0023, + "GPQA": 0.2492, + "MUSR": 0.4088, + "MMLU-PRO": 0.1173 + } + }, + { + "model_id": "oopere/pruned60-llama-3.2-3b", + "name": "pruned60-llama-3.2-3b", + "developer": "oopere", + "scores": { + "IFEval": 0.1825, + "BBH": 0.3166, + "MATH Level 5": 0.0038, + "GPQA": 0.2701, + "MUSR": 0.3633, + "MMLU-PRO": 0.1131 + } + }, + { + "model_id": "open-atlas/Atlas-Flash-1.5B-Preview", + "name": "Atlas-Flash-1.5B-Preview", + "developer": "open-atlas", + "scores": { + "IFEval": 0.327, + "BBH": 0.3215, + "MATH Level 5": 0.2213, + "GPQA": 0.2525, + "MUSR": 0.3488, + "MMLU-PRO": 0.1374 + } + }, + { + "model_id": "open-atlas/Atlas-Flash-7B-Preview", + "name": "Atlas-Flash-7B-Preview", + "developer": "open-atlas", + "scores": { + "IFEval": 0.3908, + "BBH": 0.3542, + "MATH Level 5": 0.2576, + "GPQA": 0.2886, + "MUSR": 0.3836, + "MMLU-PRO": 0.2784 + } + }, + { + "model_id": "open-neo/Kyro-n1-3B", + "name": "Kyro-n1-3B", + "developer": "open-neo", + "scores": { + "IFEval": 0.4595, + "BBH": 0.4685, + "MATH Level 5": 0.2855, + "GPQA": 0.2819, + "MUSR": 0.4088, + "MMLU-PRO": 0.3423 + } + }, + { + "model_id": "open-neo/Kyro-n1-7B", + "name": "Kyro-n1-7B", + "developer": "open-neo", + "scores": { + "IFEval": 0.5573, + "BBH": 0.5387, + "MATH Level 5": 0.3897, + "GPQA": 0.2609, + "MUSR": 0.3884, + "MMLU-PRO": 0.4333 + } + }, + { + "model_id": "open-thoughts/OpenThinker-7B", + "name": "OpenThinker-7B", + "developer": "open-thoughts", + "scores": { + "IFEval": 0.4089, + "BBH": 0.5343, + "MATH Level 5": 0.426, + "GPQA": 0.2567, + "MUSR": 0.382, + "MMLU-PRO": 0.4165 + } + }, + { + "model_id": "openai-community/gpt2", + "name": "gpt2", + "developer": "openai-community", + "scores": { + "IFEval": 0.1793, + "BBH": 0.3036, + "MATH Level 5": 0.0023, + "GPQA": 0.2584, + "MUSR": 0.4471, + "MMLU-PRO": 0.1159 + } + }, + { + "model_id": "openai-community/gpt2-large", + "name": "gpt2-large", + "developer": "openai-community", + "scores": { + "IFEval": 0.2048, + "BBH": 0.3069, + "MATH Level 5": 0.0121, + "GPQA": 0.2592, + "MUSR": 0.3789, + "MMLU-PRO": 0.1142 + } + }, + { + "model_id": "openai-community/gpt2-medium", + "name": "gpt2-medium", + "developer": "openai-community", + "scores": { + "IFEval": 0.2208, + "BBH": 0.305, + "MATH Level 5": 0.0076, + "GPQA": 0.2626, + "MUSR": 0.3884, + "MMLU-PRO": 0.1182 + } + }, + { + "model_id": "openai-community/gpt2-xl", + "name": "gpt2-xl", + "developer": "openai-community", + "scores": { + "IFEval": 0.2039, + "BBH": 0.3009, + "MATH Level 5": 0.0098, + "GPQA": 0.2584, + "MUSR": 0.371, + "MMLU-PRO": 0.1131 + } + }, + { + "model_id": "openbmb/MiniCPM-S-1B-sft-llama-format", + "name": "MiniCPM-S-1B-sft-llama-format", + "developer": "openbmb", + "scores": { + "IFEval": 0.3329, + "BBH": 0.3049, + "MATH Level 5": 0.031, + "GPQA": 0.271, + "MUSR": 0.3317, + "MMLU-PRO": 0.1858 + } + }, + { + "model_id": "openchat/openchat-3.5-0106", + "name": "openchat-3.5-0106", + "developer": "openchat", + "scores": { + "IFEval": 0.5967, + "BBH": 0.4617, + "MATH Level 5": 0.0763, + "GPQA": 0.3079, + "MUSR": 0.4254, + "MMLU-PRO": 0.3291 + } + }, + { + "model_id": "openchat/openchat-3.5-1210", + "name": "openchat-3.5-1210", + "developer": "openchat", + "scores": { + "IFEval": 0.6037, + "BBH": 0.4535, + "MATH Level 5": 0.0785, + "GPQA": 0.3012, + "MUSR": 0.4414, + "MMLU-PRO": 0.3142 + } + }, + { + "model_id": "openchat/openchat-3.6-8b-20240522", + "name": "openchat-3.6-8b-20240522", + "developer": "openchat", + "scores": { + "IFEval": 0.5343, + "BBH": 0.5338, + "MATH Level 5": 0.0997, + "GPQA": 0.318, + "MUSR": 0.3999, + "MMLU-PRO": 0.3229 + } + }, + { + "model_id": "openchat/openchat_3.5", + "name": "openchat_3.5", + "developer": "openchat", + "scores": { + "IFEval": 0.5931, + "BBH": 0.4426, + "MATH Level 5": 0.0725, + "GPQA": 0.2987, + "MUSR": 0.4229, + "MMLU-PRO": 0.3153 + } + }, + { + "model_id": "openchat/openchat_v3.2", + "name": "openchat_v3.2", + "developer": "openchat", + "scores": { + "IFEval": 0.2981, + "BBH": 0.4331, + "MATH Level 5": 0.0128, + "GPQA": 0.2701, + "MUSR": 0.4336, + "MMLU-PRO": 0.2422 + } + }, + { + "model_id": "openchat/openchat_v3.2_super", + "name": "openchat_v3.2_super", + "developer": "openchat", + "scores": { + "IFEval": 0.2862, + "BBH": 0.4221, + "MATH Level 5": 0.0211, + "GPQA": 0.2643, + "MUSR": 0.4161, + "MMLU-PRO": 0.2425 + } + }, + { + "model_id": "orai-nlp/Llama-eus-8B", + "name": "Llama-eus-8B", + "developer": "orai-nlp", + "scores": { + "IFEval": 0.2161, + "BBH": 0.4418, + "MATH Level 5": 0.0468, + "GPQA": 0.2894, + "MUSR": 0.3919, + "MMLU-PRO": 0.3058 + } + }, + { + "model_id": "oxyapi/oxy-1-small", + "name": "oxy-1-small", + "developer": "oxyapi", + "scores": { + "IFEval": 0.6245, + "BBH": 0.5885, + "MATH Level 5": 0.3603, + "GPQA": 0.3716, + "MUSR": 0.4487, + "MMLU-PRO": 0.5001 + } + }, + { + "model_id": "ozone-ai/0x-lite", + "name": "0x-lite", + "developer": "ozone-ai", + "scores": { + "IFEval": 0.774, + "BBH": 0.6341, + "MATH Level 5": 0.5045, + "GPQA": 0.3196, + "MUSR": 0.4221, + "MMLU-PRO": 0.5184 + } + }, + { + "model_id": "ozone-research/Chirp-01", + "name": "Chirp-01", + "developer": "ozone-research", + "scores": { + "IFEval": 0.6348, + "BBH": 0.465, + "MATH Level 5": 0.3467, + "GPQA": 0.2718, + "MUSR": 0.4487, + "MMLU-PRO": 0.3508 + } + }, + { + "model_id": "paloalma/ECE-TW3-JRGL-V1", + "name": "ECE-TW3-JRGL-V1", + "developer": "paloalma", + "scores": { + "IFEval": 0.5535, + "BBH": 0.6284, + "MATH Level 5": 0.1314, + "GPQA": 0.3473, + "MUSR": 0.4621, + "MMLU-PRO": 0.4221 + } + }, + { + "model_id": "paloalma/ECE-TW3-JRGL-V2", + "name": "ECE-TW3-JRGL-V2", + "developer": "paloalma", + "scores": { + "IFEval": 0.2255, + "BBH": 0.6031, + "MATH Level 5": 0.185, + "GPQA": 0.3314, + "MUSR": 0.4793, + "MMLU-PRO": 0.4588 + } + }, + { + "model_id": "paloalma/ECE-TW3-JRGL-V5", + "name": "ECE-TW3-JRGL-V5", + "developer": "paloalma", + "scores": { + "IFEval": 0.4553, + "BBH": 0.6025, + "MATH Level 5": 0.1835, + "GPQA": 0.3414, + "MUSR": 0.4621, + "MMLU-PRO": 0.4648 + } + }, + { + "model_id": "paloalma/Le_Triomphant-ECE-TW3", + "name": "Le_Triomphant-ECE-TW3", + "developer": "paloalma", + "scores": { + "IFEval": 0.5402, + "BBH": 0.6112, + "MATH Level 5": 0.1949, + "GPQA": 0.349, + "MUSR": 0.4725, + "MMLU-PRO": 0.4763 + } + }, + { + "model_id": "paloalma/TW3-JRGL-v2", + "name": "TW3-JRGL-v2", + "developer": "paloalma", + "scores": { + "IFEval": 0.5316, + "BBH": 0.6138, + "MATH Level 5": 0.179, + "GPQA": 0.3591, + "MUSR": 0.4858, + "MMLU-PRO": 0.4858 + } + }, + { + "model_id": "pankajmathur/Al_Dente_v1_8b", + "name": "Al_Dente_v1_8b", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.3694, + "BBH": 0.4835, + "MATH Level 5": 0.0408, + "GPQA": 0.2995, + "MUSR": 0.3987, + "MMLU-PRO": 0.286 + } + }, + { + "model_id": "pankajmathur/model_007_13b_v2", + "name": "model_007_13b_v2", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.3056, + "BBH": 0.4702, + "MATH Level 5": 0.0211, + "GPQA": 0.2836, + "MUSR": 0.4611, + "MMLU-PRO": 0.2461 + } + }, + { + "model_id": "pankajmathur/orca_mini_3b", + "name": "orca_mini_3b", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.0742, + "BBH": 0.3196, + "MATH Level 5": 0.0083, + "GPQA": 0.2458, + "MUSR": 0.3349, + "MMLU-PRO": 0.1145 + } + }, + { + "model_id": "pankajmathur/orca_mini_7b", + "name": "orca_mini_7b", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.0412, + "BBH": 0.3332, + "MATH Level 5": 0.0128, + "GPQA": 0.2542, + "MUSR": 0.3698, + "MMLU-PRO": 0.1246 + } + }, + { + "model_id": "pankajmathur/orca_mini_phi-4", + "name": "orca_mini_phi-4", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.7781, + "BBH": 0.6856, + "MATH Level 5": 0.2953, + "GPQA": 0.3742, + "MUSR": 0.4703, + "MMLU-PRO": 0.5255 + } + }, + { + "model_id": "pankajmathur/orca_mini_v2_7b", + "name": "orca_mini_v2_7b", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.1358, + "BBH": 0.3536, + "MATH Level 5": 0.0113, + "GPQA": 0.2492, + "MUSR": 0.3593, + "MMLU-PRO": 0.1542 + } + }, + { + "model_id": "pankajmathur/orca_mini_v3_13b", + "name": "orca_mini_v3_13b", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.2897, + "BBH": 0.4711, + "MATH Level 5": 0.0211, + "GPQA": 0.2651, + "MUSR": 0.4598, + "MMLU-PRO": 0.2305 + } + }, + { + "model_id": "pankajmathur/orca_mini_v3_70b", + "name": "orca_mini_v3_70b", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.4015, + "BBH": 0.5949, + "MATH Level 5": 0.0385, + "GPQA": 0.318, + "MUSR": 0.5079, + "MMLU-PRO": 0.3757 + } + }, + { + "model_id": "pankajmathur/orca_mini_v3_7b", + "name": "orca_mini_v3_7b", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.2821, + "BBH": 0.4095, + "MATH Level 5": 0.0106, + "GPQA": 0.2466, + "MUSR": 0.4982, + "MMLU-PRO": 0.2084 + } + }, + { + "model_id": "pankajmathur/orca_mini_v5_8b", + "name": "orca_mini_v5_8b", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.4806, + "BBH": 0.5064, + "MATH Level 5": 0.0989, + "GPQA": 0.2869, + "MUSR": 0.4, + "MMLU-PRO": 0.3076 + } + }, + { + "model_id": "pankajmathur/orca_mini_v5_8b_dpo", + "name": "orca_mini_v5_8b_dpo", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.4896, + "BBH": 0.5075, + "MATH Level 5": 0.0974, + "GPQA": 0.2743, + "MUSR": 0.3894, + "MMLU-PRO": 0.3116 + } + }, + { + "model_id": "pankajmathur/orca_mini_v5_8b_orpo", + "name": "orca_mini_v5_8b_orpo", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.0824, + "BBH": 0.4964, + "MATH Level 5": 0.0665, + "GPQA": 0.2844, + "MUSR": 0.4131, + "MMLU-PRO": 0.2947 + } + }, + { + "model_id": "pankajmathur/orca_mini_v6_8b", + "name": "orca_mini_v6_8b", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.0111, + "BBH": 0.3029, + "MATH Level 5": 0.0038, + "GPQA": 0.2383, + "MUSR": 0.3555, + "MMLU-PRO": 0.1125 + } + }, + { + "model_id": "pankajmathur/orca_mini_v6_8b_dpo", + "name": "orca_mini_v6_8b_dpo", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.3883, + "BBH": 0.5203, + "MATH Level 5": 0.0612, + "GPQA": 0.3012, + "MUSR": 0.409, + "MMLU-PRO": 0.3596 + } + }, + { + "model_id": "pankajmathur/orca_mini_v7_72b", + "name": "orca_mini_v7_72b", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.593, + "BBH": 0.6842, + "MATH Level 5": 0.0937, + "GPQA": 0.3851, + "MUSR": 0.507, + "MMLU-PRO": 0.5622 + } + }, + { + "model_id": "pankajmathur/orca_mini_v7_7b", + "name": "orca_mini_v7_7b", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.4388, + "BBH": 0.5275, + "MATH Level 5": 0.1208, + "GPQA": 0.2961, + "MUSR": 0.436, + "MMLU-PRO": 0.4167 + } + }, + { + "model_id": "pankajmathur/orca_mini_v8_1_70b", + "name": "orca_mini_v8_1_70b", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.8571, + "BBH": 0.6781, + "MATH Level 5": 0.3527, + "GPQA": 0.4329, + "MUSR": 0.4437, + "MMLU-PRO": 0.4983 + } + }, + { + "model_id": "pankajmathur/orca_mini_v9_0_3B-Instruct", + "name": "orca_mini_v9_0_3B-Instruct", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.5754, + "BBH": 0.4413, + "MATH Level 5": 0.1465, + "GPQA": 0.3012, + "MUSR": 0.3659, + "MMLU-PRO": 0.2603 + } + }, + { + "model_id": "pankajmathur/orca_mini_v9_1_1B-Instruct", + "name": "orca_mini_v9_1_1B-Instruct", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.3629, + "BBH": 0.3205, + "MATH Level 5": 0.0461, + "GPQA": 0.2567, + "MUSR": 0.3381, + "MMLU-PRO": 0.1374 + } + }, + { + "model_id": "pankajmathur/orca_mini_v9_2_14B", + "name": "orca_mini_v9_2_14B", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.7781, + "BBH": 0.6856, + "MATH Level 5": 0.2953, + "GPQA": 0.3742, + "MUSR": 0.4703, + "MMLU-PRO": 0.5255 + } + }, + { + "model_id": "pankajmathur/orca_mini_v9_2_70b", + "name": "orca_mini_v9_2_70b", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.8383, + "BBH": 0.6745, + "MATH Level 5": 0.2938, + "GPQA": 0.3733, + "MUSR": 0.471, + "MMLU-PRO": 0.4821 + } + }, + { + "model_id": "pankajmathur/orca_mini_v9_4_70B", + "name": "orca_mini_v9_4_70B", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.8015, + "BBH": 0.6419, + "MATH Level 5": 0.3263, + "GPQA": 0.3658, + "MUSR": 0.4647, + "MMLU-PRO": 0.4536 + } + }, + { + "model_id": "pankajmathur/orca_mini_v9_5_1B-Instruct", + "name": "orca_mini_v9_5_1B-Instruct", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.4638, + "BBH": 0.3337, + "MATH Level 5": 0.0302, + "GPQA": 0.2701, + "MUSR": 0.3182, + "MMLU-PRO": 0.137 + } + }, + { + "model_id": "pankajmathur/orca_mini_v9_5_1B-Instruct_preview", + "name": "orca_mini_v9_5_1B-Instruct_preview", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.3936, + "BBH": 0.3277, + "MATH Level 5": 0.0385, + "GPQA": 0.2634, + "MUSR": 0.3395, + "MMLU-PRO": 0.1327 + } + }, + { + "model_id": "pankajmathur/orca_mini_v9_5_3B-Instruct", + "name": "orca_mini_v9_5_3B-Instruct", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.7207, + "BBH": 0.4496, + "MATH Level 5": 0.1322, + "GPQA": 0.2869, + "MUSR": 0.427, + "MMLU-PRO": 0.2882 + } + }, + { + "model_id": "pankajmathur/orca_mini_v9_6_1B-Instruct", + "name": "orca_mini_v9_6_1B-Instruct", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.6086, + "BBH": 0.3561, + "MATH Level 5": 0.077, + "GPQA": 0.2685, + "MUSR": 0.3396, + "MMLU-PRO": 0.1809 + } + }, + { + "model_id": "pankajmathur/orca_mini_v9_6_3B-Instruct", + "name": "orca_mini_v9_6_3B-Instruct", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.7316, + "BBH": 0.4568, + "MATH Level 5": 0.1329, + "GPQA": 0.2936, + "MUSR": 0.4068, + "MMLU-PRO": 0.2851 + } + }, + { + "model_id": "pankajmathur/orca_mini_v9_7_1B-Instruct", + "name": "orca_mini_v9_7_1B-Instruct", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.561, + "BBH": 0.3182, + "MATH Level 5": 0.0446, + "GPQA": 0.2727, + "MUSR": 0.3527, + "MMLU-PRO": 0.1345 + } + }, + { + "model_id": "pankajmathur/orca_mini_v9_7_3B-Instruct", + "name": "orca_mini_v9_7_3B-Instruct", + "developer": "pankajmathur", + "scores": { + "IFEval": 0.5618, + "BBH": 0.3297, + "MATH Level 5": 0.0619, + "GPQA": 0.2617, + "MUSR": 0.3619, + "MMLU-PRO": 0.1375 + } + }, + { + "model_id": "paulml/ECE-ILAB-Q1", + "name": "ECE-ILAB-Q1", + "developer": "paulml", + "scores": { + "IFEval": 0.7865, + "BBH": 0.6718, + "MATH Level 5": 0.3557, + "GPQA": 0.3867, + "MUSR": 0.4614, + "MMLU-PRO": 0.5505 + } + }, + { + "model_id": "pints-ai/1.5-Pints-16K-v0.1", + "name": "1.5-Pints-16K-v0.1", + "developer": "pints-ai", + "scores": { + "IFEval": 0.1636, + "BBH": 0.3133, + "MATH Level 5": 0.0144, + "GPQA": 0.2357, + "MUSR": 0.3579, + "MMLU-PRO": 0.1119 + } + }, + { + "model_id": "pints-ai/1.5-Pints-2K-v0.1", + "name": "1.5-Pints-2K-v0.1", + "developer": "pints-ai", + "scores": { + "IFEval": 0.1762, + "BBH": 0.298, + "MATH Level 5": 0.0128, + "GPQA": 0.2483, + "MUSR": 0.3502, + "MMLU-PRO": 0.1104 + } + }, + { + "model_id": "piotr25691/thea-3b-25r", + "name": "thea-3b-25r", + "developer": "piotr25691", + "scores": { + "IFEval": 0.7344, + "BBH": 0.4484, + "MATH Level 5": 0.1782, + "GPQA": 0.2676, + "MUSR": 0.3315, + "MMLU-PRO": 0.3182 + } + }, + { + "model_id": "piotr25691/thea-c-3b-25r", + "name": "thea-c-3b-25r", + "developer": "piotr25691", + "scores": { + "IFEval": 0.7402, + "BBH": 0.4532, + "MATH Level 5": 0.1526, + "GPQA": 0.2651, + "MUSR": 0.3315, + "MMLU-PRO": 0.3178 + } + }, + { + "model_id": "piotr25691/thea-rp-3b-25r", + "name": "thea-rp-3b-25r", + "developer": "piotr25691", + "scores": { + "IFEval": 0.6578, + "BBH": 0.439, + "MATH Level 5": 0.1322, + "GPQA": 0.2743, + "MUSR": 0.3819, + "MMLU-PRO": 0.306 + } + }, + { + "model_id": "postbot/gpt2-medium-emailgen", + "name": "gpt2-medium-emailgen", + "developer": "postbot", + "scores": { + "IFEval": 0.1492, + "BBH": 0.313, + "MATH Level 5": 0.0, + "GPQA": 0.2601, + "MUSR": 0.3911, + "MMLU-PRO": 0.1147 + } + }, + { + "model_id": "prince-canuma/Ministral-8B-Instruct-2410-HF", + "name": "Ministral-8B-Instruct-2410-HF", + "developer": "prince-canuma", + "scores": { + "IFEval": 0.5912, + "BBH": 0.4586, + "MATH Level 5": 0.1918, + "GPQA": 0.281, + "MUSR": 0.4138, + "MMLU-PRO": 0.3298 + } + }, + { + "model_id": "princeton-nlp/Llama-3-8B-ProLong-512k-Base", + "name": "Llama-3-8B-ProLong-512k-Base", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.5322, + "BBH": 0.5033, + "MATH Level 5": 0.0687, + "GPQA": 0.2617, + "MUSR": 0.4223, + "MMLU-PRO": 0.3329 + } + }, + { + "model_id": "princeton-nlp/Llama-3-8B-ProLong-512k-Instruct", + "name": "Llama-3-8B-ProLong-512k-Instruct", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.3978, + "BBH": 0.4983, + "MATH Level 5": 0.0582, + "GPQA": 0.281, + "MUSR": 0.425, + "MMLU-PRO": 0.3246 + } + }, + { + "model_id": "princeton-nlp/Llama-3-8B-ProLong-64k-Base", + "name": "Llama-3-8B-ProLong-64k-Base", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.5201, + "BBH": 0.4927, + "MATH Level 5": 0.065, + "GPQA": 0.2651, + "MUSR": 0.4341, + "MMLU-PRO": 0.3348 + } + }, + { + "model_id": "princeton-nlp/Llama-3-8B-ProLong-64k-Instruct", + "name": "Llama-3-8B-ProLong-64k-Instruct", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.5563, + "BBH": 0.5083, + "MATH Level 5": 0.065, + "GPQA": 0.2953, + "MUSR": 0.4397, + "MMLU-PRO": 0.3275 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Base-8B-SFT", + "name": "Llama-3-Base-8B-SFT", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.2796, + "BBH": 0.4643, + "MATH Level 5": 0.04, + "GPQA": 0.2978, + "MUSR": 0.4118, + "MMLU-PRO": 0.3093 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Base-8B-SFT-CPO", + "name": "Llama-3-Base-8B-SFT-CPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.3703, + "BBH": 0.4595, + "MATH Level 5": 0.0544, + "GPQA": 0.2743, + "MUSR": 0.3609, + "MMLU-PRO": 0.2976 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Base-8B-SFT-DPO", + "name": "Llama-3-Base-8B-SFT-DPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4111, + "BBH": 0.4666, + "MATH Level 5": 0.0415, + "GPQA": 0.3104, + "MUSR": 0.3867, + "MMLU-PRO": 0.3078 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Base-8B-SFT-IPO", + "name": "Llama-3-Base-8B-SFT-IPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4487, + "BBH": 0.469, + "MATH Level 5": 0.0393, + "GPQA": 0.2978, + "MUSR": 0.3919, + "MMLU-PRO": 0.3115 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Base-8B-SFT-KTO", + "name": "Llama-3-Base-8B-SFT-KTO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4523, + "BBH": 0.4693, + "MATH Level 5": 0.0529, + "GPQA": 0.3054, + "MUSR": 0.3842, + "MMLU-PRO": 0.3054 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Base-8B-SFT-ORPO", + "name": "Llama-3-Base-8B-SFT-ORPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4517, + "BBH": 0.4734, + "MATH Level 5": 0.0468, + "GPQA": 0.3138, + "MUSR": 0.3707, + "MMLU-PRO": 0.3083 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Base-8B-SFT-RDPO", + "name": "Llama-3-Base-8B-SFT-RDPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.448, + "BBH": 0.4662, + "MATH Level 5": 0.0574, + "GPQA": 0.3062, + "MUSR": 0.4027, + "MMLU-PRO": 0.3014 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Base-8B-SFT-RRHF", + "name": "Llama-3-Base-8B-SFT-RRHF", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.3357, + "BBH": 0.452, + "MATH Level 5": 0.0453, + "GPQA": 0.3054, + "MUSR": 0.3722, + "MMLU-PRO": 0.2889 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Base-8B-SFT-SLiC-HF", + "name": "Llama-3-Base-8B-SFT-SLiC-HF", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.489, + "BBH": 0.4704, + "MATH Level 5": 0.0506, + "GPQA": 0.2869, + "MUSR": 0.4091, + "MMLU-PRO": 0.3063 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Base-8B-SFT-SimPO", + "name": "Llama-3-Base-8B-SFT-SimPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4685, + "BBH": 0.4741, + "MATH Level 5": 0.0551, + "GPQA": 0.2886, + "MUSR": 0.4127, + "MMLU-PRO": 0.3105 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-CPO", + "name": "Llama-3-Instruct-8B-CPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.7293, + "BBH": 0.4999, + "MATH Level 5": 0.0989, + "GPQA": 0.2601, + "MUSR": 0.3514, + "MMLU-PRO": 0.3652 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-CPO-v0.2", + "name": "Llama-3-Instruct-8B-CPO-v0.2", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.7506, + "BBH": 0.5027, + "MATH Level 5": 0.108, + "GPQA": 0.2609, + "MUSR": 0.3619, + "MMLU-PRO": 0.3706 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-DPO", + "name": "Llama-3-Instruct-8B-DPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.6757, + "BBH": 0.4991, + "MATH Level 5": 0.0846, + "GPQA": 0.2718, + "MUSR": 0.3738, + "MMLU-PRO": 0.3665 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-DPO-v0.2", + "name": "Llama-3-Instruct-8B-DPO-v0.2", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.7208, + "BBH": 0.5056, + "MATH Level 5": 0.0899, + "GPQA": 0.2869, + "MUSR": 0.3844, + "MMLU-PRO": 0.3769 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-KTO", + "name": "Llama-3-Instruct-8B-KTO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.6864, + "BBH": 0.4982, + "MATH Level 5": 0.0725, + "GPQA": 0.276, + "MUSR": 0.3698, + "MMLU-PRO": 0.3599 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-KTO-v0.2", + "name": "Llama-3-Instruct-8B-KTO-v0.2", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.729, + "BBH": 0.508, + "MATH Level 5": 0.0997, + "GPQA": 0.2601, + "MUSR": 0.3777, + "MMLU-PRO": 0.3668 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-ORPO", + "name": "Llama-3-Instruct-8B-ORPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.7128, + "BBH": 0.5001, + "MATH Level 5": 0.0785, + "GPQA": 0.2584, + "MUSR": 0.3502, + "MMLU-PRO": 0.3646 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-ORPO-v0.2", + "name": "Llama-3-Instruct-8B-ORPO-v0.2", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.7633, + "BBH": 0.5078, + "MATH Level 5": 0.102, + "GPQA": 0.2836, + "MUSR": 0.378, + "MMLU-PRO": 0.3731 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-RDPO", + "name": "Llama-3-Instruct-8B-RDPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.666, + "BBH": 0.5034, + "MATH Level 5": 0.0846, + "GPQA": 0.2827, + "MUSR": 0.3752, + "MMLU-PRO": 0.3607 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-RDPO-v0.2", + "name": "Llama-3-Instruct-8B-RDPO-v0.2", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.7077, + "BBH": 0.5049, + "MATH Level 5": 0.0869, + "GPQA": 0.2928, + "MUSR": 0.3804, + "MMLU-PRO": 0.3774 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-RRHF", + "name": "Llama-3-Instruct-8B-RRHF", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.7275, + "BBH": 0.4911, + "MATH Level 5": 0.0967, + "GPQA": 0.2802, + "MUSR": 0.3476, + "MMLU-PRO": 0.3644 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-RRHF-v0.2", + "name": "Llama-3-Instruct-8B-RRHF-v0.2", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.7125, + "BBH": 0.4984, + "MATH Level 5": 0.0876, + "GPQA": 0.2601, + "MUSR": 0.3738, + "MMLU-PRO": 0.3482 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-SLiC-HF", + "name": "Llama-3-Instruct-8B-SLiC-HF", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.74, + "BBH": 0.5029, + "MATH Level 5": 0.0974, + "GPQA": 0.2861, + "MUSR": 0.3723, + "MMLU-PRO": 0.3585 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-SLiC-HF-v0.2", + "name": "Llama-3-Instruct-8B-SLiC-HF-v0.2", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.711, + "BBH": 0.4984, + "MATH Level 5": 0.0876, + "GPQA": 0.2601, + "MUSR": 0.3738, + "MMLU-PRO": 0.3482 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-SimPO", + "name": "Llama-3-Instruct-8B-SimPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.6504, + "BBH": 0.4845, + "MATH Level 5": 0.0861, + "GPQA": 0.2936, + "MUSR": 0.3948, + "MMLU-PRO": 0.3489 + } + }, + { + "model_id": "princeton-nlp/Llama-3-Instruct-8B-SimPO-v0.2", + "name": "Llama-3-Instruct-8B-SimPO-v0.2", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.6809, + "BBH": 0.5038, + "MATH Level 5": 0.074, + "GPQA": 0.3012, + "MUSR": 0.3988, + "MMLU-PRO": 0.3622 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Base-SFT-CPO", + "name": "Mistral-7B-Base-SFT-CPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4655, + "BBH": 0.4382, + "MATH Level 5": 0.0279, + "GPQA": 0.2919, + "MUSR": 0.4071, + "MMLU-PRO": 0.2651 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Base-SFT-DPO", + "name": "Mistral-7B-Base-SFT-DPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4403, + "BBH": 0.435, + "MATH Level 5": 0.0211, + "GPQA": 0.2727, + "MUSR": 0.4122, + "MMLU-PRO": 0.2645 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Base-SFT-IPO", + "name": "Mistral-7B-Base-SFT-IPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.483, + "BBH": 0.4458, + "MATH Level 5": 0.0287, + "GPQA": 0.2802, + "MUSR": 0.3776, + "MMLU-PRO": 0.2792 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Base-SFT-KTO", + "name": "Mistral-7B-Base-SFT-KTO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4785, + "BBH": 0.4476, + "MATH Level 5": 0.0393, + "GPQA": 0.2903, + "MUSR": 0.4368, + "MMLU-PRO": 0.2872 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Base-SFT-RDPO", + "name": "Mistral-7B-Base-SFT-RDPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4606, + "BBH": 0.444, + "MATH Level 5": 0.0219, + "GPQA": 0.2777, + "MUSR": 0.3579, + "MMLU-PRO": 0.2777 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Base-SFT-RRHF", + "name": "Mistral-7B-Base-SFT-RRHF", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4407, + "BBH": 0.4281, + "MATH Level 5": 0.0249, + "GPQA": 0.2903, + "MUSR": 0.4187, + "MMLU-PRO": 0.2398 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Base-SFT-SLiC-HF", + "name": "Mistral-7B-Base-SFT-SLiC-HF", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.5127, + "BBH": 0.4422, + "MATH Level 5": 0.0355, + "GPQA": 0.2919, + "MUSR": 0.4261, + "MMLU-PRO": 0.2781 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Base-SFT-SimPO", + "name": "Mistral-7B-Base-SFT-SimPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4701, + "BBH": 0.4398, + "MATH Level 5": 0.0144, + "GPQA": 0.2836, + "MUSR": 0.3971, + "MMLU-PRO": 0.2702 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Instruct-CPO", + "name": "Mistral-7B-Instruct-CPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4203, + "BBH": 0.4069, + "MATH Level 5": 0.0204, + "GPQA": 0.2659, + "MUSR": 0.4178, + "MMLU-PRO": 0.2701 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Instruct-DPO", + "name": "Mistral-7B-Instruct-DPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.5176, + "BBH": 0.406, + "MATH Level 5": 0.031, + "GPQA": 0.2685, + "MUSR": 0.3833, + "MMLU-PRO": 0.2749 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Instruct-IPO", + "name": "Mistral-7B-Instruct-IPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4929, + "BBH": 0.4322, + "MATH Level 5": 0.0204, + "GPQA": 0.2735, + "MUSR": 0.4324, + "MMLU-PRO": 0.2708 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Instruct-KTO", + "name": "Mistral-7B-Instruct-KTO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4908, + "BBH": 0.414, + "MATH Level 5": 0.0264, + "GPQA": 0.2735, + "MUSR": 0.3953, + "MMLU-PRO": 0.2812 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Instruct-ORPO", + "name": "Mistral-7B-Instruct-ORPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.472, + "BBH": 0.4104, + "MATH Level 5": 0.0295, + "GPQA": 0.2743, + "MUSR": 0.3912, + "MMLU-PRO": 0.2662 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Instruct-RDPO", + "name": "Mistral-7B-Instruct-RDPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4887, + "BBH": 0.405, + "MATH Level 5": 0.0249, + "GPQA": 0.2802, + "MUSR": 0.3873, + "MMLU-PRO": 0.2777 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Instruct-RRHF", + "name": "Mistral-7B-Instruct-RRHF", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.496, + "BBH": 0.419, + "MATH Level 5": 0.0279, + "GPQA": 0.276, + "MUSR": 0.3979, + "MMLU-PRO": 0.2651 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Instruct-SLiC-HF", + "name": "Mistral-7B-Instruct-SLiC-HF", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.5115, + "BBH": 0.404, + "MATH Level 5": 0.0174, + "GPQA": 0.2727, + "MUSR": 0.3913, + "MMLU-PRO": 0.2715 + } + }, + { + "model_id": "princeton-nlp/Mistral-7B-Instruct-SimPO", + "name": "Mistral-7B-Instruct-SimPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.4687, + "BBH": 0.4507, + "MATH Level 5": 0.0287, + "GPQA": 0.2785, + "MUSR": 0.4098, + "MMLU-PRO": 0.2797 + } + }, + { + "model_id": "princeton-nlp/Sheared-LLaMA-1.3B", + "name": "Sheared-LLaMA-1.3B", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.2198, + "BBH": 0.3197, + "MATH Level 5": 0.0128, + "GPQA": 0.2399, + "MUSR": 0.3713, + "MMLU-PRO": 0.1171 + } + }, + { + "model_id": "princeton-nlp/Sheared-LLaMA-2.7B", + "name": "Sheared-LLaMA-2.7B", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.2417, + "BBH": 0.3259, + "MATH Level 5": 0.0128, + "GPQA": 0.2752, + "MUSR": 0.3567, + "MMLU-PRO": 0.1187 + } + }, + { + "model_id": "princeton-nlp/gemma-2-9b-it-DPO", + "name": "gemma-2-9b-it-DPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.2769, + "BBH": 0.5941, + "MATH Level 5": 0.0831, + "GPQA": 0.3356, + "MUSR": 0.382, + "MMLU-PRO": 0.3723 + } + }, + { + "model_id": "princeton-nlp/gemma-2-9b-it-SimPO", + "name": "gemma-2-9b-it-SimPO", + "developer": "princeton-nlp", + "scores": { + "IFEval": 0.3207, + "BBH": 0.5839, + "MATH Level 5": 0.071, + "GPQA": 0.3356, + "MUSR": 0.4123, + "MMLU-PRO": 0.3975 + } + }, + { + "model_id": "prithivMLmods/Bellatrix-1.5B-xElite", + "name": "Bellatrix-1.5B-xElite", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.1964, + "BBH": 0.3501, + "MATH Level 5": 0.287, + "GPQA": 0.2785, + "MUSR": 0.3619, + "MMLU-PRO": 0.1657 + } + }, + { + "model_id": "prithivMLmods/Bellatrix-Tiny-1.5B-R1", + "name": "Bellatrix-Tiny-1.5B-R1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.3352, + "BBH": 0.4022, + "MATH Level 5": 0.0604, + "GPQA": 0.2987, + "MUSR": 0.3683, + "MMLU-PRO": 0.2751 + } + }, + { + "model_id": "prithivMLmods/Bellatrix-Tiny-1B-v2", + "name": "Bellatrix-Tiny-1B-v2", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.151, + "BBH": 0.3268, + "MATH Level 5": 0.0287, + "GPQA": 0.2727, + "MUSR": 0.343, + "MMLU-PRO": 0.1493 + } + }, + { + "model_id": "prithivMLmods/Blaze-14B-xElite", + "name": "Blaze-14B-xElite", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.0363, + "BBH": 0.6628, + "MATH Level 5": 0.3693, + "GPQA": 0.3943, + "MUSR": 0.4625, + "MMLU-PRO": 0.5111 + } + }, + { + "model_id": "prithivMLmods/COCO-7B-Instruct-1M", + "name": "COCO-7B-Instruct-1M", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4743, + "BBH": 0.541, + "MATH Level 5": 0.3497, + "GPQA": 0.3079, + "MUSR": 0.4382, + "MMLU-PRO": 0.4186 + } + }, + { + "model_id": "prithivMLmods/Calcium-Opus-14B-Elite", + "name": "Calcium-Opus-14B-Elite", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6052, + "BBH": 0.6317, + "MATH Level 5": 0.4789, + "GPQA": 0.3742, + "MUSR": 0.486, + "MMLU-PRO": 0.5302 + } + }, + { + "model_id": "prithivMLmods/Calcium-Opus-14B-Elite-1M", + "name": "Calcium-Opus-14B-Elite-1M", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5613, + "BBH": 0.6329, + "MATH Level 5": 0.4456, + "GPQA": 0.3523, + "MUSR": 0.4676, + "MMLU-PRO": 0.5152 + } + }, + { + "model_id": "prithivMLmods/Calcium-Opus-14B-Elite-Stock", + "name": "Calcium-Opus-14B-Elite-Stock", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6143, + "BBH": 0.6329, + "MATH Level 5": 0.4668, + "GPQA": 0.3683, + "MUSR": 0.4808, + "MMLU-PRO": 0.5284 + } + }, + { + "model_id": "prithivMLmods/Calcium-Opus-14B-Elite2", + "name": "Calcium-Opus-14B-Elite2", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6176, + "BBH": 0.6318, + "MATH Level 5": 0.469, + "GPQA": 0.37, + "MUSR": 0.494, + "MMLU-PRO": 0.5301 + } + }, + { + "model_id": "prithivMLmods/Calcium-Opus-14B-Elite2-R1", + "name": "Calcium-Opus-14B-Elite2-R1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6326, + "BBH": 0.6362, + "MATH Level 5": 0.3338, + "GPQA": 0.3909, + "MUSR": 0.49, + "MMLU-PRO": 0.5248 + } + }, + { + "model_id": "prithivMLmods/Calcium-Opus-14B-Elite3", + "name": "Calcium-Opus-14B-Elite3", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5428, + "BBH": 0.635, + "MATH Level 5": 0.4705, + "GPQA": 0.3708, + "MUSR": 0.4795, + "MMLU-PRO": 0.5335 + } + }, + { + "model_id": "prithivMLmods/Calcium-Opus-14B-Elite4", + "name": "Calcium-Opus-14B-Elite4", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6112, + "BBH": 0.6195, + "MATH Level 5": 0.3625, + "GPQA": 0.3557, + "MUSR": 0.4687, + "MMLU-PRO": 0.5149 + } + }, + { + "model_id": "prithivMLmods/Calcium-Opus-14B-Merge", + "name": "Calcium-Opus-14B-Merge", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4949, + "BBH": 0.6319, + "MATH Level 5": 0.4637, + "GPQA": 0.3708, + "MUSR": 0.4861, + "MMLU-PRO": 0.5356 + } + }, + { + "model_id": "prithivMLmods/Calcium-Opus-20B-v1", + "name": "Calcium-Opus-20B-v1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.3093, + "BBH": 0.599, + "MATH Level 5": 0.3618, + "GPQA": 0.3532, + "MUSR": 0.4943, + "MMLU-PRO": 0.4734 + } + }, + { + "model_id": "prithivMLmods/Codepy-Deepthink-3B", + "name": "Codepy-Deepthink-3B", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4327, + "BBH": 0.4259, + "MATH Level 5": 0.1156, + "GPQA": 0.2794, + "MUSR": 0.331, + "MMLU-PRO": 0.309 + } + }, + { + "model_id": "prithivMLmods/Coma-II-14B", + "name": "Coma-II-14B", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4168, + "BBH": 0.6321, + "MATH Level 5": 0.5514, + "GPQA": 0.4002, + "MUSR": 0.5351, + "MMLU-PRO": 0.504 + } + }, + { + "model_id": "prithivMLmods/Condor-Opus-14B-Exp", + "name": "Condor-Opus-14B-Exp", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4043, + "BBH": 0.6154, + "MATH Level 5": 0.5227, + "GPQA": 0.3918, + "MUSR": 0.5194, + "MMLU-PRO": 0.5014 + } + }, + { + "model_id": "prithivMLmods/Cygnus-II-14B", + "name": "Cygnus-II-14B", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6184, + "BBH": 0.6661, + "MATH Level 5": 0.4396, + "GPQA": 0.3876, + "MUSR": 0.4688, + "MMLU-PRO": 0.5391 + } + }, + { + "model_id": "prithivMLmods/Deepthink-Llama-3-8B-Preview", + "name": "Deepthink-Llama-3-8B-Preview", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.2955, + "BBH": 0.4665, + "MATH Level 5": 0.355, + "GPQA": 0.3163, + "MUSR": 0.3707, + "MMLU-PRO": 0.2739 + } + }, + { + "model_id": "prithivMLmods/Deepthink-Reasoning-14B", + "name": "Deepthink-Reasoning-14B", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5424, + "BBH": 0.6334, + "MATH Level 5": 0.423, + "GPQA": 0.3666, + "MUSR": 0.4732, + "MMLU-PRO": 0.5296 + } + }, + { + "model_id": "prithivMLmods/Deepthink-Reasoning-7B", + "name": "Deepthink-Reasoning-7B", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.484, + "BBH": 0.5505, + "MATH Level 5": 0.3346, + "GPQA": 0.2995, + "MUSR": 0.4432, + "MMLU-PRO": 0.4349 + } + }, + { + "model_id": "prithivMLmods/Dinobot-Opus-14B-Exp", + "name": "Dinobot-Opus-14B-Exp", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.824, + "BBH": 0.637, + "MATH Level 5": 0.5317, + "GPQA": 0.3247, + "MUSR": 0.426, + "MMLU-PRO": 0.4979 + } + }, + { + "model_id": "prithivMLmods/Elita-0.1-Distilled-R1-abliterated", + "name": "Elita-0.1-Distilled-R1-abliterated", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.3542, + "BBH": 0.3828, + "MATH Level 5": 0.3066, + "GPQA": 0.2659, + "MUSR": 0.366, + "MMLU-PRO": 0.2758 + } + }, + { + "model_id": "prithivMLmods/Elita-1", + "name": "Elita-1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4906, + "BBH": 0.652, + "MATH Level 5": 0.3429, + "GPQA": 0.3758, + "MUSR": 0.4834, + "MMLU-PRO": 0.5381 + } + }, + { + "model_id": "prithivMLmods/Epimetheus-14B-Axo", + "name": "Epimetheus-14B-Axo", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5546, + "BBH": 0.6613, + "MATH Level 5": 0.4101, + "GPQA": 0.3926, + "MUSR": 0.482, + "MMLU-PRO": 0.5304 + } + }, + { + "model_id": "prithivMLmods/Equuleus-Opus-14B-Exp", + "name": "Equuleus-Opus-14B-Exp", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.7001, + "BBH": 0.6434, + "MATH Level 5": 0.4585, + "GPQA": 0.3867, + "MUSR": 0.4952, + "MMLU-PRO": 0.5374 + } + }, + { + "model_id": "prithivMLmods/Eridanus-Opus-14B-r999", + "name": "Eridanus-Opus-14B-r999", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6386, + "BBH": 0.6584, + "MATH Level 5": 0.386, + "GPQA": 0.3943, + "MUSR": 0.4769, + "MMLU-PRO": 0.5362 + } + }, + { + "model_id": "prithivMLmods/Evac-Opus-14B-Exp", + "name": "Evac-Opus-14B-Exp", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5916, + "BBH": 0.6475, + "MATH Level 5": 0.4215, + "GPQA": 0.3884, + "MUSR": 0.4728, + "MMLU-PRO": 0.5317 + } + }, + { + "model_id": "prithivMLmods/FastThink-0.5B-Tiny", + "name": "FastThink-0.5B-Tiny", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.258, + "BBH": 0.3206, + "MATH Level 5": 0.0204, + "GPQA": 0.2609, + "MUSR": 0.3566, + "MMLU-PRO": 0.1649 + } + }, + { + "model_id": "prithivMLmods/GWQ-9B-Preview", + "name": "GWQ-9B-Preview", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5066, + "BBH": 0.5806, + "MATH Level 5": 0.2266, + "GPQA": 0.3398, + "MUSR": 0.4951, + "MMLU-PRO": 0.3984 + } + }, + { + "model_id": "prithivMLmods/GWQ-9B-Preview2", + "name": "GWQ-9B-Preview2", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5209, + "BBH": 0.5797, + "MATH Level 5": 0.2372, + "GPQA": 0.3263, + "MUSR": 0.486, + "MMLU-PRO": 0.3997 + } + }, + { + "model_id": "prithivMLmods/GWQ2b", + "name": "GWQ2b", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4115, + "BBH": 0.4143, + "MATH Level 5": 0.0627, + "GPQA": 0.2827, + "MUSR": 0.4311, + "MMLU-PRO": 0.2473 + } + }, + { + "model_id": "prithivMLmods/Gaea-Opus-14B-Exp", + "name": "Gaea-Opus-14B-Exp", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5956, + "BBH": 0.656, + "MATH Level 5": 0.4275, + "GPQA": 0.3909, + "MUSR": 0.4859, + "MMLU-PRO": 0.5401 + } + }, + { + "model_id": "prithivMLmods/Galactic-Qwen-14B-Exp1", + "name": "Galactic-Qwen-14B-Exp1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5832, + "BBH": 0.6582, + "MATH Level 5": 0.4018, + "GPQA": 0.3935, + "MUSR": 0.4781, + "MMLU-PRO": 0.5396 + } + }, + { + "model_id": "prithivMLmods/Galactic-Qwen-14B-Exp2", + "name": "Galactic-Qwen-14B-Exp2", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.662, + "BBH": 0.7203, + "MATH Level 5": 0.3474, + "GPQA": 0.3993, + "MUSR": 0.5354, + "MMLU-PRO": 0.5691 + } + }, + { + "model_id": "prithivMLmods/Gauss-Opus-14B-R999", + "name": "Gauss-Opus-14B-R999", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.3907, + "BBH": 0.6228, + "MATH Level 5": 0.5755, + "GPQA": 0.3918, + "MUSR": 0.5338, + "MMLU-PRO": 0.5007 + } + }, + { + "model_id": "prithivMLmods/Jolt-v0.1", + "name": "Jolt-v0.1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5092, + "BBH": 0.6521, + "MATH Level 5": 0.3565, + "GPQA": 0.38, + "MUSR": 0.4847, + "MMLU-PRO": 0.5386 + } + }, + { + "model_id": "prithivMLmods/Lacerta-Opus-14B-Elite8", + "name": "Lacerta-Opus-14B-Elite8", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6141, + "BBH": 0.6401, + "MATH Level 5": 0.3648, + "GPQA": 0.3784, + "MUSR": 0.4635, + "MMLU-PRO": 0.5322 + } + }, + { + "model_id": "prithivMLmods/Llama-3.1-5B-Instruct", + "name": "Llama-3.1-5B-Instruct", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.1407, + "BBH": 0.3051, + "MATH Level 5": 0.0151, + "GPQA": 0.2643, + "MUSR": 0.354, + "MMLU-PRO": 0.1184 + } + }, + { + "model_id": "prithivMLmods/Llama-3.1-8B-Open-SFT", + "name": "Llama-3.1-8B-Open-SFT", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4123, + "BBH": 0.4968, + "MATH Level 5": 0.1216, + "GPQA": 0.3096, + "MUSR": 0.3904, + "MMLU-PRO": 0.3522 + } + }, + { + "model_id": "prithivMLmods/Llama-3.2-3B-Math-Oct", + "name": "Llama-3.2-3B-Math-Oct", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4585, + "BBH": 0.4372, + "MATH Level 5": 0.1156, + "GPQA": 0.2584, + "MUSR": 0.347, + "MMLU-PRO": 0.2911 + } + }, + { + "model_id": "prithivMLmods/Llama-3.2-6B-AlgoCode", + "name": "Llama-3.2-6B-AlgoCode", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.2136, + "BBH": 0.3748, + "MATH Level 5": 0.0136, + "GPQA": 0.2869, + "MUSR": 0.4013, + "MMLU-PRO": 0.1798 + } + }, + { + "model_id": "prithivMLmods/Llama-8B-Distill-CoT", + "name": "Llama-8B-Distill-CoT", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.3342, + "BBH": 0.4298, + "MATH Level 5": 0.4003, + "GPQA": 0.2894, + "MUSR": 0.372, + "MMLU-PRO": 0.2732 + } + }, + { + "model_id": "prithivMLmods/Llama-Deepsync-1B", + "name": "Llama-Deepsync-1B", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.357, + "BBH": 0.3386, + "MATH Level 5": 0.0438, + "GPQA": 0.2601, + "MUSR": 0.3565, + "MMLU-PRO": 0.1738 + } + }, + { + "model_id": "prithivMLmods/Llama-Deepsync-3B", + "name": "Llama-Deepsync-3B", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4302, + "BBH": 0.4292, + "MATH Level 5": 0.1178, + "GPQA": 0.2718, + "MUSR": 0.3324, + "MMLU-PRO": 0.3031 + } + }, + { + "model_id": "prithivMLmods/Llama-Express.1-Math", + "name": "Llama-Express.1-Math", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5084, + "BBH": 0.3364, + "MATH Level 5": 0.0559, + "GPQA": 0.2634, + "MUSR": 0.3143, + "MMLU-PRO": 0.161 + } + }, + { + "model_id": "prithivMLmods/LwQ-10B-Instruct", + "name": "LwQ-10B-Instruct", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.3935, + "BBH": 0.5122, + "MATH Level 5": 0.04, + "GPQA": 0.3121, + "MUSR": 0.4544, + "MMLU-PRO": 0.3318 + } + }, + { + "model_id": "prithivMLmods/LwQ-Reasoner-10B", + "name": "LwQ-Reasoner-10B", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.2941, + "BBH": 0.5866, + "MATH Level 5": 0.358, + "GPQA": 0.3465, + "MUSR": 0.4079, + "MMLU-PRO": 0.4147 + } + }, + { + "model_id": "prithivMLmods/Magellanic-Opus-14B-Exp", + "name": "Magellanic-Opus-14B-Exp", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6866, + "BBH": 0.6383, + "MATH Level 5": 0.3799, + "GPQA": 0.3742, + "MUSR": 0.4926, + "MMLU-PRO": 0.5273 + } + }, + { + "model_id": "prithivMLmods/Magellanic-Qwen-25B-R999", + "name": "Magellanic-Qwen-25B-R999", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.1873, + "BBH": 0.2608, + "MATH Level 5": 0.0053, + "GPQA": 0.2508, + "MUSR": 0.3831, + "MMLU-PRO": 0.13 + } + }, + { + "model_id": "prithivMLmods/Megatron-Corpus-14B-Exp", + "name": "Megatron-Corpus-14B-Exp", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4983, + "BBH": 0.6355, + "MATH Level 5": 0.3429, + "GPQA": 0.3633, + "MUSR": 0.4767, + "MMLU-PRO": 0.526 + } + }, + { + "model_id": "prithivMLmods/Megatron-Corpus-14B-Exp.v2", + "name": "Megatron-Corpus-14B-Exp.v2", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.487, + "BBH": 0.6321, + "MATH Level 5": 0.2591, + "GPQA": 0.3423, + "MUSR": 0.449, + "MMLU-PRO": 0.481 + } + }, + { + "model_id": "prithivMLmods/Megatron-Opus-14B-2.0", + "name": "Megatron-Opus-14B-2.0", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6694, + "BBH": 0.6871, + "MATH Level 5": 0.2779, + "GPQA": 0.3591, + "MUSR": 0.414, + "MMLU-PRO": 0.517 + } + }, + { + "model_id": "prithivMLmods/Megatron-Opus-14B-2.1", + "name": "Megatron-Opus-14B-2.1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.0246, + "BBH": 0.6727, + "MATH Level 5": 0.2998, + "GPQA": 0.3834, + "MUSR": 0.4928, + "MMLU-PRO": 0.5174 + } + }, + { + "model_id": "prithivMLmods/Megatron-Opus-14B-Exp", + "name": "Megatron-Opus-14B-Exp", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4979, + "BBH": 0.6516, + "MATH Level 5": 0.3535, + "GPQA": 0.375, + "MUSR": 0.4887, + "MMLU-PRO": 0.5401 + } + }, + { + "model_id": "prithivMLmods/Megatron-Opus-14B-Stock", + "name": "Megatron-Opus-14B-Stock", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5174, + "BBH": 0.6412, + "MATH Level 5": 0.3346, + "GPQA": 0.375, + "MUSR": 0.482, + "MMLU-PRO": 0.5293 + } + }, + { + "model_id": "prithivMLmods/Megatron-Opus-7B-Exp", + "name": "Megatron-Opus-7B-Exp", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6017, + "BBH": 0.5367, + "MATH Level 5": 0.1971, + "GPQA": 0.3112, + "MUSR": 0.4186, + "MMLU-PRO": 0.39 + } + }, + { + "model_id": "prithivMLmods/Messier-Opus-14B-Elite7", + "name": "Messier-Opus-14B-Elite7", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.7113, + "BBH": 0.6499, + "MATH Level 5": 0.4071, + "GPQA": 0.3909, + "MUSR": 0.4886, + "MMLU-PRO": 0.5404 + } + }, + { + "model_id": "prithivMLmods/Omni-Reasoner-Merged", + "name": "Omni-Reasoner-Merged", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4599, + "BBH": 0.5508, + "MATH Level 5": 0.3331, + "GPQA": 0.3037, + "MUSR": 0.4616, + "MMLU-PRO": 0.4364 + } + }, + { + "model_id": "prithivMLmods/Omni-Reasoner3-Merged", + "name": "Omni-Reasoner3-Merged", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4935, + "BBH": 0.4388, + "MATH Level 5": 0.1088, + "GPQA": 0.2643, + "MUSR": 0.3522, + "MMLU-PRO": 0.295 + } + }, + { + "model_id": "prithivMLmods/Pegasus-Opus-14B-Exp", + "name": "Pegasus-Opus-14B-Exp", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6982, + "BBH": 0.6548, + "MATH Level 5": 0.4086, + "GPQA": 0.3951, + "MUSR": 0.486, + "MMLU-PRO": 0.5412 + } + }, + { + "model_id": "prithivMLmods/Phi-4-Empathetic", + "name": "Phi-4-Empathetic", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.0497, + "BBH": 0.6727, + "MATH Level 5": 0.2621, + "GPQA": 0.38, + "MUSR": 0.4991, + "MMLU-PRO": 0.5066 + } + }, + { + "model_id": "prithivMLmods/Phi-4-Math-IO", + "name": "Phi-4-Math-IO", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.059, + "BBH": 0.6668, + "MATH Level 5": 0.4577, + "GPQA": 0.3985, + "MUSR": 0.4873, + "MMLU-PRO": 0.5205 + } + }, + { + "model_id": "prithivMLmods/Phi-4-QwQ", + "name": "Phi-4-QwQ", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.0559, + "BBH": 0.6696, + "MATH Level 5": 0.4577, + "GPQA": 0.3909, + "MUSR": 0.4651, + "MMLU-PRO": 0.5275 + } + }, + { + "model_id": "prithivMLmods/Phi-4-Super", + "name": "Phi-4-Super", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.0481, + "BBH": 0.672, + "MATH Level 5": 0.3489, + "GPQA": 0.3943, + "MUSR": 0.5044, + "MMLU-PRO": 0.5266 + } + }, + { + "model_id": "prithivMLmods/Phi-4-Super-1", + "name": "Phi-4-Super-1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.0418, + "BBH": 0.6729, + "MATH Level 5": 0.352, + "GPQA": 0.3935, + "MUSR": 0.5017, + "MMLU-PRO": 0.5235 + } + }, + { + "model_id": "prithivMLmods/Phi-4-Super-o1", + "name": "Phi-4-Super-o1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.0418, + "BBH": 0.6729, + "MATH Level 5": 0.352, + "GPQA": 0.3935, + "MUSR": 0.5017, + "MMLU-PRO": 0.5235 + } + }, + { + "model_id": "prithivMLmods/Phi-4-o1", + "name": "Phi-4-o1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.029, + "BBH": 0.6689, + "MATH Level 5": 0.3995, + "GPQA": 0.3826, + "MUSR": 0.4978, + "MMLU-PRO": 0.5174 + } + }, + { + "model_id": "prithivMLmods/Phi4-Super", + "name": "Phi4-Super", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.0481, + "BBH": 0.672, + "MATH Level 5": 0.3489, + "GPQA": 0.3943, + "MUSR": 0.5044, + "MMLU-PRO": 0.5266 + } + }, + { + "model_id": "prithivMLmods/Porpoise-Opus-14B-Exp", + "name": "Porpoise-Opus-14B-Exp", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.7098, + "BBH": 0.6519, + "MATH Level 5": 0.4041, + "GPQA": 0.3935, + "MUSR": 0.4926, + "MMLU-PRO": 0.5396 + } + }, + { + "model_id": "prithivMLmods/Primal-Opus-14B-Optimus-v1", + "name": "Primal-Opus-14B-Optimus-v1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5013, + "BBH": 0.6419, + "MATH Level 5": 0.3384, + "GPQA": 0.3725, + "MUSR": 0.4847, + "MMLU-PRO": 0.5259 + } + }, + { + "model_id": "prithivMLmods/Primal-Opus-14B-Optimus-v2", + "name": "Primal-Opus-14B-Optimus-v2", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6404, + "BBH": 0.6544, + "MATH Level 5": 0.4207, + "GPQA": 0.3918, + "MUSR": 0.49, + "MMLU-PRO": 0.5422 + } + }, + { + "model_id": "prithivMLmods/QwQ-LCoT-14B-Conversational", + "name": "QwQ-LCoT-14B-Conversational", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4047, + "BBH": 0.624, + "MATH Level 5": 0.4653, + "GPQA": 0.3498, + "MUSR": 0.4847, + "MMLU-PRO": 0.5278 + } + }, + { + "model_id": "prithivMLmods/QwQ-LCoT-3B-Instruct", + "name": "QwQ-LCoT-3B-Instruct", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4354, + "BBH": 0.4763, + "MATH Level 5": 0.2825, + "GPQA": 0.2819, + "MUSR": 0.4358, + "MMLU-PRO": 0.3582 + } + }, + { + "model_id": "prithivMLmods/QwQ-LCoT-7B-Instruct", + "name": "QwQ-LCoT-7B-Instruct", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4987, + "BBH": 0.5466, + "MATH Level 5": 0.3716, + "GPQA": 0.302, + "MUSR": 0.4802, + "MMLU-PRO": 0.4334 + } + }, + { + "model_id": "prithivMLmods/QwQ-LCoT1-Merged", + "name": "QwQ-LCoT1-Merged", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4751, + "BBH": 0.5481, + "MATH Level 5": 0.3731, + "GPQA": 0.307, + "MUSR": 0.4696, + "MMLU-PRO": 0.4358 + } + }, + { + "model_id": "prithivMLmods/QwQ-LCoT2-7B-Instruct", + "name": "QwQ-LCoT2-7B-Instruct", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5561, + "BBH": 0.5425, + "MATH Level 5": 0.327, + "GPQA": 0.2978, + "MUSR": 0.4564, + "MMLU-PRO": 0.4342 + } + }, + { + "model_id": "prithivMLmods/QwQ-MathOct-7B", + "name": "QwQ-MathOct-7B", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4684, + "BBH": 0.5486, + "MATH Level 5": 0.2953, + "GPQA": 0.3029, + "MUSR": 0.4601, + "MMLU-PRO": 0.433 + } + }, + { + "model_id": "prithivMLmods/QwQ-R1-Distill-1.5B-CoT", + "name": "QwQ-R1-Distill-1.5B-CoT", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.2194, + "BBH": 0.3666, + "MATH Level 5": 0.3346, + "GPQA": 0.2861, + "MUSR": 0.3434, + "MMLU-PRO": 0.1913 + } + }, + { + "model_id": "prithivMLmods/QwQ-R1-Distill-7B-CoT", + "name": "QwQ-R1-Distill-7B-CoT", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.35, + "BBH": 0.4388, + "MATH Level 5": 0.4683, + "GPQA": 0.2936, + "MUSR": 0.3779, + "MMLU-PRO": 0.2804 + } + }, + { + "model_id": "prithivMLmods/Qwen-7B-Distill-Reasoner", + "name": "Qwen-7B-Distill-Reasoner", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.3396, + "BBH": 0.4409, + "MATH Level 5": 0.395, + "GPQA": 0.3272, + "MUSR": 0.366, + "MMLU-PRO": 0.2818 + } + }, + { + "model_id": "prithivMLmods/Qwen2.5-1.5B-DeepSeek-R1-Instruct", + "name": "Qwen2.5-1.5B-DeepSeek-R1-Instruct", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.1397, + "BBH": 0.2824, + "MATH Level 5": 0.0, + "GPQA": 0.276, + "MUSR": 0.3724, + "MMLU-PRO": 0.1123 + } + }, + { + "model_id": "prithivMLmods/Qwen2.5-14B-DeepSeek-R1-1M", + "name": "Qwen2.5-14B-DeepSeek-R1-1M", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4193, + "BBH": 0.5935, + "MATH Level 5": 0.5128, + "GPQA": 0.3322, + "MUSR": 0.4606, + "MMLU-PRO": 0.4899 + } + }, + { + "model_id": "prithivMLmods/Qwen2.5-7B-DeepSeek-R1-1M", + "name": "Qwen2.5-7B-DeepSeek-R1-1M", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.1861, + "BBH": 0.3126, + "MATH Level 5": 0.0151, + "GPQA": 0.2617, + "MUSR": 0.3417, + "MMLU-PRO": 0.1201 + } + }, + { + "model_id": "prithivMLmods/SmolLM2-CoT-360M", + "name": "SmolLM2-CoT-360M", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.2216, + "BBH": 0.3135, + "MATH Level 5": 0.0204, + "GPQA": 0.2366, + "MUSR": 0.3794, + "MMLU-PRO": 0.1085 + } + }, + { + "model_id": "prithivMLmods/Sombrero-Opus-14B-Elite5", + "name": "Sombrero-Opus-14B-Elite5", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.7881, + "BBH": 0.6502, + "MATH Level 5": 0.5355, + "GPQA": 0.3364, + "MUSR": 0.4287, + "MMLU-PRO": 0.52 + } + }, + { + "model_id": "prithivMLmods/Sombrero-Opus-14B-Elite6", + "name": "Sombrero-Opus-14B-Elite6", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.7226, + "BBH": 0.6488, + "MATH Level 5": 0.4079, + "GPQA": 0.3935, + "MUSR": 0.4886, + "MMLU-PRO": 0.539 + } + }, + { + "model_id": "prithivMLmods/Sombrero-Opus-14B-Sm1", + "name": "Sombrero-Opus-14B-Sm1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.3813, + "BBH": 0.6355, + "MATH Level 5": 0.5665, + "GPQA": 0.4035, + "MUSR": 0.5299, + "MMLU-PRO": 0.5125 + } + }, + { + "model_id": "prithivMLmods/Sombrero-Opus-14B-Sm2", + "name": "Sombrero-Opus-14B-Sm2", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4272, + "BBH": 0.6609, + "MATH Level 5": 0.4864, + "GPQA": 0.3884, + "MUSR": 0.5088, + "MMLU-PRO": 0.5345 + } + }, + { + "model_id": "prithivMLmods/Sombrero-Opus-14B-Sm4", + "name": "Sombrero-Opus-14B-Sm4", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4347, + "BBH": 0.6613, + "MATH Level 5": 0.4879, + "GPQA": 0.3951, + "MUSR": 0.5192, + "MMLU-PRO": 0.53 + } + }, + { + "model_id": "prithivMLmods/Sombrero-Opus-14B-Sm5", + "name": "Sombrero-Opus-14B-Sm5", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6852, + "BBH": 0.6564, + "MATH Level 5": 0.4094, + "GPQA": 0.3867, + "MUSR": 0.4806, + "MMLU-PRO": 0.54 + } + }, + { + "model_id": "prithivMLmods/Sqweeks-7B-Instruct", + "name": "Sqweeks-7B-Instruct", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.2158, + "BBH": 0.4667, + "MATH Level 5": 0.5144, + "GPQA": 0.307, + "MUSR": 0.4476, + "MMLU-PRO": 0.3133 + } + }, + { + "model_id": "prithivMLmods/Tadpole-Opus-14B-Exp", + "name": "Tadpole-Opus-14B-Exp", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.575, + "BBH": 0.6369, + "MATH Level 5": 0.3134, + "GPQA": 0.3859, + "MUSR": 0.4728, + "MMLU-PRO": 0.5322 + } + }, + { + "model_id": "prithivMLmods/Taurus-Opus-7B", + "name": "Taurus-Opus-7B", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4223, + "BBH": 0.5367, + "MATH Level 5": 0.2168, + "GPQA": 0.3263, + "MUSR": 0.4399, + "MMLU-PRO": 0.3951 + } + }, + { + "model_id": "prithivMLmods/Triangulum-10B", + "name": "Triangulum-10B", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.3229, + "BBH": 0.5968, + "MATH Level 5": 0.355, + "GPQA": 0.354, + "MUSR": 0.4172, + "MMLU-PRO": 0.4178 + } + }, + { + "model_id": "prithivMLmods/Triangulum-5B", + "name": "Triangulum-5B", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.1283, + "BBH": 0.3124, + "MATH Level 5": 0.0106, + "GPQA": 0.255, + "MUSR": 0.3445, + "MMLU-PRO": 0.1223 + } + }, + { + "model_id": "prithivMLmods/Triangulum-v2-10B", + "name": "Triangulum-v2-10B", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6705, + "BBH": 0.6065, + "MATH Level 5": 0.2447, + "GPQA": 0.3372, + "MUSR": 0.4281, + "MMLU-PRO": 0.4466 + } + }, + { + "model_id": "prithivMLmods/Tucana-Opus-14B-r999", + "name": "Tucana-Opus-14B-r999", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6067, + "BBH": 0.6557, + "MATH Level 5": 0.4063, + "GPQA": 0.3918, + "MUSR": 0.473, + "MMLU-PRO": 0.5384 + } + }, + { + "model_id": "prithivMLmods/Tulu-MathLingo-8B", + "name": "Tulu-MathLingo-8B", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5589, + "BBH": 0.4659, + "MATH Level 5": 0.145, + "GPQA": 0.2903, + "MUSR": 0.3864, + "MMLU-PRO": 0.3044 + } + }, + { + "model_id": "prithivMLmods/Viper-Coder-7B-Elite14", + "name": "Viper-Coder-7B-Elite14", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.1488, + "BBH": 0.2829, + "MATH Level 5": 0.0106, + "GPQA": 0.255, + "MUSR": 0.3422, + "MMLU-PRO": 0.1089 + } + }, + { + "model_id": "prithivMLmods/Viper-Coder-Hybrid-v1.2", + "name": "Viper-Coder-Hybrid-v1.2", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6736, + "BBH": 0.6391, + "MATH Level 5": 0.3331, + "GPQA": 0.3742, + "MUSR": 0.4822, + "MMLU-PRO": 0.5243 + } + }, + { + "model_id": "prithivMLmods/Viper-Coder-Hybrid-v1.3", + "name": "Viper-Coder-Hybrid-v1.3", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.7555, + "BBH": 0.6471, + "MATH Level 5": 0.4517, + "GPQA": 0.3381, + "MUSR": 0.4403, + "MMLU-PRO": 0.5097 + } + }, + { + "model_id": "prithivMLmods/Viper-Coder-HybridMini-v1.3", + "name": "Viper-Coder-HybridMini-v1.3", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.6104, + "BBH": 0.5365, + "MATH Level 5": 0.463, + "GPQA": 0.3171, + "MUSR": 0.4505, + "MMLU-PRO": 0.4352 + } + }, + { + "model_id": "prithivMLmods/Viper-Coder-v0.1", + "name": "Viper-Coder-v0.1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5521, + "BBH": 0.6143, + "MATH Level 5": 0.327, + "GPQA": 0.354, + "MUSR": 0.4394, + "MMLU-PRO": 0.3928 + } + }, + { + "model_id": "prithivMLmods/Viper-Coder-v1.1", + "name": "Viper-Coder-v1.1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4432, + "BBH": 0.6492, + "MATH Level 5": 0.5461, + "GPQA": 0.401, + "MUSR": 0.5219, + "MMLU-PRO": 0.5232 + } + }, + { + "model_id": "prithivMLmods/Viper-Coder-v1.6-r999", + "name": "Viper-Coder-v1.6-r999", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4433, + "BBH": 0.6492, + "MATH Level 5": 0.5657, + "GPQA": 0.401, + "MUSR": 0.5219, + "MMLU-PRO": 0.5232 + } + }, + { + "model_id": "prithivMLmods/Viper-Coder-v1.7-Vsm6", + "name": "Viper-Coder-v1.7-Vsm6", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5004, + "BBH": 0.6502, + "MATH Level 5": 0.4645, + "GPQA": 0.3968, + "MUSR": 0.4768, + "MMLU-PRO": 0.5288 + } + }, + { + "model_id": "prithivMLmods/Viper-OneCoder-UIGEN", + "name": "Viper-OneCoder-UIGEN", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.4692, + "BBH": 0.6047, + "MATH Level 5": 0.3867, + "GPQA": 0.3423, + "MUSR": 0.4514, + "MMLU-PRO": 0.3904 + } + }, + { + "model_id": "prithivMLmods/Volans-Opus-14B-Exp", + "name": "Volans-Opus-14B-Exp", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5868, + "BBH": 0.6521, + "MATH Level 5": 0.4252, + "GPQA": 0.3851, + "MUSR": 0.4872, + "MMLU-PRO": 0.5385 + } + }, + { + "model_id": "prithivMLmods/WebMind-7B-v0.1", + "name": "WebMind-7B-v0.1", + "developer": "prithivMLmods", + "scores": { + "IFEval": 0.5278, + "BBH": 0.5434, + "MATH Level 5": 0.3648, + "GPQA": 0.3171, + "MUSR": 0.4537, + "MMLU-PRO": 0.4279 + } + }, + { + "model_id": "pszemraj/Llama-3-6.3b-v0.1", + "name": "Llama-3-6.3b-v0.1", + "developer": "pszemraj", + "scores": { + "IFEval": 0.1044, + "BBH": 0.4197, + "MATH Level 5": 0.0211, + "GPQA": 0.2836, + "MUSR": 0.3908, + "MMLU-PRO": 0.284 + } + }, + { + "model_id": "pszemraj/Mistral-v0.3-6B", + "name": "Mistral-v0.3-6B", + "developer": "pszemraj", + "scores": { + "IFEval": 0.2454, + "BBH": 0.3774, + "MATH Level 5": 0.0136, + "GPQA": 0.2651, + "MUSR": 0.3908, + "MMLU-PRO": 0.2143 + } + }, + { + "model_id": "qingy2019/LLaMa_3.2_3B_Catalysts", + "name": "LLaMa_3.2_3B_Catalysts", + "developer": "qingy2019", + "scores": { + "IFEval": 0.4992, + "BBH": 0.4468, + "MATH Level 5": 0.1292, + "GPQA": 0.2886, + "MUSR": 0.3788, + "MMLU-PRO": 0.3008 + } + }, + { + "model_id": "qingy2019/OpenMath2-Llama3.1-8B", + "name": "OpenMath2-Llama3.1-8B", + "developer": "qingy2019", + "scores": { + "IFEval": 0.2331, + "BBH": 0.4096, + "MATH Level 5": 0.2674, + "GPQA": 0.2651, + "MUSR": 0.3436, + "MMLU-PRO": 0.1553 + } + }, + { + "model_id": "qingy2019/Oracle-14B", + "name": "Oracle-14B", + "developer": "qingy2019", + "scores": { + "IFEval": 0.2358, + "BBH": 0.4612, + "MATH Level 5": 0.0642, + "GPQA": 0.2576, + "MUSR": 0.3717, + "MMLU-PRO": 0.2382 + } + }, + { + "model_id": "qingy2019/Qwen2.5-Math-14B-Instruct", + "name": "Qwen2.5-Math-14B-Instruct", + "developer": "qingy2019", + "scores": { + "IFEval": 0.6066, + "BBH": 0.635, + "MATH Level 5": 0.3716, + "GPQA": 0.3725, + "MUSR": 0.4757, + "MMLU-PRO": 0.5331 + } + }, + { + "model_id": "qingy2019/Qwen2.5-Math-14B-Instruct-Alpha", + "name": "Qwen2.5-Math-14B-Instruct-Alpha", + "developer": "qingy2019", + "scores": { + "IFEval": 0.5981, + "BBH": 0.6375, + "MATH Level 5": 0.3142, + "GPQA": 0.37, + "MUSR": 0.4649, + "MMLU-PRO": 0.5331 + } + }, + { + "model_id": "qingy2019/Qwen2.5-Math-14B-Instruct-Pro", + "name": "Qwen2.5-Math-14B-Instruct-Pro", + "developer": "qingy2019", + "scores": { + "IFEval": 0.1922, + "BBH": 0.5319, + "MATH Level 5": 0.284, + "GPQA": 0.3112, + "MUSR": 0.374, + "MMLU-PRO": 0.3558 + } + }, + { + "model_id": "qingy2019/Qwen2.5-Ultimate-14B-Instruct", + "name": "Qwen2.5-Ultimate-14B-Instruct", + "developer": "qingy2019", + "scores": { + "IFEval": 0.3938, + "BBH": 0.5842, + "MATH Level 5": 0.2893, + "GPQA": 0.3565, + "MUSR": 0.4135, + "MMLU-PRO": 0.4929 + } + }, + { + "model_id": "qingy2024/Benchmaxx-Llama-3.2-1B-Instruct", + "name": "Benchmaxx-Llama-3.2-1B-Instruct", + "developer": "qingy2024", + "scores": { + "IFEval": 0.2014, + "BBH": 0.8269, + "MATH Level 5": 0.4804, + "GPQA": 0.2836, + "MUSR": 0.3446, + "MMLU-PRO": 0.1113 + } + }, + { + "model_id": "qingy2024/Eyas-17B-Instruct", + "name": "Eyas-17B-Instruct", + "developer": "qingy2024", + "scores": { + "IFEval": 0.6575, + "BBH": 0.6085, + "MATH Level 5": 0.247, + "GPQA": 0.3146, + "MUSR": 0.4522, + "MMLU-PRO": 0.4343 + } + }, + { + "model_id": "qingy2024/Falcon3-2x10B-MoE-Instruct", + "name": "Falcon3-2x10B-MoE-Instruct", + "developer": "qingy2024", + "scores": { + "IFEval": 0.785, + "BBH": 0.6185, + "MATH Level 5": 0.2795, + "GPQA": 0.3305, + "MUSR": 0.4284, + "MMLU-PRO": 0.4423 + } + }, + { + "model_id": "qingy2024/Fusion-14B-Instruct", + "name": "Fusion-14B-Instruct", + "developer": "qingy2024", + "scores": { + "IFEval": 0.726, + "BBH": 0.6396, + "MATH Level 5": 0.3369, + "GPQA": 0.3549, + "MUSR": 0.44, + "MMLU-PRO": 0.5044 + } + }, + { + "model_id": "qingy2024/Fusion2-14B-Instruct", + "name": "Fusion2-14B-Instruct", + "developer": "qingy2024", + "scores": { + "IFEval": 0.6064, + "BBH": 0.6119, + "MATH Level 5": 0.3127, + "GPQA": 0.3448, + "MUSR": 0.4634, + "MMLU-PRO": 0.5051 + } + }, + { + "model_id": "qingy2024/Fusion4-14B-Instruct", + "name": "Fusion4-14B-Instruct", + "developer": "qingy2024", + "scores": { + "IFEval": 0.7649, + "BBH": 0.6543, + "MATH Level 5": 0.3882, + "GPQA": 0.3305, + "MUSR": 0.4326, + "MMLU-PRO": 0.5194 + } + }, + { + "model_id": "qingy2024/OwO-14B-Instruct", + "name": "OwO-14B-Instruct", + "developer": "qingy2024", + "scores": { + "IFEval": 0.1383, + "BBH": 0.6165, + "MATH Level 5": 0.4162, + "GPQA": 0.3641, + "MUSR": 0.4407, + "MMLU-PRO": 0.5181 + } + }, + { + "model_id": "qingy2024/QwEnlarge-16B-Instruct", + "name": "QwEnlarge-16B-Instruct", + "developer": "qingy2024", + "scores": { + "IFEval": 0.7802, + "BBH": 0.5949, + "MATH Level 5": 0.46, + "GPQA": 0.3331, + "MUSR": 0.4101, + "MMLU-PRO": 0.4476 + } + }, + { + "model_id": "qingy2024/QwQ-14B-Math-v0.2", + "name": "QwQ-14B-Math-v0.2", + "developer": "qingy2024", + "scores": { + "IFEval": 0.3391, + "BBH": 0.5731, + "MATH Level 5": 0.4811, + "GPQA": 0.2626, + "MUSR": 0.4021, + "MMLU-PRO": 0.48 + } + }, + { + "model_id": "qingy2024/Qwarkstar-4B", + "name": "Qwarkstar-4B", + "developer": "qingy2024", + "scores": { + "IFEval": 0.1994, + "BBH": 0.4015, + "MATH Level 5": 0.0861, + "GPQA": 0.3247, + "MUSR": 0.4428, + "MMLU-PRO": 0.2425 + } + }, + { + "model_id": "qingy2024/Qwarkstar-4B-Instruct-Preview", + "name": "Qwarkstar-4B-Instruct-Preview", + "developer": "qingy2024", + "scores": { + "IFEval": 0.5324, + "BBH": 0.4358, + "MATH Level 5": 0.1284, + "GPQA": 0.2802, + "MUSR": 0.3896, + "MMLU-PRO": 0.2502 + } + }, + { + "model_id": "qingy2024/Qwen2.5-4B", + "name": "Qwen2.5-4B", + "developer": "qingy2024", + "scores": { + "IFEval": 0.2158, + "BBH": 0.4269, + "MATH Level 5": 0.0514, + "GPQA": 0.2911, + "MUSR": 0.461, + "MMLU-PRO": 0.2525 + } + }, + { + "model_id": "qingy2024/Qwen2.5-Coder-Draft-1.5B-Instruct", + "name": "Qwen2.5-Coder-Draft-1.5B-Instruct", + "developer": "qingy2024", + "scores": { + "IFEval": 0.4125, + "BBH": 0.3837, + "MATH Level 5": 0.1579, + "GPQA": 0.2601, + "MUSR": 0.358, + "MMLU-PRO": 0.2244 + } + }, + { + "model_id": "qingy2024/Qwen2.5-Math-14B-Instruct-Alpha", + "name": "Qwen2.5-Math-14B-Instruct-Alpha", + "developer": "qingy2024", + "scores": { + "IFEval": 0.7704, + "BBH": 0.6465, + "MATH Level 5": 0.429, + "GPQA": 0.349, + "MUSR": 0.4021, + "MMLU-PRO": 0.4966 + } + }, + { + "model_id": "qingy2024/Qwen2.5-Math-14B-Instruct-Preview", + "name": "Qwen2.5-Math-14B-Instruct-Preview", + "developer": "qingy2024", + "scores": { + "IFEval": 0.7826, + "BBH": 0.6294, + "MATH Level 5": 0.4758, + "GPQA": 0.3406, + "MUSR": 0.4115, + "MMLU-PRO": 0.4993 + } + }, + { + "model_id": "qingy2024/Qwen2.6-14B-Instruct", + "name": "Qwen2.6-14B-Instruct", + "developer": "qingy2024", + "scores": { + "IFEval": 0.5811, + "BBH": 0.6394, + "MATH Level 5": 0.3051, + "GPQA": 0.3792, + "MUSR": 0.4569, + "MMLU-PRO": 0.5285 + } + }, + { + "model_id": "qingy2024/Qwen2.6-Math-14B-Instruct", + "name": "Qwen2.6-Math-14B-Instruct", + "developer": "qingy2024", + "scores": { + "IFEval": 0.3862, + "BBH": 0.6324, + "MATH Level 5": 0.429, + "GPQA": 0.37, + "MUSR": 0.4759, + "MMLU-PRO": 0.5241 + } + }, + { + "model_id": "qq8933/OpenLongCoT-Base-Gemma2-2B", + "name": "OpenLongCoT-Base-Gemma2-2B", + "developer": "qq8933", + "scores": { + "IFEval": 0.1965, + "BBH": 0.3106, + "MATH Level 5": 0.0234, + "GPQA": 0.2626, + "MUSR": 0.3222, + "MMLU-PRO": 0.1316 + } + }, + { + "model_id": "raphgg/test-2.5-72B", + "name": "test-2.5-72B", + "developer": "raphgg", + "scores": { + "IFEval": 0.8437, + "BBH": 0.7266, + "MATH Level 5": 0.4109, + "GPQA": 0.3893, + "MUSR": 0.4812, + "MMLU-PRO": 0.5837 + } + }, + { + "model_id": "rasyosef/Mistral-NeMo-Minitron-8B-Chat", + "name": "Mistral-NeMo-Minitron-8B-Chat", + "developer": "rasyosef", + "scores": { + "IFEval": 0.4452, + "BBH": 0.4759, + "MATH Level 5": 0.0272, + "GPQA": 0.276, + "MUSR": 0.4304, + "MMLU-PRO": 0.2404 + } + }, + { + "model_id": "rasyosef/Phi-1_5-Instruct-v0.1", + "name": "Phi-1_5-Instruct-v0.1", + "developer": "rasyosef", + "scores": { + "IFEval": 0.2402, + "BBH": 0.3118, + "MATH Level 5": 0.0136, + "GPQA": 0.2601, + "MUSR": 0.3422, + "MMLU-PRO": 0.1562 + } + }, + { + "model_id": "rasyosef/phi-2-instruct-apo", + "name": "phi-2-instruct-apo", + "developer": "rasyosef", + "scores": { + "IFEval": 0.3146, + "BBH": 0.4445, + "MATH Level 5": 0.0302, + "GPQA": 0.2701, + "MUSR": 0.3342, + "MMLU-PRO": 0.2155 + } + }, + { + "model_id": "rasyosef/phi-2-instruct-v0.1", + "name": "phi-2-instruct-v0.1", + "developer": "rasyosef", + "scores": { + "IFEval": 0.3681, + "BBH": 0.4726, + "MATH Level 5": 0.0, + "GPQA": 0.2743, + "MUSR": 0.3524, + "MMLU-PRO": 0.2247 + } + }, + { + "model_id": "realtreetune/rho-1b-sft-MATH", + "name": "rho-1b-sft-MATH", + "developer": "realtreetune", + "scores": { + "IFEval": 0.2121, + "BBH": 0.3144, + "MATH Level 5": 0.0347, + "GPQA": 0.2525, + "MUSR": 0.3458, + "MMLU-PRO": 0.1117 + } + }, + { + "model_id": "recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp", + "name": "Gemma-2-Ataraxy-Gemmasutra-9B-slerp", + "developer": "recoilme", + "scores": { + "IFEval": 0.7649, + "BBH": 0.5974, + "MATH Level 5": 0.0174, + "GPQA": 0.3305, + "MUSR": 0.4245, + "MMLU-PRO": 0.4207 + } + }, + { + "model_id": "recoilme/recoilme-gemma-2-9B-v0.1", + "name": "recoilme-gemma-2-9B-v0.1", + "developer": "recoilme", + "scores": { + "IFEval": 0.7515, + "BBH": 0.5995, + "MATH Level 5": 0.2039, + "GPQA": 0.3389, + "MUSR": 0.4191, + "MMLU-PRO": 0.4159 + } + }, + { + "model_id": "recoilme/recoilme-gemma-2-9B-v0.2", + "name": "recoilme-gemma-2-9B-v0.2", + "developer": "recoilme", + "scores": { + "IFEval": 0.2747, + "BBH": 0.6031, + "MATH Level 5": 0.0831, + "GPQA": 0.3305, + "MUSR": 0.4686, + "MMLU-PRO": 0.4122 + } + }, + { + "model_id": "recoilme/recoilme-gemma-2-9B-v0.3", + "name": "recoilme-gemma-2-9B-v0.3", + "developer": "recoilme", + "scores": { + "IFEval": 0.7439, + "BBH": 0.5993, + "MATH Level 5": 0.0876, + "GPQA": 0.3238, + "MUSR": 0.4204, + "MMLU-PRO": 0.4072 + } + }, + { + "model_id": "recoilme/recoilme-gemma-2-9B-v0.4", + "name": "recoilme-gemma-2-9B-v0.4", + "developer": "recoilme", + "scores": { + "IFEval": 0.2562, + "BBH": 0.5967, + "MATH Level 5": 0.0846, + "GPQA": 0.3406, + "MUSR": 0.4727, + "MMLU-PRO": 0.4406 + } + }, + { + "model_id": "recoilme/recoilme-gemma-2-9B-v0.5", + "name": "recoilme-gemma-2-9B-v0.5", + "developer": "recoilme", + "scores": { + "IFEval": 0.7664, + "BBH": 0.5981, + "MATH Level 5": 0.2115, + "GPQA": 0.3364, + "MUSR": 0.4232, + "MMLU-PRO": 0.42 + } + }, + { + "model_id": "redrix/AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS", + "name": "AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS", + "developer": "redrix", + "scores": { + "IFEval": 0.536, + "BBH": 0.5129, + "MATH Level 5": 0.1133, + "GPQA": 0.3154, + "MUSR": 0.3818, + "MMLU-PRO": 0.318 + } + }, + { + "model_id": "redrix/patricide-12B-Unslop-Mell", + "name": "patricide-12B-Unslop-Mell", + "developer": "redrix", + "scores": { + "IFEval": 0.4074, + "BBH": 0.5399, + "MATH Level 5": 0.1314, + "GPQA": 0.3238, + "MUSR": 0.4026, + "MMLU-PRO": 0.357 + } + }, + { + "model_id": "refuelai/Llama-3-Refueled", + "name": "Llama-3-Refueled", + "developer": "refuelai", + "scores": { + "IFEval": 0.462, + "BBH": 0.5871, + "MATH Level 5": 0.0665, + "GPQA": 0.2995, + "MUSR": 0.4454, + "MMLU-PRO": 0.3095 + } + }, + { + "model_id": "rhplus0831/maid-yuzu-v7", + "name": "maid-yuzu-v7", + "developer": "rhplus0831", + "scores": { + "IFEval": 0.6462, + "BBH": 0.4805, + "MATH Level 5": 0.102, + "GPQA": 0.3096, + "MUSR": 0.4136, + "MMLU-PRO": 0.354 + } + }, + { + "model_id": "rhymes-ai/Aria", + "name": "Aria", + "developer": "rhymes-ai", + "scores": { + "IFEval": 0.4773, + "BBH": 0.5695, + "MATH Level 5": 0.1934, + "GPQA": 0.3624, + "MUSR": 0.4338, + "MMLU-PRO": 0.4405 + } + }, + { + "model_id": "rhysjones/phi-2-orange-v2", + "name": "phi-2-orange-v2", + "developer": "rhysjones", + "scores": { + "IFEval": 0.367, + "BBH": 0.477, + "MATH Level 5": 0.0408, + "GPQA": 0.2617, + "MUSR": 0.363, + "MMLU-PRO": 0.2532 + } + }, + { + "model_id": "riaz/FineLlama-3.1-8B", + "name": "FineLlama-3.1-8B", + "developer": "riaz", + "scores": { + "IFEval": 0.4373, + "BBH": 0.4586, + "MATH Level 5": 0.0514, + "GPQA": 0.2752, + "MUSR": 0.3763, + "MMLU-PRO": 0.2964 + } + }, + { + "model_id": "rmdhirr/Gluon-8B", + "name": "Gluon-8B", + "developer": "rmdhirr", + "scores": { + "IFEval": 0.5053, + "BBH": 0.5153, + "MATH Level 5": 0.1443, + "GPQA": 0.3121, + "MUSR": 0.4039, + "MMLU-PRO": 0.3808 + } + }, + { + "model_id": "rombodawg/Rombos-Coder-V2.5-Qwen-14b", + "name": "Rombos-Coder-V2.5-Qwen-14b", + "developer": "rombodawg", + "scores": { + "IFEval": 0.7047, + "BBH": 0.6165, + "MATH Level 5": 0.3301, + "GPQA": 0.3029, + "MUSR": 0.3915, + "MMLU-PRO": 0.3939 + } + }, + { + "model_id": "rombodawg/Rombos-Coder-V2.5-Qwen-7b", + "name": "Rombos-Coder-V2.5-Qwen-7b", + "developer": "rombodawg", + "scores": { + "IFEval": 0.621, + "BBH": 0.5077, + "MATH Level 5": 0.3338, + "GPQA": 0.2836, + "MUSR": 0.3979, + "MMLU-PRO": 0.3398 + } + }, + { + "model_id": "rombodawg/Rombos-LLM-V2.5-Qwen-0.5b", + "name": "Rombos-LLM-V2.5-Qwen-0.5b", + "developer": "rombodawg", + "scores": { + "IFEval": 0.2847, + "BBH": 0.3294, + "MATH Level 5": 0.068, + "GPQA": 0.2668, + "MUSR": 0.3236, + "MMLU-PRO": 0.1866 + } + }, + { + "model_id": "rombodawg/Rombos-LLM-V2.5-Qwen-1.5b", + "name": "Rombos-LLM-V2.5-Qwen-1.5b", + "developer": "rombodawg", + "scores": { + "IFEval": 0.3402, + "BBH": 0.4257, + "MATH Level 5": 0.0853, + "GPQA": 0.2886, + "MUSR": 0.4186, + "MMLU-PRO": 0.2922 + } + }, + { + "model_id": "rombodawg/Rombos-LLM-V2.5-Qwen-14b", + "name": "Rombos-LLM-V2.5-Qwen-14b", + "developer": "rombodawg", + "scores": { + "IFEval": 0.584, + "BBH": 0.6481, + "MATH Level 5": 0.4554, + "GPQA": 0.3716, + "MUSR": 0.4717, + "MMLU-PRO": 0.5376 + } + }, + { + "model_id": "rombodawg/Rombos-LLM-V2.5-Qwen-32b", + "name": "Rombos-LLM-V2.5-Qwen-32b", + "developer": "rombodawg", + "scores": { + "IFEval": 0.6827, + "BBH": 0.7046, + "MATH Level 5": 0.4955, + "GPQA": 0.3968, + "MUSR": 0.5034, + "MMLU-PRO": 0.5916 + } + }, + { + "model_id": "rombodawg/Rombos-LLM-V2.5-Qwen-3b", + "name": "Rombos-LLM-V2.5-Qwen-3b", + "developer": "rombodawg", + "scores": { + "IFEval": 0.5342, + "BBH": 0.4809, + "MATH Level 5": 0.2795, + "GPQA": 0.3079, + "MUSR": 0.4042, + "MMLU-PRO": 0.3761 + } + }, + { + "model_id": "rombodawg/Rombos-LLM-V2.5-Qwen-72b", + "name": "Rombos-LLM-V2.5-Qwen-72b", + "developer": "rombodawg", + "scores": { + "IFEval": 0.7155, + "BBH": 0.723, + "MATH Level 5": 0.5423, + "GPQA": 0.3985, + "MUSR": 0.4599, + "MMLU-PRO": 0.5935 + } + }, + { + "model_id": "rombodawg/Rombos-LLM-V2.5-Qwen-7b", + "name": "Rombos-LLM-V2.5-Qwen-7b", + "developer": "rombodawg", + "scores": { + "IFEval": 0.6237, + "BBH": 0.5544, + "MATH Level 5": 0.3814, + "GPQA": 0.318, + "MUSR": 0.4291, + "MMLU-PRO": 0.4469 + } + }, + { + "model_id": "rombodawg/Rombos-LLM-V2.5.1-Qwen-3b", + "name": "Rombos-LLM-V2.5.1-Qwen-3b", + "developer": "rombodawg", + "scores": { + "IFEval": 0.2595, + "BBH": 0.3884, + "MATH Level 5": 0.0914, + "GPQA": 0.2743, + "MUSR": 0.3991, + "MMLU-PRO": 0.2719 + } + }, + { + "model_id": "rombodawg/Rombos-LLM-V2.6-Nemotron-70b", + "name": "Rombos-LLM-V2.6-Nemotron-70b", + "developer": "rombodawg", + "scores": { + "IFEval": 0.7527, + "BBH": 0.6938, + "MATH Level 5": 0.3331, + "GPQA": 0.406, + "MUSR": 0.4669, + "MMLU-PRO": 0.5329 + } + }, + { + "model_id": "rombodawg/Rombos-LLM-V2.6-Qwen-14b", + "name": "Rombos-LLM-V2.6-Qwen-14b", + "developer": "rombodawg", + "scores": { + "IFEval": 0.8432, + "BBH": 0.6442, + "MATH Level 5": 0.5211, + "GPQA": 0.3339, + "MUSR": 0.4221, + "MMLU-PRO": 0.4961 + } + }, + { + "model_id": "rombodawg/rombos_Replete-Coder-Instruct-8b-Merged", + "name": "rombos_Replete-Coder-Instruct-8b-Merged", + "developer": "rombodawg", + "scores": { + "IFEval": 0.5388, + "BBH": 0.4462, + "MATH Level 5": 0.0778, + "GPQA": 0.2693, + "MUSR": 0.366, + "MMLU-PRO": 0.1809 + } + }, + { + "model_id": "rombodawg/rombos_Replete-Coder-Llama3-8B", + "name": "rombos_Replete-Coder-Llama3-8B", + "developer": "rombodawg", + "scores": { + "IFEval": 0.4714, + "BBH": 0.3276, + "MATH Level 5": 0.0393, + "GPQA": 0.2668, + "MUSR": 0.3966, + "MMLU-PRO": 0.1335 + } + }, + { + "model_id": "rootxhacker/Apollo-70B", + "name": "Apollo-70B", + "developer": "rootxhacker", + "scores": { + "IFEval": 0.5099, + "BBH": 0.6804, + "MATH Level 5": 0.5612, + "GPQA": 0.4572, + "MUSR": 0.4948, + "MMLU-PRO": 0.5279 + } + }, + { + "model_id": "rootxhacker/Apollo_v2-32B", + "name": "Apollo_v2-32B", + "developer": "rootxhacker", + "scores": { + "IFEval": 0.428, + "BBH": 0.7072, + "MATH Level 5": 0.4275, + "GPQA": 0.3784, + "MUSR": 0.4994, + "MMLU-PRO": 0.5869 + } + }, + { + "model_id": "rootxhacker/apollo-7B", + "name": "apollo-7B", + "developer": "rootxhacker", + "scores": { + "IFEval": 0.2953, + "BBH": 0.3636, + "MATH Level 5": 0.0257, + "GPQA": 0.2785, + "MUSR": 0.4131, + "MMLU-PRO": 0.1748 + } + }, + { + "model_id": "rsh345/mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B", + "name": "mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B", + "developer": "rsh345", + "scores": { + "IFEval": 0.3892, + "BBH": 0.5188, + "MATH Level 5": 0.0733, + "GPQA": 0.3029, + "MUSR": 0.4672, + "MMLU-PRO": 0.3054 + } + }, + { + "model_id": "rubenroy/Geneva-12B-GCv2-5m", + "name": "Geneva-12B-GCv2-5m", + "developer": "rubenroy", + "scores": { + "IFEval": 0.2586, + "BBH": 0.5278, + "MATH Level 5": 0.0801, + "GPQA": 0.2878, + "MUSR": 0.3525, + "MMLU-PRO": 0.325 + } + }, + { + "model_id": "rubenroy/Gilgamesh-72B", + "name": "Gilgamesh-72B", + "developer": "rubenroy", + "scores": { + "IFEval": 0.8486, + "BBH": 0.7253, + "MATH Level 5": 0.4381, + "GPQA": 0.3943, + "MUSR": 0.4626, + "MMLU-PRO": 0.5802 + } + }, + { + "model_id": "rubenroy/Zurich-14B-GCv2-5m", + "name": "Zurich-14B-GCv2-5m", + "developer": "rubenroy", + "scores": { + "IFEval": 0.6164, + "BBH": 0.6308, + "MATH Level 5": 0.3074, + "GPQA": 0.3616, + "MUSR": 0.4874, + "MMLU-PRO": 0.5233 + } + }, + { + "model_id": "ruizhe1217/sft-s1-qwen-0.5b", + "name": "sft-s1-qwen-0.5b", + "developer": "ruizhe1217", + "scores": { + "IFEval": 0.2749, + "BBH": 0.3301, + "MATH Level 5": 0.0619, + "GPQA": 0.271, + "MUSR": 0.3196, + "MMLU-PRO": 0.1892 + } + }, + { + "model_id": "rwitz/go-bruins-v2", + "name": "go-bruins-v2", + "developer": "rwitz", + "scores": { + "IFEval": 0.4096, + "BBH": 0.3799, + "MATH Level 5": 0.0672, + "GPQA": 0.2626, + "MUSR": 0.4138, + "MMLU-PRO": 0.2761 + } + }, + { + "model_id": "sabersaleh/Llama2-7B-CPO", + "name": "Llama2-7B-CPO", + "developer": "sabersaleh", + "scores": { + "IFEval": 0.1545, + "BBH": 0.3458, + "MATH Level 5": 0.0136, + "GPQA": 0.2676, + "MUSR": 0.4048, + "MMLU-PRO": 0.1606 + } + }, + { + "model_id": "sabersaleh/Llama2-7B-DPO", + "name": "Llama2-7B-DPO", + "developer": "sabersaleh", + "scores": { + "IFEval": 0.1453, + "BBH": 0.3512, + "MATH Level 5": 0.0159, + "GPQA": 0.2685, + "MUSR": 0.4114, + "MMLU-PRO": 0.1626 + } + }, + { + "model_id": "sabersaleh/Llama2-7B-IPO", + "name": "Llama2-7B-IPO", + "developer": "sabersaleh", + "scores": { + "IFEval": 0.1769, + "BBH": 0.3475, + "MATH Level 5": 0.0159, + "GPQA": 0.2676, + "MUSR": 0.4048, + "MMLU-PRO": 0.1617 + } + }, + { + "model_id": "sabersaleh/Llama2-7B-KTO", + "name": "Llama2-7B-KTO", + "developer": "sabersaleh", + "scores": { + "IFEval": 0.1528, + "BBH": 0.3501, + "MATH Level 5": 0.0189, + "GPQA": 0.2676, + "MUSR": 0.4167, + "MMLU-PRO": 0.1636 + } + }, + { + "model_id": "sabersaleh/Llama2-7B-SPO", + "name": "Llama2-7B-SPO", + "developer": "sabersaleh", + "scores": { + "IFEval": 0.1567, + "BBH": 0.3383, + "MATH Level 5": 0.0196, + "GPQA": 0.2768, + "MUSR": 0.3874, + "MMLU-PRO": 0.1757 + } + }, + { + "model_id": "sabersaleh/Llama2-7B-SimPO", + "name": "Llama2-7B-SimPO", + "developer": "sabersaleh", + "scores": { + "IFEval": 0.1659, + "BBH": 0.3489, + "MATH Level 5": 0.0159, + "GPQA": 0.271, + "MUSR": 0.4007, + "MMLU-PRO": 0.1641 + } + }, + { + "model_id": "sabersaleh/Llama3", + "name": "Llama3", + "developer": "sabersaleh", + "scores": { + "IFEval": 0.3321, + "BBH": 0.4782, + "MATH Level 5": 0.0566, + "GPQA": 0.3104, + "MUSR": 0.3933, + "MMLU-PRO": 0.3162 + } + }, + { + "model_id": "sabersalehk/Llama3-001-300", + "name": "Llama3-001-300", + "developer": "sabersalehk", + "scores": { + "IFEval": 0.3179, + "BBH": 0.4745, + "MATH Level 5": 0.0529, + "GPQA": 0.2995, + "MUSR": 0.4064, + "MMLU-PRO": 0.3158 + } + }, + { + "model_id": "sabersalehk/Llama3-SimPO", + "name": "Llama3-SimPO", + "developer": "sabersalehk", + "scores": { + "IFEval": 0.3642, + "BBH": 0.4874, + "MATH Level 5": 0.0574, + "GPQA": 0.3079, + "MUSR": 0.4046, + "MMLU-PRO": 0.3157 + } + }, + { + "model_id": "sabersalehk/Llama3_001_200", + "name": "Llama3_001_200", + "developer": "sabersalehk", + "scores": { + "IFEval": 0.3218, + "BBH": 0.4728, + "MATH Level 5": 0.0514, + "GPQA": 0.3037, + "MUSR": 0.4037, + "MMLU-PRO": 0.3183 + } + }, + { + "model_id": "sabersalehk/Llama3_01_300", + "name": "Llama3_01_300", + "developer": "sabersalehk", + "scores": { + "IFEval": 0.2959, + "BBH": 0.4691, + "MATH Level 5": 0.0498, + "GPQA": 0.3079, + "MUSR": 0.4065, + "MMLU-PRO": 0.3124 + } + }, + { + "model_id": "saishf/Fimbulvetr-Kuro-Lotus-10.7B", + "name": "Fimbulvetr-Kuro-Lotus-10.7B", + "developer": "saishf", + "scores": { + "IFEval": 0.4939, + "BBH": 0.4342, + "MATH Level 5": 0.0536, + "GPQA": 0.3012, + "MUSR": 0.4445, + "MMLU-PRO": 0.3389 + } + }, + { + "model_id": "saishf/Neural-SOVLish-Devil-8B-L3", + "name": "Neural-SOVLish-Devil-8B-L3", + "developer": "saishf", + "scores": { + "IFEval": 0.4199, + "BBH": 0.5142, + "MATH Level 5": 0.0891, + "GPQA": 0.3079, + "MUSR": 0.411, + "MMLU-PRO": 0.3807 + } + }, + { + "model_id": "saishshinde15/TethysAI_Base_Reasoning", + "name": "TethysAI_Base_Reasoning", + "developer": "saishshinde15", + "scores": { + "IFEval": 0.6369, + "BBH": 0.4519, + "MATH Level 5": 0.3142, + "GPQA": 0.2861, + "MUSR": 0.4075, + "MMLU-PRO": 0.3236 + } + }, + { + "model_id": "saishshinde15/TethysAI_Vortex", + "name": "TethysAI_Vortex", + "developer": "saishshinde15", + "scores": { + "IFEval": 0.4298, + "BBH": 0.4749, + "MATH Level 5": 0.315, + "GPQA": 0.3054, + "MUSR": 0.4458, + "MMLU-PRO": 0.3241 + } + }, + { + "model_id": "saishshinde15/TethysAI_Vortex_Reasoning", + "name": "TethysAI_Vortex_Reasoning", + "developer": "saishshinde15", + "scores": { + "IFEval": 0.4021, + "BBH": 0.4694, + "MATH Level 5": 0.2145, + "GPQA": 0.3045, + "MUSR": 0.4084, + "MMLU-PRO": 0.3381 + } + }, + { + "model_id": "sakaltcommunity/novablast-preview", + "name": "novablast-preview", + "developer": "sakaltcommunity", + "scores": { + "IFEval": 0.453, + "BBH": 0.7043, + "MATH Level 5": 0.4894, + "GPQA": 0.3817, + "MUSR": 0.5021, + "MMLU-PRO": 0.5915 + } + }, + { + "model_id": "sakaltcommunity/sakaltum-7b", + "name": "sakaltum-7b", + "developer": "sakaltcommunity", + "scores": { + "IFEval": 0.2604, + "BBH": 0.4575, + "MATH Level 5": 0.0295, + "GPQA": 0.2727, + "MUSR": 0.3775, + "MMLU-PRO": 0.2769 + } + }, + { + "model_id": "sakhan10/quantized_open_llama_3b_v2", + "name": "quantized_open_llama_3b_v2", + "developer": "sakhan10", + "scores": { + "IFEval": 0.1872, + "BBH": 0.302, + "MATH Level 5": 0.0, + "GPQA": 0.2768, + "MUSR": 0.3682, + "MMLU-PRO": 0.1095 + } + }, + { + "model_id": "saltlux/luxia-21.4b-alignment-v1.0", + "name": "luxia-21.4b-alignment-v1.0", + "developer": "saltlux", + "scores": { + "IFEval": 0.3693, + "BBH": 0.6373, + "MATH Level 5": 0.0974, + "GPQA": 0.3012, + "MUSR": 0.4328, + "MMLU-PRO": 0.3403 + } + }, + { + "model_id": "saltlux/luxia-21.4b-alignment-v1.2", + "name": "luxia-21.4b-alignment-v1.2", + "developer": "saltlux", + "scores": { + "IFEval": 0.4115, + "BBH": 0.6371, + "MATH Level 5": 0.0846, + "GPQA": 0.3079, + "MUSR": 0.4459, + "MMLU-PRO": 0.3473 + } + }, + { + "model_id": "sam-paech/Darkest-muse-v1", + "name": "Darkest-muse-v1", + "developer": "sam-paech", + "scores": { + "IFEval": 0.7344, + "BBH": 0.5968, + "MATH Level 5": 0.2145, + "GPQA": 0.344, + "MUSR": 0.4502, + "MMLU-PRO": 0.4184 + } + }, + { + "model_id": "sam-paech/Delirium-v1", + "name": "Delirium-v1", + "developer": "sam-paech", + "scores": { + "IFEval": 0.7208, + "BBH": 0.5962, + "MATH Level 5": 0.2107, + "GPQA": 0.3431, + "MUSR": 0.4514, + "MMLU-PRO": 0.419 + } + }, + { + "model_id": "sam-paech/Quill-v1", + "name": "Quill-v1", + "developer": "sam-paech", + "scores": { + "IFEval": 0.7122, + "BBH": 0.5969, + "MATH Level 5": 0.2122, + "GPQA": 0.3398, + "MUSR": 0.4555, + "MMLU-PRO": 0.4171 + } + }, + { + "model_id": "sarvamai/OpenHathi-7B-Hi-v0.1-Base", + "name": "OpenHathi-7B-Hi-v0.1-Base", + "developer": "sarvamai", + "scores": { + "IFEval": 0.1804, + "BBH": 0.3354, + "MATH Level 5": 0.0083, + "GPQA": 0.2534, + "MUSR": 0.3658, + "MMLU-PRO": 0.1543 + } + }, + { + "model_id": "schnapss/testmerge-7b", + "name": "testmerge-7b", + "developer": "schnapss", + "scores": { + "IFEval": 0.3922, + "BBH": 0.5187, + "MATH Level 5": 0.0687, + "GPQA": 0.2961, + "MUSR": 0.4686, + "MMLU-PRO": 0.306 + } + }, + { + "model_id": "sci-m-wang/Mistral-7B-Instruct-sa-v0.1", + "name": "Mistral-7B-Instruct-sa-v0.1", + "developer": "sci-m-wang", + "scores": { + "IFEval": 0.4335, + "BBH": 0.3273, + "MATH Level 5": 0.0144, + "GPQA": 0.2592, + "MUSR": 0.39, + "MMLU-PRO": 0.2362 + } + }, + { + "model_id": "sci-m-wang/Phi-3-mini-4k-instruct-sa-v0.1", + "name": "Phi-3-mini-4k-instruct-sa-v0.1", + "developer": "sci-m-wang", + "scores": { + "IFEval": 0.5021, + "BBH": 0.5502, + "MATH Level 5": 0.148, + "GPQA": 0.3289, + "MUSR": 0.4073, + "MMLU-PRO": 0.3985 + } + }, + { + "model_id": "sci-m-wang/deepseek-llm-7b-chat-sa-v0.1", + "name": "deepseek-llm-7b-chat-sa-v0.1", + "developer": "sci-m-wang", + "scores": { + "IFEval": 0.4036, + "BBH": 0.3718, + "MATH Level 5": 0.0264, + "GPQA": 0.2567, + "MUSR": 0.4173, + "MMLU-PRO": 0.2209 + } + }, + { + "model_id": "securin/Securin-LLM-V2.5-Qwen-1.5B", + "name": "Securin-LLM-V2.5-Qwen-1.5B", + "developer": "securin", + "scores": { + "IFEval": 0.1492, + "BBH": 0.3158, + "MATH Level 5": 0.0249, + "GPQA": 0.25, + "MUSR": 0.3606, + "MMLU-PRO": 0.1615 + } + }, + { + "model_id": "senseable/WestLake-7B-v2", + "name": "WestLake-7B-v2", + "developer": "senseable", + "scores": { + "IFEval": 0.4419, + "BBH": 0.4073, + "MATH Level 5": 0.0483, + "GPQA": 0.2768, + "MUSR": 0.3937, + "MMLU-PRO": 0.2764 + } + }, + { + "model_id": "sequelbox/Llama3.1-70B-PlumChat", + "name": "Llama3.1-70B-PlumChat", + "developer": "sequelbox", + "scores": { + "IFEval": 0.5616, + "BBH": 0.6753, + "MATH Level 5": 0.3029, + "GPQA": 0.3909, + "MUSR": 0.4774, + "MMLU-PRO": 0.5164 + } + }, + { + "model_id": "sequelbox/Llama3.1-8B-MOTH", + "name": "Llama3.1-8B-MOTH", + "developer": "sequelbox", + "scores": { + "IFEval": 0.5245, + "BBH": 0.4902, + "MATH Level 5": 0.1216, + "GPQA": 0.2685, + "MUSR": 0.3689, + "MMLU-PRO": 0.3339 + } + }, + { + "model_id": "sequelbox/Llama3.1-8B-PlumChat", + "name": "Llama3.1-8B-PlumChat", + "developer": "sequelbox", + "scores": { + "IFEval": 0.4243, + "BBH": 0.3873, + "MATH Level 5": 0.0363, + "GPQA": 0.2651, + "MUSR": 0.3755, + "MMLU-PRO": 0.2127 + } + }, + { + "model_id": "sequelbox/Llama3.1-8B-PlumCode", + "name": "Llama3.1-8B-PlumCode", + "developer": "sequelbox", + "scores": { + "IFEval": 0.2045, + "BBH": 0.3368, + "MATH Level 5": 0.0272, + "GPQA": 0.276, + "MUSR": 0.3773, + "MMLU-PRO": 0.2335 + } + }, + { + "model_id": "sequelbox/Llama3.1-8B-PlumMath", + "name": "Llama3.1-8B-PlumMath", + "developer": "sequelbox", + "scores": { + "IFEval": 0.2242, + "BBH": 0.4032, + "MATH Level 5": 0.0476, + "GPQA": 0.318, + "MUSR": 0.3919, + "MMLU-PRO": 0.2975 + } + }, + { + "model_id": "sequelbox/gemma-2-9B-MOTH", + "name": "gemma-2-9B-MOTH", + "developer": "sequelbox", + "scores": { + "IFEval": 0.2059, + "BBH": 0.308, + "MATH Level 5": 0.0106, + "GPQA": 0.2601, + "MUSR": 0.3409, + "MMLU-PRO": 0.114 + } + }, + { + "model_id": "sethuiyer/Llama-3.1-8B-Experimental-1206-Instruct", + "name": "Llama-3.1-8B-Experimental-1206-Instruct", + "developer": "sethuiyer", + "scores": { + "IFEval": 0.6967, + "BBH": 0.5104, + "MATH Level 5": 0.1118, + "GPQA": 0.2995, + "MUSR": 0.3966, + "MMLU-PRO": 0.3529 + } + }, + { + "model_id": "sethuiyer/Llama-3.1-8B-Experimental-1208-Instruct", + "name": "Llama-3.1-8B-Experimental-1208-Instruct", + "developer": "sethuiyer", + "scores": { + "IFEval": 0.61, + "BBH": 0.4964, + "MATH Level 5": 0.0891, + "GPQA": 0.2961, + "MUSR": 0.379, + "MMLU-PRO": 0.3511 + } + }, + { + "model_id": "sethuiyer/LlamaZero-3.1-8B-Experimental-1208", + "name": "LlamaZero-3.1-8B-Experimental-1208", + "developer": "sethuiyer", + "scores": { + "IFEval": 0.6051, + "BBH": 0.4981, + "MATH Level 5": 0.108, + "GPQA": 0.2685, + "MUSR": 0.382, + "MMLU-PRO": 0.3 + } + }, + { + "model_id": "sethuiyer/Llamaverse-3.1-8B-Instruct", + "name": "Llamaverse-3.1-8B-Instruct", + "developer": "sethuiyer", + "scores": { + "IFEval": 0.6185, + "BBH": 0.5414, + "MATH Level 5": 0.1858, + "GPQA": 0.2911, + "MUSR": 0.3762, + "MMLU-PRO": 0.3523 + } + }, + { + "model_id": "sethuiyer/Llamazing-3.1-8B-Instruct", + "name": "Llamazing-3.1-8B-Instruct", + "developer": "sethuiyer", + "scores": { + "IFEval": 0.5711, + "BBH": 0.5291, + "MATH Level 5": 0.0544, + "GPQA": 0.3121, + "MUSR": 0.3976, + "MMLU-PRO": 0.3606 + } + }, + { + "model_id": "sethuiyer/Qwen2.5-7B-Anvita", + "name": "Qwen2.5-7B-Anvita", + "developer": "sethuiyer", + "scores": { + "IFEval": 0.648, + "BBH": 0.5466, + "MATH Level 5": 0.2017, + "GPQA": 0.3272, + "MUSR": 0.4337, + "MMLU-PRO": 0.4166 + } + }, + { + "model_id": "shadowml/BeagSake-7B", + "name": "BeagSake-7B", + "developer": "shadowml", + "scores": { + "IFEval": 0.5216, + "BBH": 0.4711, + "MATH Level 5": 0.0506, + "GPQA": 0.281, + "MUSR": 0.4124, + "MMLU-PRO": 0.2585 + } + }, + { + "model_id": "shadowml/Mixolar-4x7b", + "name": "Mixolar-4x7b", + "developer": "shadowml", + "scores": { + "IFEval": 0.3893, + "BBH": 0.5216, + "MATH Level 5": 0.0582, + "GPQA": 0.2928, + "MUSR": 0.4258, + "MMLU-PRO": 0.3305 + } + }, + { + "model_id": "shastraai/Shastra-LLAMA2-Math-Commonsense-SFT", + "name": "Shastra-LLAMA2-Math-Commonsense-SFT", + "developer": "shastraai", + "scores": { + "IFEval": 0.3042, + "BBH": 0.3843, + "MATH Level 5": 0.0174, + "GPQA": 0.2592, + "MUSR": 0.3604, + "MMLU-PRO": 0.1997 + } + }, + { + "model_id": "shivam9980/NEPALI-LLM", + "name": "NEPALI-LLM", + "developer": "shivam9980", + "scores": { + "IFEval": 0.0417, + "BBH": 0.3828, + "MATH Level 5": 0.0091, + "GPQA": 0.2617, + "MUSR": 0.4122, + "MMLU-PRO": 0.2064 + } + }, + { + "model_id": "shivam9980/mistral-7b-news-cnn-merged", + "name": "mistral-7b-news-cnn-merged", + "developer": "shivam9980", + "scores": { + "IFEval": 0.4634, + "BBH": 0.3635, + "MATH Level 5": 0.0189, + "GPQA": 0.3087, + "MUSR": 0.4523, + "MMLU-PRO": 0.2827 + } + }, + { + "model_id": "shivank21/mistral_dpo_self", + "name": "mistral_dpo_self", + "developer": "shivank21", + "scores": { + "IFEval": 0.3403, + "BBH": 0.3216, + "MATH Level 5": 0.0219, + "GPQA": 0.2408, + "MUSR": 0.3247, + "MMLU-PRO": 0.2214 + } + }, + { + "model_id": "shuttleai/shuttle-3", + "name": "shuttle-3", + "developer": "shuttleai", + "scores": { + "IFEval": 0.8154, + "BBH": 0.742, + "MATH Level 5": 0.46, + "GPQA": 0.4119, + "MUSR": 0.4377, + "MMLU-PRO": 0.5716 + } + }, + { + "model_id": "shyamieee/Padma-v7.0", + "name": "Padma-v7.0", + "developer": "shyamieee", + "scores": { + "IFEval": 0.3841, + "BBH": 0.5119, + "MATH Level 5": 0.0702, + "GPQA": 0.2861, + "MUSR": 0.4386, + "MMLU-PRO": 0.3029 + } + }, + { + "model_id": "silma-ai/SILMA-9B-Instruct-v1.0", + "name": "SILMA-9B-Instruct-v1.0", + "developer": "silma-ai", + "scores": { + "IFEval": 0.5842, + "BBH": 0.5219, + "MATH Level 5": 0.1163, + "GPQA": 0.3054, + "MUSR": 0.4637, + "MMLU-PRO": 0.392 + } + }, + { + "model_id": "silma-ai/SILMA-Kashif-2B-Instruct-v1.0", + "name": "SILMA-Kashif-2B-Instruct-v1.0", + "developer": "silma-ai", + "scores": { + "IFEval": 0.1181, + "BBH": 0.3793, + "MATH Level 5": 0.0113, + "GPQA": 0.2701, + "MUSR": 0.4043, + "MMLU-PRO": 0.2258 + } + }, + { + "model_id": "siqi00/Mistral-7B-DFT", + "name": "Mistral-7B-DFT", + "developer": "siqi00", + "scores": { + "IFEval": 0.5569, + "BBH": 0.4665, + "MATH Level 5": 0.0378, + "GPQA": 0.3045, + "MUSR": 0.4191, + "MMLU-PRO": 0.2963 + } + }, + { + "model_id": "siqi00/Mistral-7B-DFT2", + "name": "Mistral-7B-DFT2", + "developer": "siqi00", + "scores": { + "IFEval": 0.5804, + "BBH": 0.3968, + "MATH Level 5": 0.0453, + "GPQA": 0.2995, + "MUSR": 0.4401, + "MMLU-PRO": 0.2852 + } + }, + { + "model_id": "skumar9/Llama-medx_v2", + "name": "Llama-medx_v2", + "developer": "skumar9", + "scores": { + "IFEval": 0.4462, + "BBH": 0.4909, + "MATH Level 5": 0.0914, + "GPQA": 0.3054, + "MUSR": 0.3661, + "MMLU-PRO": 0.3463 + } + }, + { + "model_id": "skymizer/Llama2-7b-sft-chat-custom-template-dpo", + "name": "Llama2-7b-sft-chat-custom-template-dpo", + "developer": "skymizer", + "scores": { + "IFEval": 0.2353, + "BBH": 0.3688, + "MATH Level 5": 0.0144, + "GPQA": 0.2391, + "MUSR": 0.4429, + "MMLU-PRO": 0.1946 + } + }, + { + "model_id": "someon98/qwen-CoMa-0.5b", + "name": "qwen-CoMa-0.5b", + "developer": "someon98", + "scores": { + "IFEval": 0.2277, + "BBH": 0.2953, + "MATH Level 5": 0.0045, + "GPQA": 0.2399, + "MUSR": 0.4046, + "MMLU-PRO": 0.1099 + } + }, + { + "model_id": "sometimesanotion/ChocoTrio-14B-v1", + "name": "ChocoTrio-14B-v1", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.7089, + "BBH": 0.6506, + "MATH Level 5": 0.3973, + "GPQA": 0.3851, + "MUSR": 0.4821, + "MMLU-PRO": 0.537 + } + }, + { + "model_id": "sometimesanotion/IF-reasoning-experiment-40", + "name": "IF-reasoning-experiment-40", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.633, + "BBH": 0.6112, + "MATH Level 5": 0.3716, + "GPQA": 0.38, + "MUSR": 0.5194, + "MMLU-PRO": 0.5025 + } + }, + { + "model_id": "sometimesanotion/IF-reasoning-experiment-80", + "name": "IF-reasoning-experiment-80", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.5463, + "BBH": 0.421, + "MATH Level 5": 0.0989, + "GPQA": 0.2844, + "MUSR": 0.5025, + "MMLU-PRO": 0.3368 + } + }, + { + "model_id": "sometimesanotion/KytheraMix-7B-v0.2", + "name": "KytheraMix-7B-v0.2", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6129, + "BBH": 0.5635, + "MATH Level 5": 0.2923, + "GPQA": 0.3356, + "MUSR": 0.4594, + "MMLU-PRO": 0.4505 + } + }, + { + "model_id": "sometimesanotion/Lamarck-14B-v0.1-experimental", + "name": "Lamarck-14B-v0.1-experimental", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.5354, + "BBH": 0.6583, + "MATH Level 5": 0.358, + "GPQA": 0.3817, + "MUSR": 0.4728, + "MMLU-PRO": 0.5408 + } + }, + { + "model_id": "sometimesanotion/Lamarck-14B-v0.3", + "name": "Lamarck-14B-v0.3", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.5032, + "BBH": 0.6611, + "MATH Level 5": 0.3406, + "GPQA": 0.3884, + "MUSR": 0.4688, + "MMLU-PRO": 0.5411 + } + }, + { + "model_id": "sometimesanotion/Lamarck-14B-v0.4-Qwenvergence", + "name": "Lamarck-14B-v0.4-Qwenvergence", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.4906, + "BBH": 0.6535, + "MATH Level 5": 0.3399, + "GPQA": 0.3784, + "MUSR": 0.4847, + "MMLU-PRO": 0.5406 + } + }, + { + "model_id": "sometimesanotion/Lamarck-14B-v0.6", + "name": "Lamarck-14B-v0.6", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6973, + "BBH": 0.646, + "MATH Level 5": 0.4041, + "GPQA": 0.3893, + "MUSR": 0.4847, + "MMLU-PRO": 0.54 + } + }, + { + "model_id": "sometimesanotion/Lamarck-14B-v0.6-002-model_stock", + "name": "Lamarck-14B-v0.6-002-model_stock", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6692, + "BBH": 0.6143, + "MATH Level 5": 0.3776, + "GPQA": 0.3742, + "MUSR": 0.518, + "MMLU-PRO": 0.5054 + } + }, + { + "model_id": "sometimesanotion/Lamarck-14B-v0.6-model_stock", + "name": "Lamarck-14B-v0.6-model_stock", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.679, + "BBH": 0.6269, + "MATH Level 5": 0.4245, + "GPQA": 0.3842, + "MUSR": 0.5007, + "MMLU-PRO": 0.5198 + } + }, + { + "model_id": "sometimesanotion/Lamarck-14B-v0.7-Fusion", + "name": "Lamarck-14B-v0.7-Fusion", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6821, + "BBH": 0.6544, + "MATH Level 5": 0.4041, + "GPQA": 0.401, + "MUSR": 0.4991, + "MMLU-PRO": 0.5391 + } + }, + { + "model_id": "sometimesanotion/Lamarck-14B-v0.7-rc1", + "name": "Lamarck-14B-v0.7-rc1", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.7305, + "BBH": 0.6486, + "MATH Level 5": 0.3852, + "GPQA": 0.3893, + "MUSR": 0.4715, + "MMLU-PRO": 0.5416 + } + }, + { + "model_id": "sometimesanotion/Lamarck-14B-v0.7-rc4", + "name": "Lamarck-14B-v0.7-rc4", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.7211, + "BBH": 0.651, + "MATH Level 5": 0.4026, + "GPQA": 0.3893, + "MUSR": 0.4912, + "MMLU-PRO": 0.54 + } + }, + { + "model_id": "sometimesanotion/LamarckInfusion-14B-v1", + "name": "LamarckInfusion-14B-v1", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.7198, + "BBH": 0.6539, + "MATH Level 5": 0.4169, + "GPQA": 0.3909, + "MUSR": 0.4899, + "MMLU-PRO": 0.5376 + } + }, + { + "model_id": "sometimesanotion/LamarckInfusion-14B-v2", + "name": "LamarckInfusion-14B-v2", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6812, + "BBH": 0.6564, + "MATH Level 5": 0.4388, + "GPQA": 0.3876, + "MUSR": 0.4993, + "MMLU-PRO": 0.5416 + } + }, + { + "model_id": "sometimesanotion/LamarckInfusion-14B-v2-hi", + "name": "LamarckInfusion-14B-v2-hi", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6855, + "BBH": 0.6555, + "MATH Level 5": 0.423, + "GPQA": 0.3884, + "MUSR": 0.4847, + "MMLU-PRO": 0.5405 + } + }, + { + "model_id": "sometimesanotion/LamarckInfusion-14B-v2-lo", + "name": "LamarckInfusion-14B-v2-lo", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6788, + "BBH": 0.6528, + "MATH Level 5": 0.4237, + "GPQA": 0.3859, + "MUSR": 0.4991, + "MMLU-PRO": 0.5397 + } + }, + { + "model_id": "sometimesanotion/LamarckInfusion-14B-v3", + "name": "LamarckInfusion-14B-v3", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.7131, + "BBH": 0.6518, + "MATH Level 5": 0.4124, + "GPQA": 0.3867, + "MUSR": 0.482, + "MMLU-PRO": 0.5407 + } + }, + { + "model_id": "sometimesanotion/Qwen-14B-ProseStock-v4", + "name": "Qwen-14B-ProseStock-v4", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.4942, + "BBH": 0.6498, + "MATH Level 5": 0.364, + "GPQA": 0.3884, + "MUSR": 0.4938, + "MMLU-PRO": 0.5386 + } + }, + { + "model_id": "sometimesanotion/Qwen-2.5-14B-Virmarckeoso", + "name": "Qwen-2.5-14B-Virmarckeoso", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.4813, + "BBH": 0.657, + "MATH Level 5": 0.3565, + "GPQA": 0.3792, + "MUSR": 0.4794, + "MMLU-PRO": 0.5377 + } + }, + { + "model_id": "sometimesanotion/Qwen2.5-14B-Vimarckoso", + "name": "Qwen2.5-14B-Vimarckoso", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.4574, + "BBH": 0.6446, + "MATH Level 5": 0.3384, + "GPQA": 0.3926, + "MUSR": 0.4859, + "MMLU-PRO": 0.5329 + } + }, + { + "model_id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v2", + "name": "Qwen2.5-14B-Vimarckoso-v2", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.4505, + "BBH": 0.655, + "MATH Level 5": 0.358, + "GPQA": 0.3826, + "MUSR": 0.4819, + "MMLU-PRO": 0.538 + } + }, + { + "model_id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3", + "name": "Qwen2.5-14B-Vimarckoso-v3", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.7257, + "BBH": 0.6415, + "MATH Level 5": 0.4003, + "GPQA": 0.38, + "MUSR": 0.4807, + "MMLU-PRO": 0.5343 + } + }, + { + "model_id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-IF-Variant", + "name": "Qwen2.5-14B-Vimarckoso-v3-IF-Variant", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6413, + "BBH": 0.5521, + "MATH Level 5": 0.2545, + "GPQA": 0.3473, + "MUSR": 0.5319, + "MMLU-PRO": 0.4589 + } + }, + { + "model_id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-Prose01", + "name": "Qwen2.5-14B-Vimarckoso-v3-Prose01", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6872, + "BBH": 0.6359, + "MATH Level 5": 0.3995, + "GPQA": 0.3867, + "MUSR": 0.4807, + "MMLU-PRO": 0.5275 + } + }, + { + "model_id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-model_stock", + "name": "Qwen2.5-14B-Vimarckoso-v3-model_stock", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.7162, + "BBH": 0.6421, + "MATH Level 5": 0.4245, + "GPQA": 0.38, + "MUSR": 0.4781, + "MMLU-PRO": 0.5316 + } + }, + { + "model_id": "sometimesanotion/Qwen2.5-7B-Gordion-v0.1", + "name": "Qwen2.5-7B-Gordion-v0.1", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.7482, + "BBH": 0.5524, + "MATH Level 5": 0.2915, + "GPQA": 0.3079, + "MUSR": 0.4016, + "MMLU-PRO": 0.43 + } + }, + { + "model_id": "sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Prose", + "name": "Qwen2.5-7B-Gordion-v0.1-Prose", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.5347, + "BBH": 0.5599, + "MATH Level 5": 0.2893, + "GPQA": 0.3205, + "MUSR": 0.4502, + "MMLU-PRO": 0.4525 + } + }, + { + "model_id": "sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Reason", + "name": "Qwen2.5-7B-Gordion-v0.1-Reason", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.4917, + "BBH": 0.5498, + "MATH Level 5": 0.2621, + "GPQA": 0.3406, + "MUSR": 0.4434, + "MMLU-PRO": 0.4307 + } + }, + { + "model_id": "sometimesanotion/Qwentessential-14B-v1", + "name": "Qwentessential-14B-v1", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6279, + "BBH": 0.6545, + "MATH Level 5": 0.4071, + "GPQA": 0.3876, + "MUSR": 0.4873, + "MMLU-PRO": 0.5381 + } + }, + { + "model_id": "sometimesanotion/Qwentinuum-14B-v013", + "name": "Qwentinuum-14B-v013", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6711, + "BBH": 0.6087, + "MATH Level 5": 0.3708, + "GPQA": 0.3574, + "MUSR": 0.5154, + "MMLU-PRO": 0.4991 + } + }, + { + "model_id": "sometimesanotion/Qwentinuum-14B-v1", + "name": "Qwentinuum-14B-v1", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.5032, + "BBH": 0.6573, + "MATH Level 5": 0.3603, + "GPQA": 0.3826, + "MUSR": 0.4781, + "MMLU-PRO": 0.541 + } + }, + { + "model_id": "sometimesanotion/Qwentinuum-14B-v2", + "name": "Qwentinuum-14B-v2", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.5378, + "BBH": 0.6555, + "MATH Level 5": 0.3754, + "GPQA": 0.3884, + "MUSR": 0.4714, + "MMLU-PRO": 0.5409 + } + }, + { + "model_id": "sometimesanotion/Qwentinuum-14B-v3", + "name": "Qwentinuum-14B-v3", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6158, + "BBH": 0.6539, + "MATH Level 5": 0.3535, + "GPQA": 0.3876, + "MUSR": 0.486, + "MMLU-PRO": 0.5413 + } + }, + { + "model_id": "sometimesanotion/Qwentinuum-14B-v5", + "name": "Qwentinuum-14B-v5", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6286, + "BBH": 0.655, + "MATH Level 5": 0.3444, + "GPQA": 0.3876, + "MUSR": 0.4874, + "MMLU-PRO": 0.5418 + } + }, + { + "model_id": "sometimesanotion/Qwentinuum-14B-v6", + "name": "Qwentinuum-14B-v6", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6304, + "BBH": 0.6545, + "MATH Level 5": 0.3603, + "GPQA": 0.3867, + "MUSR": 0.49, + "MMLU-PRO": 0.54 + } + }, + { + "model_id": "sometimesanotion/Qwentinuum-14B-v6-Prose", + "name": "Qwentinuum-14B-v6-Prose", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.5643, + "BBH": 0.6545, + "MATH Level 5": 0.3701, + "GPQA": 0.3884, + "MUSR": 0.4913, + "MMLU-PRO": 0.5392 + } + }, + { + "model_id": "sometimesanotion/Qwentinuum-14B-v7", + "name": "Qwentinuum-14B-v7", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6109, + "BBH": 0.6551, + "MATH Level 5": 0.3573, + "GPQA": 0.3909, + "MUSR": 0.482, + "MMLU-PRO": 0.541 + } + }, + { + "model_id": "sometimesanotion/Qwentinuum-14B-v8", + "name": "Qwentinuum-14B-v8", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.5412, + "BBH": 0.6534, + "MATH Level 5": 0.3912, + "GPQA": 0.3834, + "MUSR": 0.4873, + "MMLU-PRO": 0.5412 + } + }, + { + "model_id": "sometimesanotion/Qwentinuum-14B-v9", + "name": "Qwentinuum-14B-v9", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.5107, + "BBH": 0.658, + "MATH Level 5": 0.3482, + "GPQA": 0.3859, + "MUSR": 0.4781, + "MMLU-PRO": 0.5421 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-qv256", + "name": "Qwenvergence-14B-qv256", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.7006, + "BBH": 0.6312, + "MATH Level 5": 0.3897, + "GPQA": 0.3784, + "MUSR": 0.4926, + "MMLU-PRO": 0.5178 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v0.6-004-model_stock", + "name": "Qwenvergence-14B-v0.6-004-model_stock", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.686, + "BBH": 0.6249, + "MATH Level 5": 0.4094, + "GPQA": 0.3834, + "MUSR": 0.5033, + "MMLU-PRO": 0.5193 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v10", + "name": "Qwenvergence-14B-v10", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6757, + "BBH": 0.6316, + "MATH Level 5": 0.4789, + "GPQA": 0.3792, + "MUSR": 0.4991, + "MMLU-PRO": 0.5239 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v11", + "name": "Qwenvergence-14B-v11", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.7192, + "BBH": 0.6368, + "MATH Level 5": 0.4645, + "GPQA": 0.3725, + "MUSR": 0.4754, + "MMLU-PRO": 0.5327 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v12-Prose", + "name": "Qwenvergence-14B-v12-Prose", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.5412, + "BBH": 0.6504, + "MATH Level 5": 0.3535, + "GPQA": 0.3867, + "MUSR": 0.4991, + "MMLU-PRO": 0.5381 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v12-Prose-DS", + "name": "Qwenvergence-14B-v12-Prose-DS", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6173, + "BBH": 0.6507, + "MATH Level 5": 0.4305, + "GPQA": 0.3943, + "MUSR": 0.5151, + "MMLU-PRO": 0.5369 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v13-Prose-DS", + "name": "Qwenvergence-14B-v13-Prose-DS", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.7178, + "BBH": 0.6405, + "MATH Level 5": 0.386, + "GPQA": 0.3834, + "MUSR": 0.4927, + "MMLU-PRO": 0.5349 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v15-Prose-MS", + "name": "Qwenvergence-14B-v15-Prose-MS", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.5032, + "BBH": 0.655, + "MATH Level 5": 0.3633, + "GPQA": 0.3951, + "MUSR": 0.4913, + "MMLU-PRO": 0.5393 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v2-Prose", + "name": "Qwenvergence-14B-v2-Prose", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.4705, + "BBH": 0.6519, + "MATH Level 5": 0.3557, + "GPQA": 0.3935, + "MUSR": 0.4926, + "MMLU-PRO": 0.5372 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v3", + "name": "Qwenvergence-14B-v3", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.5044, + "BBH": 0.6548, + "MATH Level 5": 0.3693, + "GPQA": 0.3842, + "MUSR": 0.4886, + "MMLU-PRO": 0.5386 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v3-Prose", + "name": "Qwenvergence-14B-v3-Prose", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.4918, + "BBH": 0.6513, + "MATH Level 5": 0.3648, + "GPQA": 0.3951, + "MUSR": 0.4939, + "MMLU-PRO": 0.537 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v3-Reason", + "name": "Qwenvergence-14B-v3-Reason", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.5367, + "BBH": 0.6561, + "MATH Level 5": 0.358, + "GPQA": 0.3867, + "MUSR": 0.474, + "MMLU-PRO": 0.5395 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v6-Prose", + "name": "Qwenvergence-14B-v6-Prose", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.599, + "BBH": 0.6544, + "MATH Level 5": 0.3565, + "GPQA": 0.3884, + "MUSR": 0.4887, + "MMLU-PRO": 0.5371 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v6-Prose-model_stock", + "name": "Qwenvergence-14B-v6-Prose-model_stock", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.4811, + "BBH": 0.653, + "MATH Level 5": 0.3603, + "GPQA": 0.3935, + "MUSR": 0.4899, + "MMLU-PRO": 0.5387 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v8", + "name": "Qwenvergence-14B-v8", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.5913, + "BBH": 0.6522, + "MATH Level 5": 0.4048, + "GPQA": 0.3809, + "MUSR": 0.4768, + "MMLU-PRO": 0.5435 + } + }, + { + "model_id": "sometimesanotion/Qwenvergence-14B-v9", + "name": "Qwenvergence-14B-v9", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.6598, + "BBH": 0.6166, + "MATH Level 5": 0.4139, + "GPQA": 0.3683, + "MUSR": 0.5141, + "MMLU-PRO": 0.5111 + } + }, + { + "model_id": "sometimesanotion/lamarck-14b-prose-model_stock", + "name": "lamarck-14b-prose-model_stock", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.4276, + "BBH": 0.6488, + "MATH Level 5": 0.3414, + "GPQA": 0.3935, + "MUSR": 0.4846, + "MMLU-PRO": 0.5354 + } + }, + { + "model_id": "sometimesanotion/lamarck-14b-reason-model_stock", + "name": "lamarck-14b-reason-model_stock", + "developer": "sometimesanotion", + "scores": { + "IFEval": 0.4965, + "BBH": 0.6569, + "MATH Level 5": 0.358, + "GPQA": 0.3842, + "MUSR": 0.4741, + "MMLU-PRO": 0.5402 + } + }, + { + "model_id": "sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415", + "name": "ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415", + "developer": "sonthenguyen", + "scores": { + "IFEval": 0.2893, + "BBH": 0.3804, + "MATH Level 5": 0.0113, + "GPQA": 0.2466, + "MUSR": 0.3861, + "MMLU-PRO": 0.1401 + } + }, + { + "model_id": "sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205", + "name": "ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205", + "developer": "sonthenguyen", + "scores": { + "IFEval": 0.3199, + "BBH": 0.3959, + "MATH Level 5": 0.0083, + "GPQA": 0.276, + "MUSR": 0.4272, + "MMLU-PRO": 0.2124 + } + }, + { + "model_id": "sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522", + "name": "ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522", + "developer": "sonthenguyen", + "scores": { + "IFEval": 0.3764, + "BBH": 0.3828, + "MATH Level 5": 0.0091, + "GPQA": 0.2651, + "MUSR": 0.4404, + "MMLU-PRO": 0.2055 + } + }, + { + "model_id": "sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbc-213steps", + "name": "zephyr-sft-bnb-4bit-DPO-mtbc-213steps", + "developer": "sonthenguyen", + "scores": { + "IFEval": 0.4275, + "BBH": 0.4197, + "MATH Level 5": 0.0257, + "GPQA": 0.2617, + "MUSR": 0.4086, + "MMLU-PRO": 0.2709 + } + }, + { + "model_id": "sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbo-180steps", + "name": "zephyr-sft-bnb-4bit-DPO-mtbo-180steps", + "developer": "sonthenguyen", + "scores": { + "IFEval": 0.4087, + "BBH": 0.4323, + "MATH Level 5": 0.0234, + "GPQA": 0.276, + "MUSR": 0.3885, + "MMLU-PRO": 0.2748 + } + }, + { + "model_id": "sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbr-180steps", + "name": "zephyr-sft-bnb-4bit-DPO-mtbr-180steps", + "developer": "sonthenguyen", + "scores": { + "IFEval": 0.4032, + "BBH": 0.4305, + "MATH Level 5": 0.0249, + "GPQA": 0.2802, + "MUSR": 0.4258, + "MMLU-PRO": 0.2711 + } + }, + { + "model_id": "sophosympatheia/Midnight-Miqu-70B-v1.5", + "name": "Midnight-Miqu-70B-v1.5", + "developer": "sophosympatheia", + "scores": { + "IFEval": 0.6118, + "BBH": 0.5606, + "MATH Level 5": 0.0702, + "GPQA": 0.2961, + "MUSR": 0.4244, + "MMLU-PRO": 0.3825 + } + }, + { + "model_id": "speakleash/Bielik-11B-v2", + "name": "Bielik-11B-v2", + "developer": "speakleash", + "scores": { + "IFEval": 0.2381, + "BBH": 0.4931, + "MATH Level 5": 0.0785, + "GPQA": 0.2886, + "MUSR": 0.3924, + "MMLU-PRO": 0.3137 + } + }, + { + "model_id": "speakleash/Bielik-11B-v2.0-Instruct", + "name": "Bielik-11B-v2.0-Instruct", + "developer": "speakleash", + "scores": { + "IFEval": 0.5252, + "BBH": 0.5362, + "MATH Level 5": 0.1186, + "GPQA": 0.3171, + "MUSR": 0.4467, + "MMLU-PRO": 0.3351 + } + }, + { + "model_id": "speakleash/Bielik-11B-v2.1-Instruct", + "name": "Bielik-11B-v2.1-Instruct", + "developer": "speakleash", + "scores": { + "IFEval": 0.509, + "BBH": 0.553, + "MATH Level 5": 0.2666, + "GPQA": 0.3372, + "MUSR": 0.4185, + "MMLU-PRO": 0.3447 + } + }, + { + "model_id": "speakleash/Bielik-11B-v2.2-Instruct", + "name": "Bielik-11B-v2.2-Instruct", + "developer": "speakleash", + "scores": { + "IFEval": 0.5552, + "BBH": 0.5597, + "MATH Level 5": 0.2681, + "GPQA": 0.3314, + "MUSR": 0.4171, + "MMLU-PRO": 0.3487 + } + }, + { + "model_id": "speakleash/Bielik-11B-v2.3-Instruct", + "name": "Bielik-11B-v2.3-Instruct", + "developer": "speakleash", + "scores": { + "IFEval": 0.5583, + "BBH": 0.5663, + "MATH Level 5": 0.2085, + "GPQA": 0.3406, + "MUSR": 0.4518, + "MMLU-PRO": 0.3444 + } + }, + { + "model_id": "spmurrayzzz/Mistral-Syndicate-7B", + "name": "Mistral-Syndicate-7B", + "developer": "spmurrayzzz", + "scores": { + "IFEval": 0.2496, + "BBH": 0.4245, + "MATH Level 5": 0.034, + "GPQA": 0.276, + "MUSR": 0.4386, + "MMLU-PRO": 0.2631 + } + }, + { + "model_id": "spow12/ChatWaifu_12B_v2.0", + "name": "ChatWaifu_12B_v2.0", + "developer": "spow12", + "scores": { + "IFEval": 0.4768, + "BBH": 0.5208, + "MATH Level 5": 0.071, + "GPQA": 0.2768, + "MUSR": 0.4432, + "MMLU-PRO": 0.3388 + } + }, + { + "model_id": "spow12/ChatWaifu_22B_v2.0_preview", + "name": "ChatWaifu_22B_v2.0_preview", + "developer": "spow12", + "scores": { + "IFEval": 0.6745, + "BBH": 0.617, + "MATH Level 5": 0.1888, + "GPQA": 0.3154, + "MUSR": 0.3685, + "MMLU-PRO": 0.3988 + } + }, + { + "model_id": "spow12/ChatWaifu_v1.4", + "name": "ChatWaifu_v1.4", + "developer": "spow12", + "scores": { + "IFEval": 0.5691, + "BBH": 0.5176, + "MATH Level 5": 0.1057, + "GPQA": 0.307, + "MUSR": 0.4743, + "MMLU-PRO": 0.3475 + } + }, + { + "model_id": "spow12/ChatWaifu_v2.0_22B", + "name": "ChatWaifu_v2.0_22B", + "developer": "spow12", + "scores": { + "IFEval": 0.6517, + "BBH": 0.5908, + "MATH Level 5": 0.2032, + "GPQA": 0.3238, + "MUSR": 0.3842, + "MMLU-PRO": 0.3812 + } + }, + { + "model_id": "ssmits/Qwen2.5-95B-Instruct", + "name": "Qwen2.5-95B-Instruct", + "developer": "ssmits", + "scores": { + "IFEval": 0.8431, + "BBH": 0.7038, + "MATH Level 5": 0.5302, + "GPQA": 0.3641, + "MUSR": 0.4284, + "MMLU-PRO": 0.5217 + } + }, + { + "model_id": "stabilityai/StableBeluga2", + "name": "StableBeluga2", + "developer": "stabilityai", + "scores": { + "IFEval": 0.3787, + "BBH": 0.5824, + "MATH Level 5": 0.0438, + "GPQA": 0.3163, + "MUSR": 0.473, + "MMLU-PRO": 0.3326 + } + }, + { + "model_id": "stabilityai/stablelm-2-12b", + "name": "stablelm-2-12b", + "developer": "stabilityai", + "scores": { + "IFEval": 0.1569, + "BBH": 0.4509, + "MATH Level 5": 0.0431, + "GPQA": 0.2785, + "MUSR": 0.4479, + "MMLU-PRO": 0.3072 + } + }, + { + "model_id": "stabilityai/stablelm-2-12b-chat", + "name": "stablelm-2-12b-chat", + "developer": "stabilityai", + "scores": { + "IFEval": 0.4082, + "BBH": 0.4672, + "MATH Level 5": 0.0536, + "GPQA": 0.2668, + "MUSR": 0.3914, + "MMLU-PRO": 0.2734 + } + }, + { + "model_id": "stabilityai/stablelm-2-1_6b", + "name": "stablelm-2-1_6b", + "developer": "stabilityai", + "scores": { + "IFEval": 0.1157, + "BBH": 0.3385, + "MATH Level 5": 0.0076, + "GPQA": 0.2483, + "MUSR": 0.3882, + "MMLU-PRO": 0.1464 + } + }, + { + "model_id": "stabilityai/stablelm-2-1_6b-chat", + "name": "stablelm-2-1_6b-chat", + "developer": "stabilityai", + "scores": { + "IFEval": 0.306, + "BBH": 0.339, + "MATH Level 5": 0.0249, + "GPQA": 0.2475, + "MUSR": 0.358, + "MMLU-PRO": 0.1622 + } + }, + { + "model_id": "stabilityai/stablelm-2-zephyr-1_6b", + "name": "stablelm-2-zephyr-1_6b", + "developer": "stabilityai", + "scores": { + "IFEval": 0.3279, + "BBH": 0.3352, + "MATH Level 5": 0.0332, + "GPQA": 0.2433, + "MUSR": 0.3511, + "MMLU-PRO": 0.1714 + } + }, + { + "model_id": "stabilityai/stablelm-3b-4e1t", + "name": "stablelm-3b-4e1t", + "developer": "stabilityai", + "scores": { + "IFEval": 0.2203, + "BBH": 0.3504, + "MATH Level 5": 0.0106, + "GPQA": 0.2374, + "MUSR": 0.3778, + "MMLU-PRO": 0.1669 + } + }, + { + "model_id": "stabilityai/stablelm-zephyr-3b", + "name": "stablelm-zephyr-3b", + "developer": "stabilityai", + "scores": { + "IFEval": 0.3683, + "BBH": 0.3866, + "MATH Level 5": 0.0431, + "GPQA": 0.2391, + "MUSR": 0.4183, + "MMLU-PRO": 0.1768 + } + }, + { + "model_id": "sthenno-com/miscii-14b-0130", + "name": "miscii-14b-0130", + "developer": "sthenno-com", + "scores": { + "IFEval": 0.6647, + "BBH": 0.6505, + "MATH Level 5": 0.432, + "GPQA": 0.3817, + "MUSR": 0.4912, + "MMLU-PRO": 0.5363 + } + }, + { + "model_id": "sthenno-com/miscii-14b-0218", + "name": "miscii-14b-0218", + "developer": "sthenno-com", + "scores": { + "IFEval": 0.7656, + "BBH": 0.6559, + "MATH Level 5": 0.5144, + "GPQA": 0.3834, + "MUSR": 0.4273, + "MMLU-PRO": 0.5298 + } + }, + { + "model_id": "sthenno-com/miscii-14b-1028", + "name": "miscii-14b-1028", + "developer": "sthenno-com", + "scores": { + "IFEval": 0.8237, + "BBH": 0.6448, + "MATH Level 5": 0.503, + "GPQA": 0.3565, + "MUSR": 0.4182, + "MMLU-PRO": 0.5153 + } + }, + { + "model_id": "sthenno-com/miscii-14b-1225", + "name": "miscii-14b-1225", + "developer": "sthenno-com", + "scores": { + "IFEval": 0.7878, + "BBH": 0.6572, + "MATH Level 5": 0.4517, + "GPQA": 0.3775, + "MUSR": 0.4366, + "MMLU-PRO": 0.5272 + } + }, + { + "model_id": "sthenno/tempesthenno-0120", + "name": "tempesthenno-0120", + "developer": "sthenno", + "scores": { + "IFEval": 0.539, + "BBH": 0.6373, + "MATH Level 5": 0.3353, + "GPQA": 0.3943, + "MUSR": 0.4633, + "MMLU-PRO": 0.529 + } + }, + { + "model_id": "sthenno/tempesthenno-fusion-0309", + "name": "tempesthenno-fusion-0309", + "developer": "sthenno", + "scores": { + "IFEval": 0.7692, + "BBH": 0.6581, + "MATH Level 5": 0.4766, + "GPQA": 0.37, + "MUSR": 0.4325, + "MMLU-PRO": 0.5258 + } + }, + { + "model_id": "sthenno/tempesthenno-kto-0205-ckpt80", + "name": "tempesthenno-kto-0205-ckpt80", + "developer": "sthenno", + "scores": { + "IFEval": 0.8054, + "BBH": 0.6543, + "MATH Level 5": 0.4592, + "GPQA": 0.3482, + "MUSR": 0.4248, + "MMLU-PRO": 0.5286 + } + }, + { + "model_id": "sthenno/tempesthenno-nuslerp-001", + "name": "tempesthenno-nuslerp-001", + "developer": "sthenno", + "scores": { + "IFEval": 0.7926, + "BBH": 0.6578, + "MATH Level 5": 0.4758, + "GPQA": 0.3733, + "MUSR": 0.43, + "MMLU-PRO": 0.5257 + } + }, + { + "model_id": "sthenno/tempesthenno-nuslerp-0124", + "name": "tempesthenno-nuslerp-0124", + "developer": "sthenno", + "scores": { + "IFEval": 0.7004, + "BBH": 0.6469, + "MATH Level 5": 0.4116, + "GPQA": 0.3901, + "MUSR": 0.4859, + "MMLU-PRO": 0.5352 + } + }, + { + "model_id": "sthenno/tempesthenno-ppo-ckpt40", + "name": "tempesthenno-ppo-ckpt40", + "developer": "sthenno", + "scores": { + "IFEval": 0.7923, + "BBH": 0.655, + "MATH Level 5": 0.4736, + "GPQA": 0.3775, + "MUSR": 0.4352, + "MMLU-PRO": 0.5292 + } + }, + { + "model_id": "sthenno/tempesthenno-sft-0309-ckpt10", + "name": "tempesthenno-sft-0309-ckpt10", + "developer": "sthenno", + "scores": { + "IFEval": 0.7744, + "BBH": 0.6552, + "MATH Level 5": 0.4721, + "GPQA": 0.3716, + "MUSR": 0.4364, + "MMLU-PRO": 0.5258 + } + }, + { + "model_id": "sthenno/tempesthenno-sft-0314-stage1-ckpt50", + "name": "tempesthenno-sft-0314-stage1-ckpt50", + "developer": "sthenno", + "scores": { + "IFEval": 0.7394, + "BBH": 0.6601, + "MATH Level 5": 0.4683, + "GPQA": 0.3733, + "MUSR": 0.4429, + "MMLU-PRO": 0.5302 + } + }, + { + "model_id": "sthenno/tempestissimo-14b-0309", + "name": "tempestissimo-14b-0309", + "developer": "sthenno", + "scores": { + "IFEval": 0.7549, + "BBH": 0.6587, + "MATH Level 5": 0.4796, + "GPQA": 0.3666, + "MUSR": 0.4312, + "MMLU-PRO": 0.5281 + } + }, + { + "model_id": "streamerbtw1002/Nexuim-R1-7B-Instruct", + "name": "Nexuim-R1-7B-Instruct", + "developer": "streamerbtw1002", + "scores": { + "IFEval": 0.6934, + "BBH": 0.5175, + "MATH Level 5": 0.4456, + "GPQA": 0.2592, + "MUSR": 0.3356, + "MMLU-PRO": 0.4138 + } + }, + { + "model_id": "stupidity-ai/Llama-3-8B-Instruct-MultiMoose", + "name": "Llama-3-8B-Instruct-MultiMoose", + "developer": "stupidity-ai", + "scores": { + "IFEval": 0.2318, + "BBH": 0.2823, + "MATH Level 5": 0.0, + "GPQA": 0.2534, + "MUSR": 0.3485, + "MMLU-PRO": 0.1094 + } + }, + { + "model_id": "suayptalha/Clarus-7B-v0.1", + "name": "Clarus-7B-v0.1", + "developer": "suayptalha", + "scores": { + "IFEval": 0.7454, + "BBH": 0.5497, + "MATH Level 5": 0.4924, + "GPQA": 0.307, + "MUSR": 0.443, + "MMLU-PRO": 0.4387 + } + }, + { + "model_id": "suayptalha/Clarus-7B-v0.2", + "name": "Clarus-7B-v0.2", + "developer": "suayptalha", + "scores": { + "IFEval": 0.7679, + "BBH": 0.549, + "MATH Level 5": 0.4856, + "GPQA": 0.302, + "MUSR": 0.4417, + "MMLU-PRO": 0.44 + } + }, + { + "model_id": "suayptalha/Clarus-7B-v0.3", + "name": "Clarus-7B-v0.3", + "developer": "suayptalha", + "scores": { + "IFEval": 0.7509, + "BBH": 0.5526, + "MATH Level 5": 0.4879, + "GPQA": 0.3121, + "MUSR": 0.4402, + "MMLU-PRO": 0.4385 + } + }, + { + "model_id": "suayptalha/DeepSeek-R1-Distill-Llama-3B", + "name": "DeepSeek-R1-Distill-Llama-3B", + "developer": "suayptalha", + "scores": { + "IFEval": 0.7093, + "BBH": 0.4452, + "MATH Level 5": 0.2092, + "GPQA": 0.2609, + "MUSR": 0.3396, + "MMLU-PRO": 0.2978 + } + }, + { + "model_id": "suayptalha/Falcon3-Jessi-v0.4-7B-Slerp", + "name": "Falcon3-Jessi-v0.4-7B-Slerp", + "developer": "suayptalha", + "scores": { + "IFEval": 0.7676, + "BBH": 0.5591, + "MATH Level 5": 0.3965, + "GPQA": 0.3121, + "MUSR": 0.4812, + "MMLU-PRO": 0.406 + } + }, + { + "model_id": "suayptalha/HomerCreativeAnvita-Mix-Qw7B", + "name": "HomerCreativeAnvita-Mix-Qw7B", + "developer": "suayptalha", + "scores": { + "IFEval": 0.7808, + "BBH": 0.5565, + "MATH Level 5": 0.361, + "GPQA": 0.3146, + "MUSR": 0.4416, + "MMLU-PRO": 0.4445 + } + }, + { + "model_id": "suayptalha/Komodo-Llama-3.2-3B-v2-fp16", + "name": "Komodo-Llama-3.2-3B-v2-fp16", + "developer": "suayptalha", + "scores": { + "IFEval": 0.6341, + "BBH": 0.4355, + "MATH Level 5": 0.1065, + "GPQA": 0.2777, + "MUSR": 0.3406, + "MMLU-PRO": 0.2852 + } + }, + { + "model_id": "suayptalha/Lamarckvergence-14B", + "name": "Lamarckvergence-14B", + "developer": "suayptalha", + "scores": { + "IFEval": 0.7656, + "BBH": 0.6517, + "MATH Level 5": 0.54, + "GPQA": 0.3633, + "MUSR": 0.4422, + "MMLU-PRO": 0.5283 + } + }, + { + "model_id": "suayptalha/Lix-14B-v0.1", + "name": "Lix-14B-v0.1", + "developer": "suayptalha", + "scores": { + "IFEval": 0.7813, + "BBH": 0.6608, + "MATH Level 5": 0.5295, + "GPQA": 0.37, + "MUSR": 0.4338, + "MMLU-PRO": 0.5314 + } + }, + { + "model_id": "suayptalha/Luminis-phi-4", + "name": "Luminis-phi-4", + "developer": "suayptalha", + "scores": { + "IFEval": 0.69, + "BBH": 0.692, + "MATH Level 5": 0.4637, + "GPQA": 0.3515, + "MUSR": 0.4572, + "MMLU-PRO": 0.5424 + } + }, + { + "model_id": "suayptalha/Maestro-10B", + "name": "Maestro-10B", + "developer": "suayptalha", + "scores": { + "IFEval": 0.7768, + "BBH": 0.5746, + "MATH Level 5": 0.1911, + "GPQA": 0.3331, + "MUSR": 0.4397, + "MMLU-PRO": 0.4218 + } + }, + { + "model_id": "suayptalha/Rombos-2.5-T.E-8.1", + "name": "Rombos-2.5-T.E-8.1", + "developer": "suayptalha", + "scores": { + "IFEval": 0.6925, + "BBH": 0.5515, + "MATH Level 5": 0.4924, + "GPQA": 0.3112, + "MUSR": 0.4166, + "MMLU-PRO": 0.4446 + } + }, + { + "model_id": "sumink/Qmerft", + "name": "Qmerft", + "developer": "sumink", + "scores": { + "IFEval": 0.1564, + "BBH": 0.2939, + "MATH Level 5": 0.0023, + "GPQA": 0.2525, + "MUSR": 0.3688, + "MMLU-PRO": 0.1157 + } + }, + { + "model_id": "sumink/Qwenftmodel", + "name": "Qwenftmodel", + "developer": "sumink", + "scores": { + "IFEval": 0.1729, + "BBH": 0.3823, + "MATH Level 5": 0.0891, + "GPQA": 0.2567, + "MUSR": 0.3617, + "MMLU-PRO": 0.2339 + } + }, + { + "model_id": "sumink/Qwenmplus", + "name": "Qwenmplus", + "developer": "sumink", + "scores": { + "IFEval": 0.204, + "BBH": 0.3676, + "MATH Level 5": 0.0249, + "GPQA": 0.2852, + "MUSR": 0.3828, + "MMLU-PRO": 0.1992 + } + }, + { + "model_id": "sumink/Qwensci", + "name": "Qwensci", + "developer": "sumink", + "scores": { + "IFEval": 0.174, + "BBH": 0.3282, + "MATH Level 5": 0.0204, + "GPQA": 0.2584, + "MUSR": 0.3609, + "MMLU-PRO": 0.126 + } + }, + { + "model_id": "sumink/bbhqwen", + "name": "bbhqwen", + "developer": "sumink", + "scores": { + "IFEval": 0.1809, + "BBH": 0.3388, + "MATH Level 5": 0.0106, + "GPQA": 0.2576, + "MUSR": 0.4352, + "MMLU-PRO": 0.1617 + } + }, + { + "model_id": "sumink/bbhqwen2", + "name": "bbhqwen2", + "developer": "sumink", + "scores": { + "IFEval": 0.1533, + "BBH": 0.3066, + "MATH Level 5": 0.006, + "GPQA": 0.2626, + "MUSR": 0.4431, + "MMLU-PRO": 0.1149 + } + }, + { + "model_id": "sumink/bbhqwen3", + "name": "bbhqwen3", + "developer": "sumink", + "scores": { + "IFEval": 0.1943, + "BBH": 0.2951, + "MATH Level 5": 0.0, + "GPQA": 0.2576, + "MUSR": 0.3796, + "MMLU-PRO": 0.1166 + } + }, + { + "model_id": "sumink/bbhqwen4", + "name": "bbhqwen4", + "developer": "sumink", + "scores": { + "IFEval": 0.1449, + "BBH": 0.3199, + "MATH Level 5": 0.006, + "GPQA": 0.2441, + "MUSR": 0.4029, + "MMLU-PRO": 0.1509 + } + }, + { + "model_id": "sumink/bbhqwen5", + "name": "bbhqwen5", + "developer": "sumink", + "scores": { + "IFEval": 0.1522, + "BBH": 0.2913, + "MATH Level 5": 0.0023, + "GPQA": 0.2601, + "MUSR": 0.4019, + "MMLU-PRO": 0.1131 + } + }, + { + "model_id": "sumink/bbhqwen6", + "name": "bbhqwen6", + "developer": "sumink", + "scores": { + "IFEval": 0.1893, + "BBH": 0.2782, + "MATH Level 5": 0.0008, + "GPQA": 0.2584, + "MUSR": 0.358, + "MMLU-PRO": 0.1153 + } + }, + { + "model_id": "sumink/flflmillama", + "name": "flflmillama", + "developer": "sumink", + "scores": { + "IFEval": 0.1676, + "BBH": 0.3851, + "MATH Level 5": 0.0196, + "GPQA": 0.2919, + "MUSR": 0.3591, + "MMLU-PRO": 0.2096 + } + }, + { + "model_id": "sumink/ftgpt", + "name": "ftgpt", + "developer": "sumink", + "scores": { + "IFEval": 0.0787, + "BBH": 0.2919, + "MATH Level 5": 0.0, + "GPQA": 0.2643, + "MUSR": 0.4138, + "MMLU-PRO": 0.1172 + } + }, + { + "model_id": "sumink/llamaft", + "name": "llamaft", + "developer": "sumink", + "scores": { + "IFEval": 0.1609, + "BBH": 0.3763, + "MATH Level 5": 0.0166, + "GPQA": 0.271, + "MUSR": 0.3498, + "MMLU-PRO": 0.2114 + } + }, + { + "model_id": "sumink/llamamerge", + "name": "llamamerge", + "developer": "sumink", + "scores": { + "IFEval": 0.2672, + "BBH": 0.4632, + "MATH Level 5": 0.0151, + "GPQA": 0.2987, + "MUSR": 0.424, + "MMLU-PRO": 0.259 + } + }, + { + "model_id": "sumink/llftfl7", + "name": "llftfl7", + "developer": "sumink", + "scores": { + "IFEval": 0.1714, + "BBH": 0.3786, + "MATH Level 5": 0.0106, + "GPQA": 0.281, + "MUSR": 0.3632, + "MMLU-PRO": 0.1743 + } + }, + { + "model_id": "sumink/llmer", + "name": "llmer", + "developer": "sumink", + "scores": { + "IFEval": 0.3191, + "BBH": 0.4885, + "MATH Level 5": 0.065, + "GPQA": 0.2978, + "MUSR": 0.4039, + "MMLU-PRO": 0.3529 + } + }, + { + "model_id": "sumink/qwft", + "name": "qwft", + "developer": "sumink", + "scores": { + "IFEval": 0.1197, + "BBH": 0.3002, + "MATH Level 5": 0.0, + "GPQA": 0.2525, + "MUSR": 0.3581, + "MMLU-PRO": 0.1129 + } + }, + { + "model_id": "sumink/qwmer", + "name": "qwmer", + "developer": "sumink", + "scores": { + "IFEval": 0.2212, + "BBH": 0.4299, + "MATH Level 5": 0.0008, + "GPQA": 0.2869, + "MUSR": 0.4032, + "MMLU-PRO": 0.2215 + } + }, + { + "model_id": "sumink/solarmer3", + "name": "solarmer3", + "developer": "sumink", + "scores": { + "IFEval": 0.3741, + "BBH": 0.5266, + "MATH Level 5": 0.0582, + "GPQA": 0.2911, + "MUSR": 0.4401, + "MMLU-PRO": 0.3323 + } + }, + { + "model_id": "sumink/somer", + "name": "somer", + "developer": "sumink", + "scores": { + "IFEval": 0.299, + "BBH": 0.5194, + "MATH Level 5": 0.0415, + "GPQA": 0.2987, + "MUSR": 0.465, + "MMLU-PRO": 0.3447 + } + }, + { + "model_id": "sumink/somer2", + "name": "somer2", + "developer": "sumink", + "scores": { + "IFEval": 0.3132, + "BBH": 0.5167, + "MATH Level 5": 0.0468, + "GPQA": 0.3037, + "MUSR": 0.4663, + "MMLU-PRO": 0.3433 + } + }, + { + "model_id": "sumink/somerft", + "name": "somerft", + "developer": "sumink", + "scores": { + "IFEval": 0.1431, + "BBH": 0.3093, + "MATH Level 5": 0.0144, + "GPQA": 0.2483, + "MUSR": 0.4045, + "MMLU-PRO": 0.1117 + } + }, + { + "model_id": "sunbaby/BrainCog-8B-0.1-Instruct", + "name": "BrainCog-8B-0.1-Instruct", + "developer": "sunbaby", + "scores": { + "IFEval": 0.4253, + "BBH": 0.4618, + "MATH Level 5": 0.0967, + "GPQA": 0.3012, + "MUSR": 0.3656, + "MMLU-PRO": 0.2858 + } + }, + { + "model_id": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA", + "name": "LLaMAntino-3-ANITA-8B-Inst-DPO-ITA", + "developer": "swap-uniba", + "scores": { + "IFEval": 0.4815, + "BBH": 0.4936, + "MATH Level 5": 0.0483, + "GPQA": 0.2987, + "MUSR": 0.4387, + "MMLU-PRO": 0.3723 + } + }, + { + "model_id": "synergetic/FrankenQwen2.5-14B", + "name": "FrankenQwen2.5-14B", + "developer": "synergetic", + "scores": { + "IFEval": 0.1869, + "BBH": 0.6048, + "MATH Level 5": 0.0, + "GPQA": 0.2701, + "MUSR": 0.3843, + "MMLU-PRO": 0.4382 + } + }, + { + "model_id": "talha2001/Beast-Soul-new", + "name": "Beast-Soul-new", + "developer": "talha2001", + "scores": { + "IFEval": 0.4854, + "BBH": 0.5227, + "MATH Level 5": 0.074, + "GPQA": 0.2819, + "MUSR": 0.4459, + "MMLU-PRO": 0.3102 + } + }, + { + "model_id": "tangledgroup/tangled-llama-pints-1.5b-v0.1-instruct", + "name": "tangled-llama-pints-1.5b-v0.1-instruct", + "developer": "tangledgroup", + "scores": { + "IFEval": 0.1509, + "BBH": 0.3143, + "MATH Level 5": 0.0121, + "GPQA": 0.2399, + "MUSR": 0.3761, + "MMLU-PRO": 0.1109 + } + }, + { + "model_id": "tangledgroup/tangled-llama-pints-1.5b-v0.2-instruct", + "name": "tangled-llama-pints-1.5b-v0.2-instruct", + "developer": "tangledgroup", + "scores": { + "IFEval": 0.1724, + "BBH": 0.3158, + "MATH Level 5": 0.0128, + "GPQA": 0.2416, + "MUSR": 0.3643, + "MMLU-PRO": 0.1117 + } + }, + { + "model_id": "tanliboy/lambda-gemma-2-9b-dpo", + "name": "lambda-gemma-2-9b-dpo", + "developer": "tanliboy", + "scores": { + "IFEval": 0.1829, + "BBH": 0.5488, + "MATH Level 5": 0.0, + "GPQA": 0.3104, + "MUSR": 0.4056, + "MMLU-PRO": 0.3805 + } + }, + { + "model_id": "tanliboy/lambda-qwen2.5-14b-dpo-test", + "name": "lambda-qwen2.5-14b-dpo-test", + "developer": "tanliboy", + "scores": { + "IFEval": 0.8231, + "BBH": 0.6394, + "MATH Level 5": 0.5461, + "GPQA": 0.3624, + "MUSR": 0.426, + "MMLU-PRO": 0.4848 + } + }, + { + "model_id": "tanliboy/lambda-qwen2.5-32b-dpo-test", + "name": "lambda-qwen2.5-32b-dpo-test", + "developer": "tanliboy", + "scores": { + "IFEval": 0.8084, + "BBH": 0.6764, + "MATH Level 5": 0.6103, + "GPQA": 0.3565, + "MUSR": 0.4274, + "MMLU-PRO": 0.5657 + } + }, + { + "model_id": "tannedbum/Ellaria-9B", + "name": "Ellaria-9B", + "developer": "tannedbum", + "scores": { + "IFEval": 0.7826, + "BBH": 0.5942, + "MATH Level 5": 0.2077, + "GPQA": 0.3331, + "MUSR": 0.4151, + "MMLU-PRO": 0.4205 + } + }, + { + "model_id": "tannedbum/L3-Nymeria-Maid-8B", + "name": "L3-Nymeria-Maid-8B", + "developer": "tannedbum", + "scores": { + "IFEval": 0.725, + "BBH": 0.5146, + "MATH Level 5": 0.0937, + "GPQA": 0.2961, + "MUSR": 0.3751, + "MMLU-PRO": 0.3747 + } + }, + { + "model_id": "tannedbum/L3-Nymeria-v2-8B", + "name": "L3-Nymeria-v2-8B", + "developer": "tannedbum", + "scores": { + "IFEval": 0.7168, + "BBH": 0.5224, + "MATH Level 5": 0.0921, + "GPQA": 0.2903, + "MUSR": 0.3699, + "MMLU-PRO": 0.3753 + } + }, + { + "model_id": "tannedbum/L3-Rhaenys-8B", + "name": "L3-Rhaenys-8B", + "developer": "tannedbum", + "scores": { + "IFEval": 0.7363, + "BBH": 0.5299, + "MATH Level 5": 0.0876, + "GPQA": 0.2978, + "MUSR": 0.3725, + "MMLU-PRO": 0.3799 + } + }, + { + "model_id": "teknium/CollectiveCognition-v1.1-Mistral-7B", + "name": "CollectiveCognition-v1.1-Mistral-7B", + "developer": "teknium", + "scores": { + "IFEval": 0.279, + "BBH": 0.4493, + "MATH Level 5": 0.031, + "GPQA": 0.2869, + "MUSR": 0.3869, + "MMLU-PRO": 0.2837 + } + }, + { + "model_id": "teknium/OpenHermes-13B", + "name": "OpenHermes-13B", + "developer": "teknium", + "scores": { + "IFEval": 0.2668, + "BBH": 0.4206, + "MATH Level 5": 0.0121, + "GPQA": 0.2727, + "MUSR": 0.4043, + "MMLU-PRO": 0.2389 + } + }, + { + "model_id": "teknium/OpenHermes-2-Mistral-7B", + "name": "OpenHermes-2-Mistral-7B", + "developer": "teknium", + "scores": { + "IFEval": 0.5286, + "BBH": 0.4948, + "MATH Level 5": 0.0453, + "GPQA": 0.2836, + "MUSR": 0.452, + "MMLU-PRO": 0.2931 + } + }, + { + "model_id": "teknium/OpenHermes-2.5-Mistral-7B", + "name": "OpenHermes-2.5-Mistral-7B", + "developer": "teknium", + "scores": { + "IFEval": 0.5571, + "BBH": 0.487, + "MATH Level 5": 0.0506, + "GPQA": 0.2836, + "MUSR": 0.4242, + "MMLU-PRO": 0.3054 + } + }, + { + "model_id": "teknium/OpenHermes-7B", + "name": "OpenHermes-7B", + "developer": "teknium", + "scores": { + "IFEval": 0.1813, + "BBH": 0.362, + "MATH Level 5": 0.0159, + "GPQA": 0.2693, + "MUSR": 0.4324, + "MMLU-PRO": 0.1933 + } + }, + { + "model_id": "tensopolis/falcon3-10b-tensopolis-v1", + "name": "falcon3-10b-tensopolis-v1", + "developer": "tensopolis", + "scores": { + "IFEval": 0.7817, + "BBH": 0.6182, + "MATH Level 5": 0.2749, + "GPQA": 0.3297, + "MUSR": 0.4375, + "MMLU-PRO": 0.442 + } + }, + { + "model_id": "tensopolis/falcon3-10b-tensopolis-v2", + "name": "falcon3-10b-tensopolis-v2", + "developer": "tensopolis", + "scores": { + "IFEval": 0.7792, + "BBH": 0.6182, + "MATH Level 5": 0.2666, + "GPQA": 0.3272, + "MUSR": 0.4297, + "MMLU-PRO": 0.4424 + } + }, + { + "model_id": "tensopolis/lamarckvergence-14b-tensopolis-v1", + "name": "lamarckvergence-14b-tensopolis-v1", + "developer": "tensopolis", + "scores": { + "IFEval": 0.7604, + "BBH": 0.6561, + "MATH Level 5": 0.5166, + "GPQA": 0.3607, + "MUSR": 0.4475, + "MMLU-PRO": 0.525 + } + }, + { + "model_id": "tensopolis/mistral-small-2501-tensopolis-v1", + "name": "mistral-small-2501-tensopolis-v1", + "developer": "tensopolis", + "scores": { + "IFEval": 0.7762, + "BBH": 0.6475, + "MATH Level 5": 0.4441, + "GPQA": 0.3574, + "MUSR": 0.428, + "MMLU-PRO": 0.4465 + } + }, + { + "model_id": "tensopolis/mistral-small-r1-tensopolis", + "name": "mistral-small-r1-tensopolis", + "developer": "tensopolis", + "scores": { + "IFEval": 0.4622, + "BBH": 0.5436, + "MATH Level 5": 0.2908, + "GPQA": 0.2819, + "MUSR": 0.3738, + "MMLU-PRO": 0.4035 + } + }, + { + "model_id": "tensopolis/phi-4-tensopolis-v1", + "name": "phi-4-tensopolis-v1", + "developer": "tensopolis", + "scores": { + "IFEval": 0.6767, + "BBH": 0.6872, + "MATH Level 5": 0.494, + "GPQA": 0.3347, + "MUSR": 0.4141, + "MMLU-PRO": 0.5384 + } + }, + { + "model_id": "tensopolis/qwen2.5-14b-tensopolis-v1", + "name": "qwen2.5-14b-tensopolis-v1", + "developer": "tensopolis", + "scores": { + "IFEval": 0.799, + "BBH": 0.6364, + "MATH Level 5": 0.5295, + "GPQA": 0.3347, + "MUSR": 0.4193, + "MMLU-PRO": 0.4911 + } + }, + { + "model_id": "tensopolis/qwen2.5-3b-or1-tensopolis", + "name": "qwen2.5-3b-or1-tensopolis", + "developer": "tensopolis", + "scores": { + "IFEval": 0.354, + "BBH": 0.4421, + "MATH Level 5": 0.173, + "GPQA": 0.2945, + "MUSR": 0.3749, + "MMLU-PRO": 0.3197 + } + }, + { + "model_id": "tensopolis/qwen2.5-7b-tensopolis-v1", + "name": "qwen2.5-7b-tensopolis-v1", + "developer": "tensopolis", + "scores": { + "IFEval": 0.7661, + "BBH": 0.5379, + "MATH Level 5": 0.4562, + "GPQA": 0.2961, + "MUSR": 0.4339, + "MMLU-PRO": 0.4269 + } + }, + { + "model_id": "tensopolis/qwen2.5-7b-tensopolis-v2", + "name": "qwen2.5-7b-tensopolis-v2", + "developer": "tensopolis", + "scores": { + "IFEval": 0.7521, + "BBH": 0.5415, + "MATH Level 5": 0.4819, + "GPQA": 0.2903, + "MUSR": 0.4246, + "MMLU-PRO": 0.4243 + } + }, + { + "model_id": "tensopolis/virtuoso-lite-tensopolis-v1", + "name": "virtuoso-lite-tensopolis-v1", + "developer": "tensopolis", + "scores": { + "IFEval": 0.8069, + "BBH": 0.6102, + "MATH Level 5": 0.2545, + "GPQA": 0.3448, + "MUSR": 0.4582, + "MMLU-PRO": 0.4435 + } + }, + { + "model_id": "tensopolis/virtuoso-lite-tensopolis-v2", + "name": "virtuoso-lite-tensopolis-v2", + "developer": "tensopolis", + "scores": { + "IFEval": 0.8029, + "BBH": 0.61, + "MATH Level 5": 0.25, + "GPQA": 0.3431, + "MUSR": 0.4595, + "MMLU-PRO": 0.444 + } + }, + { + "model_id": "tensopolis/virtuoso-small-tensopolis-v1", + "name": "virtuoso-small-tensopolis-v1", + "developer": "tensopolis", + "scores": { + "IFEval": 0.7856, + "BBH": 0.6415, + "MATH Level 5": 0.3527, + "GPQA": 0.328, + "MUSR": 0.4326, + "MMLU-PRO": 0.4968 + } + }, + { + "model_id": "tensopolis/virtuoso-small-tensopolis-v2", + "name": "virtuoso-small-tensopolis-v2", + "developer": "tensopolis", + "scores": { + "IFEval": 0.802, + "BBH": 0.6516, + "MATH Level 5": 0.3875, + "GPQA": 0.3289, + "MUSR": 0.4352, + "MMLU-PRO": 0.5154 + } + }, + { + "model_id": "tensopolis/virtuoso-small-v2-tensopolis-v1", + "name": "virtuoso-small-v2-tensopolis-v1", + "developer": "tensopolis", + "scores": { + "IFEval": 0.8419, + "BBH": 0.6545, + "MATH Level 5": 0.4524, + "GPQA": 0.3465, + "MUSR": 0.4509, + "MMLU-PRO": 0.5175 + } + }, + { + "model_id": "tensoropera/Fox-1-1.6B", + "name": "Fox-1-1.6B", + "developer": "tensoropera", + "scores": { + "IFEval": 0.2766, + "BBH": 0.3307, + "MATH Level 5": 0.0174, + "GPQA": 0.2634, + "MUSR": 0.355, + "MMLU-PRO": 0.1371 + } + }, + { + "model_id": "tenyx/Llama3-TenyxChat-70B", + "name": "Llama3-TenyxChat-70B", + "developer": "tenyx", + "scores": { + "IFEval": 0.8087, + "BBH": 0.6511, + "MATH Level 5": 0.2356, + "GPQA": 0.3012, + "MUSR": 0.426, + "MMLU-PRO": 0.521 + } + }, + { + "model_id": "theo77186/Qwen2.5-Coder-7B-Instruct-20241106", + "name": "Qwen2.5-Coder-7B-Instruct-20241106", + "developer": "theo77186", + "scores": { + "IFEval": 0.6101, + "BBH": 0.5008, + "MATH Level 5": 0.3882, + "GPQA": 0.2919, + "MUSR": 0.4073, + "MMLU-PRO": 0.3353 + } + }, + { + "model_id": "theprint/Boptruth-Agatha-7B", + "name": "Boptruth-Agatha-7B", + "developer": "theprint", + "scores": { + "IFEval": 0.3124, + "BBH": 0.4984, + "MATH Level 5": 0.0551, + "GPQA": 0.2995, + "MUSR": 0.4277, + "MMLU-PRO": 0.2861 + } + }, + { + "model_id": "theprint/CleverBoi-7B-v2", + "name": "CleverBoi-7B-v2", + "developer": "theprint", + "scores": { + "IFEval": 0.217, + "BBH": 0.4532, + "MATH Level 5": 0.0264, + "GPQA": 0.2886, + "MUSR": 0.4695, + "MMLU-PRO": 0.2709 + } + }, + { + "model_id": "theprint/CleverBoi-7B-v3", + "name": "CleverBoi-7B-v3", + "developer": "theprint", + "scores": { + "IFEval": 0.2382, + "BBH": 0.4414, + "MATH Level 5": 0.04, + "GPQA": 0.2659, + "MUSR": 0.4072, + "MMLU-PRO": 0.2868 + } + }, + { + "model_id": "theprint/CleverBoi-Llama-3.1-8B-Instruct", + "name": "CleverBoi-Llama-3.1-8B-Instruct", + "developer": "theprint", + "scores": { + "IFEval": 0.1682, + "BBH": 0.456, + "MATH Level 5": 0.0491, + "GPQA": 0.3003, + "MUSR": 0.4014, + "MMLU-PRO": 0.3075 + } + }, + { + "model_id": "theprint/CleverBoi-Llama-3.1-8B-v2", + "name": "CleverBoi-Llama-3.1-8B-v2", + "developer": "theprint", + "scores": { + "IFEval": 0.1961, + "BBH": 0.4668, + "MATH Level 5": 0.0529, + "GPQA": 0.2861, + "MUSR": 0.3735, + "MMLU-PRO": 0.3188 + } + }, + { + "model_id": "theprint/CleverBoi-Nemo-12B-v2", + "name": "CleverBoi-Nemo-12B-v2", + "developer": "theprint", + "scores": { + "IFEval": 0.2046, + "BBH": 0.5241, + "MATH Level 5": 0.1035, + "GPQA": 0.3138, + "MUSR": 0.4187, + "MMLU-PRO": 0.3228 + } + }, + { + "model_id": "theprint/Code-Llama-Bagel-8B", + "name": "Code-Llama-Bagel-8B", + "developer": "theprint", + "scores": { + "IFEval": 0.253, + "BBH": 0.4697, + "MATH Level 5": 0.0612, + "GPQA": 0.276, + "MUSR": 0.368, + "MMLU-PRO": 0.2822 + } + }, + { + "model_id": "theprint/Conversely-Mistral-7B", + "name": "Conversely-Mistral-7B", + "developer": "theprint", + "scores": { + "IFEval": 0.2608, + "BBH": 0.4672, + "MATH Level 5": 0.0279, + "GPQA": 0.2852, + "MUSR": 0.4189, + "MMLU-PRO": 0.2826 + } + }, + { + "model_id": "theprint/Llama-3.2-3B-VanRossum", + "name": "Llama-3.2-3B-VanRossum", + "developer": "theprint", + "scores": { + "IFEval": 0.4783, + "BBH": 0.4279, + "MATH Level 5": 0.0974, + "GPQA": 0.2676, + "MUSR": 0.3442, + "MMLU-PRO": 0.277 + } + }, + { + "model_id": "theprint/ReWiz-7B", + "name": "ReWiz-7B", + "developer": "theprint", + "scores": { + "IFEval": 0.4048, + "BBH": 0.4564, + "MATH Level 5": 0.0408, + "GPQA": 0.2752, + "MUSR": 0.4612, + "MMLU-PRO": 0.267 + } + }, + { + "model_id": "theprint/ReWiz-Llama-3.1-8B-v2", + "name": "ReWiz-Llama-3.1-8B-v2", + "developer": "theprint", + "scores": { + "IFEval": 0.2379, + "BBH": 0.4632, + "MATH Level 5": 0.0574, + "GPQA": 0.3029, + "MUSR": 0.3814, + "MMLU-PRO": 0.331 + } + }, + { + "model_id": "theprint/ReWiz-Llama-3.2-3B", + "name": "ReWiz-Llama-3.2-3B", + "developer": "theprint", + "scores": { + "IFEval": 0.4649, + "BBH": 0.4343, + "MATH Level 5": 0.1095, + "GPQA": 0.2836, + "MUSR": 0.3614, + "MMLU-PRO": 0.2887 + } + }, + { + "model_id": "theprint/ReWiz-Nemo-12B-Instruct", + "name": "ReWiz-Nemo-12B-Instruct", + "developer": "theprint", + "scores": { + "IFEval": 0.1062, + "BBH": 0.5092, + "MATH Level 5": 0.1042, + "GPQA": 0.3238, + "MUSR": 0.4096, + "MMLU-PRO": 0.3339 + } + }, + { + "model_id": "theprint/ReWiz-Qwen-2.5-14B", + "name": "ReWiz-Qwen-2.5-14B", + "developer": "theprint", + "scores": { + "IFEval": 0.2785, + "BBH": 0.6179, + "MATH Level 5": 0.2923, + "GPQA": 0.38, + "MUSR": 0.4539, + "MMLU-PRO": 0.5092 + } + }, + { + "model_id": "theprint/ReWiz-Worldbuilder-7B", + "name": "ReWiz-Worldbuilder-7B", + "developer": "theprint", + "scores": { + "IFEval": 0.251, + "BBH": 0.4636, + "MATH Level 5": 0.037, + "GPQA": 0.2693, + "MUSR": 0.4572, + "MMLU-PRO": 0.2971 + } + }, + { + "model_id": "theprint/RuDolph-Hermes-7B", + "name": "RuDolph-Hermes-7B", + "developer": "theprint", + "scores": { + "IFEval": 0.3604, + "BBH": 0.5053, + "MATH Level 5": 0.0514, + "GPQA": 0.3121, + "MUSR": 0.4226, + "MMLU-PRO": 0.3073 + } + }, + { + "model_id": "theprint/WorldBuilder-12B", + "name": "WorldBuilder-12B", + "developer": "theprint", + "scores": { + "IFEval": 0.1374, + "BBH": 0.501, + "MATH Level 5": 0.0446, + "GPQA": 0.297, + "MUSR": 0.4066, + "MMLU-PRO": 0.3192 + } + }, + { + "model_id": "theprint/phi-3-mini-4k-python", + "name": "phi-3-mini-4k-python", + "developer": "theprint", + "scores": { + "IFEval": 0.2409, + "BBH": 0.4938, + "MATH Level 5": 0.105, + "GPQA": 0.2911, + "MUSR": 0.3922, + "MMLU-PRO": 0.3577 + } + }, + { + "model_id": "thinkcoder/llama3-8b-instruct-lora-8-sft", + "name": "llama3-8b-instruct-lora-8-sft", + "developer": "thinkcoder", + "scores": { + "IFEval": 0.648, + "BBH": 0.4865, + "MATH Level 5": 0.102, + "GPQA": 0.2668, + "MUSR": 0.3235, + "MMLU-PRO": 0.3476 + } + }, + { + "model_id": "thirdeyeai/elevate360m", + "name": "elevate360m", + "developer": "thirdeyeai", + "scores": { + "IFEval": 0.0445, + "BBH": 0.2963, + "MATH Level 5": 0.0159, + "GPQA": 0.2408, + "MUSR": 0.3462, + "MMLU-PRO": 0.1077 + } + }, + { + "model_id": "thomas-yanxin/XinYuan-Qwen2-1_5B", + "name": "XinYuan-Qwen2-1_5B", + "developer": "thomas-yanxin", + "scores": { + "IFEval": 0.2986, + "BBH": 0.3635, + "MATH Level 5": 0.0672, + "GPQA": 0.2701, + "MUSR": 0.3634, + "MMLU-PRO": 0.2357 + } + }, + { + "model_id": "thomas-yanxin/XinYuan-Qwen2-7B", + "name": "XinYuan-Qwen2-7B", + "developer": "thomas-yanxin", + "scores": { + "IFEval": 0.4438, + "BBH": 0.4937, + "MATH Level 5": 0.1458, + "GPQA": 0.2911, + "MUSR": 0.4058, + "MMLU-PRO": 0.3925 + } + }, + { + "model_id": "thomas-yanxin/XinYuan-Qwen2-7B-0917", + "name": "XinYuan-Qwen2-7B-0917", + "developer": "thomas-yanxin", + "scores": { + "IFEval": 0.3719, + "BBH": 0.5169, + "MATH Level 5": 0.1979, + "GPQA": 0.3096, + "MUSR": 0.4401, + "MMLU-PRO": 0.4245 + } + }, + { + "model_id": "thomas-yanxin/XinYuan-Qwen2.5-7B-0917", + "name": "XinYuan-Qwen2.5-7B-0917", + "developer": "thomas-yanxin", + "scores": { + "IFEval": 0.3577, + "BBH": 0.5184, + "MATH Level 5": 0.1934, + "GPQA": 0.281, + "MUSR": 0.3676, + "MMLU-PRO": 0.3882 + } + }, + { + "model_id": "tianyil1/MistralForCausalLM_Cal_DPO", + "name": "MistralForCausalLM_Cal_DPO", + "developer": "tianyil1", + "scores": { + "IFEval": 0.5328, + "BBH": 0.4381, + "MATH Level 5": 0.0287, + "GPQA": 0.276, + "MUSR": 0.3977, + "MMLU-PRO": 0.2763 + } + }, + { + "model_id": "tiiuae/Falcon3-10B-Base", + "name": "Falcon3-10B-Base", + "developer": "tiiuae", + "scores": { + "IFEval": 0.3648, + "BBH": 0.595, + "MATH Level 5": 0.2492, + "GPQA": 0.3456, + "MUSR": 0.4398, + "MMLU-PRO": 0.424 + } + }, + { + "model_id": "tiiuae/Falcon3-10B-Instruct", + "name": "Falcon3-10B-Instruct", + "developer": "tiiuae", + "scores": { + "IFEval": 0.7817, + "BBH": 0.617, + "MATH Level 5": 0.2764, + "GPQA": 0.3289, + "MUSR": 0.4323, + "MMLU-PRO": 0.4429 + } + }, + { + "model_id": "tiiuae/Falcon3-1B-Base", + "name": "Falcon3-1B-Base", + "developer": "tiiuae", + "scores": { + "IFEval": 0.2428, + "BBH": 0.3571, + "MATH Level 5": 0.0332, + "GPQA": 0.2794, + "MUSR": 0.4147, + "MMLU-PRO": 0.1608 + } + }, + { + "model_id": "tiiuae/Falcon3-1B-Instruct", + "name": "Falcon3-1B-Instruct", + "developer": "tiiuae", + "scores": { + "IFEval": 0.5557, + "BBH": 0.3745, + "MATH Level 5": 0.0634, + "GPQA": 0.2668, + "MUSR": 0.4189, + "MMLU-PRO": 0.1838 + } + }, + { + "model_id": "tiiuae/Falcon3-3B-Base", + "name": "Falcon3-3B-Base", + "developer": "tiiuae", + "scores": { + "IFEval": 0.2765, + "BBH": 0.4421, + "MATH Level 5": 0.1178, + "GPQA": 0.297, + "MUSR": 0.375, + "MMLU-PRO": 0.2879 + } + }, + { + "model_id": "tiiuae/Falcon3-3B-Instruct", + "name": "Falcon3-3B-Instruct", + "developer": "tiiuae", + "scores": { + "IFEval": 0.6977, + "BBH": 0.4754, + "MATH Level 5": 0.25, + "GPQA": 0.2886, + "MUSR": 0.4136, + "MMLU-PRO": 0.3005 + } + }, + { + "model_id": "tiiuae/Falcon3-7B-Base", + "name": "Falcon3-7B-Base", + "developer": "tiiuae", + "scores": { + "IFEval": 0.3416, + "BBH": 0.5099, + "MATH Level 5": 0.1941, + "GPQA": 0.3465, + "MUSR": 0.4702, + "MMLU-PRO": 0.391 + } + }, + { + "model_id": "tiiuae/Falcon3-7B-Instruct", + "name": "Falcon3-7B-Instruct", + "developer": "tiiuae", + "scores": { + "IFEval": 0.7612, + "BBH": 0.5632, + "MATH Level 5": 0.4086, + "GPQA": 0.3104, + "MUSR": 0.4827, + "MMLU-PRO": 0.4087 + } + }, + { + "model_id": "tiiuae/Falcon3-Mamba-7B-Base", + "name": "Falcon3-Mamba-7B-Base", + "developer": "tiiuae", + "scores": { + "IFEval": 0.2891, + "BBH": 0.4699, + "MATH Level 5": 0.1941, + "GPQA": 0.3096, + "MUSR": 0.3431, + "MMLU-PRO": 0.3038 + } + }, + { + "model_id": "tiiuae/Falcon3-Mamba-7B-Instruct", + "name": "Falcon3-Mamba-7B-Instruct", + "developer": "tiiuae", + "scores": { + "IFEval": 0.7165, + "BBH": 0.4679, + "MATH Level 5": 0.3006, + "GPQA": 0.3037, + "MUSR": 0.3869, + "MMLU-PRO": 0.3369 + } + }, + { + "model_id": "tiiuae/falcon-11B", + "name": "falcon-11B", + "developer": "tiiuae", + "scores": { + "IFEval": 0.3261, + "BBH": 0.4392, + "MATH Level 5": 0.0279, + "GPQA": 0.271, + "MUSR": 0.3986, + "MMLU-PRO": 0.2389 + } + }, + { + "model_id": "tiiuae/falcon-40b", + "name": "Falcon 40B", + "developer": "tiiuae", + "scores": { + "IFEval": 0.2496, + "BBH": 0.4019, + "MATH Level 5": 0.0181, + "GPQA": 0.2735, + "MUSR": 0.3631, + "MMLU-PRO": 0.2505 + } + }, + { + "model_id": "tiiuae/falcon-40b-instruct", + "name": "falcon-40b-instruct", + "developer": "tiiuae", + "scores": { + "IFEval": 0.2454, + "BBH": 0.4054, + "MATH Level 5": 0.0196, + "GPQA": 0.25, + "MUSR": 0.3762, + "MMLU-PRO": 0.2261 + } + }, + { + "model_id": "tiiuae/falcon-7b", + "name": "Falcon 7B", + "developer": "tiiuae", + "scores": { + "IFEval": 0.1821, + "BBH": 0.3285, + "MATH Level 5": 0.0098, + "GPQA": 0.245, + "MUSR": 0.3778, + "MMLU-PRO": 0.1125 + } + }, + { + "model_id": "tiiuae/falcon-7b-instruct", + "name": "falcon-7b-instruct", + "developer": "tiiuae", + "scores": { + "IFEval": 0.1969, + "BBH": 0.3203, + "MATH Level 5": 0.0121, + "GPQA": 0.2475, + "MUSR": 0.3634, + "MMLU-PRO": 0.1155 + } + }, + { + "model_id": "tiiuae/falcon-mamba-7b", + "name": "falcon-mamba-7b", + "developer": "tiiuae", + "scores": { + "IFEval": 0.3336, + "BBH": 0.4285, + "MATH Level 5": 0.0446, + "GPQA": 0.3104, + "MUSR": 0.421, + "MMLU-PRO": 0.2302 + } + }, + { + "model_id": "tinycompany/BiBo-v0.3", + "name": "BiBo-v0.3", + "developer": "tinycompany", + "scores": { + "IFEval": 0.5184, + "BBH": 0.4642, + "MATH Level 5": 0.0876, + "GPQA": 0.2676, + "MUSR": 0.395, + "MMLU-PRO": 0.2995 + } + }, + { + "model_id": "tinycompany/BiBo-v0.7", + "name": "BiBo-v0.7", + "developer": "tinycompany", + "scores": { + "IFEval": 0.3738, + "BBH": 0.4311, + "MATH Level 5": 0.0823, + "GPQA": 0.2768, + "MUSR": 0.4044, + "MMLU-PRO": 0.265 + } + }, + { + "model_id": "tinycompany/ShawtyIsBad-bgem3", + "name": "ShawtyIsBad-bgem3", + "developer": "tinycompany", + "scores": { + "IFEval": 0.2608, + "BBH": 0.3853, + "MATH Level 5": 0.0483, + "GPQA": 0.3054, + "MUSR": 0.3695, + "MMLU-PRO": 0.2583 + } + }, + { + "model_id": "tinycompany/ShawtyIsBad-e5-large", + "name": "ShawtyIsBad-e5-large", + "developer": "tinycompany", + "scores": { + "IFEval": 0.2468, + "BBH": 0.3873, + "MATH Level 5": 0.0453, + "GPQA": 0.302, + "MUSR": 0.372, + "MMLU-PRO": 0.2569 + } + }, + { + "model_id": "tinycompany/ShawtyIsBad-ib", + "name": "ShawtyIsBad-ib", + "developer": "tinycompany", + "scores": { + "IFEval": 0.2565, + "BBH": 0.388, + "MATH Level 5": 0.0491, + "GPQA": 0.2987, + "MUSR": 0.3641, + "MMLU-PRO": 0.2581 + } + }, + { + "model_id": "tinycompany/ShawtyIsBad-nomic-moe", + "name": "ShawtyIsBad-nomic-moe", + "developer": "tinycompany", + "scores": { + "IFEval": 0.2608, + "BBH": 0.3878, + "MATH Level 5": 0.0431, + "GPQA": 0.307, + "MUSR": 0.3747, + "MMLU-PRO": 0.2572 + } + }, + { + "model_id": "tinycompany/ShawtyIsBad-nomic1.5", + "name": "ShawtyIsBad-nomic1.5", + "developer": "tinycompany", + "scores": { + "IFEval": 0.2544, + "BBH": 0.3874, + "MATH Level 5": 0.0431, + "GPQA": 0.3112, + "MUSR": 0.3628, + "MMLU-PRO": 0.2567 + } + }, + { + "model_id": "tinycompany/SigmaBoi-base", + "name": "SigmaBoi-base", + "developer": "tinycompany", + "scores": { + "IFEval": 0.2447, + "BBH": 0.4314, + "MATH Level 5": 0.0778, + "GPQA": 0.2936, + "MUSR": 0.4343, + "MMLU-PRO": 0.2817 + } + }, + { + "model_id": "tinycompany/SigmaBoi-bge-m3", + "name": "SigmaBoi-bge-m3", + "developer": "tinycompany", + "scores": { + "IFEval": 0.245, + "BBH": 0.4351, + "MATH Level 5": 0.0763, + "GPQA": 0.2945, + "MUSR": 0.4383, + "MMLU-PRO": 0.2819 + } + }, + { + "model_id": "tinycompany/SigmaBoi-bgem3", + "name": "SigmaBoi-bgem3", + "developer": "tinycompany", + "scores": { + "IFEval": 0.245, + "BBH": 0.4351, + "MATH Level 5": 0.0763, + "GPQA": 0.2945, + "MUSR": 0.4383, + "MMLU-PRO": 0.2819 + } + }, + { + "model_id": "tinycompany/SigmaBoi-ib", + "name": "SigmaBoi-ib", + "developer": "tinycompany", + "scores": { + "IFEval": 0.2477, + "BBH": 0.4344, + "MATH Level 5": 0.074, + "GPQA": 0.2878, + "MUSR": 0.429, + "MMLU-PRO": 0.2824 + } + }, + { + "model_id": "tinycompany/SigmaBoi-nomic-moe", + "name": "SigmaBoi-nomic-moe", + "developer": "tinycompany", + "scores": { + "IFEval": 0.2474, + "BBH": 0.4334, + "MATH Level 5": 0.0718, + "GPQA": 0.2928, + "MUSR": 0.4316, + "MMLU-PRO": 0.2837 + } + }, + { + "model_id": "tinycompany/SigmaBoi-nomic1.5", + "name": "SigmaBoi-nomic1.5", + "developer": "tinycompany", + "scores": { + "IFEval": 0.2447, + "BBH": 0.4371, + "MATH Level 5": 0.0831, + "GPQA": 0.2961, + "MUSR": 0.4316, + "MMLU-PRO": 0.2841 + } + }, + { + "model_id": "tinycompany/SigmaBoi-nomic1.5-fp32", + "name": "SigmaBoi-nomic1.5-fp32", + "developer": "tinycompany", + "scores": { + "IFEval": 0.2462, + "BBH": 0.4371, + "MATH Level 5": 0.0831, + "GPQA": 0.2961, + "MUSR": 0.4316, + "MMLU-PRO": 0.2841 + } + }, + { + "model_id": "tinycompany/Tamed-Shawty", + "name": "Tamed-Shawty", + "developer": "tinycompany", + "scores": { + "IFEval": 0.3831, + "BBH": 0.3837, + "MATH Level 5": 0.0718, + "GPQA": 0.2626, + "MUSR": 0.3501, + "MMLU-PRO": 0.2601 + } + }, + { + "model_id": "tklohj/WindyFloLLM", + "name": "WindyFloLLM", + "developer": "tklohj", + "scores": { + "IFEval": 0.2669, + "BBH": 0.4637, + "MATH Level 5": 0.0159, + "GPQA": 0.2752, + "MUSR": 0.4253, + "MMLU-PRO": 0.2581 + } + }, + { + "model_id": "togethercomputer/GPT-JT-6B-v1", + "name": "GPT-JT-6B-v1", + "developer": "togethercomputer", + "scores": { + "IFEval": 0.2061, + "BBH": 0.3303, + "MATH Level 5": 0.0106, + "GPQA": 0.2609, + "MUSR": 0.3737, + "MMLU-PRO": 0.1626 + } + }, + { + "model_id": "togethercomputer/GPT-NeoXT-Chat-Base-20B", + "name": "GPT-NeoXT-Chat-Base-20B", + "developer": "togethercomputer", + "scores": { + "IFEval": 0.183, + "BBH": 0.3321, + "MATH Level 5": 0.0234, + "GPQA": 0.25, + "MUSR": 0.3461, + "MMLU-PRO": 0.1145 + } + }, + { + "model_id": "togethercomputer/LLaMA-2-7B-32K", + "name": "LLaMA-2-7B-32K", + "developer": "togethercomputer", + "scores": { + "IFEval": 0.1865, + "BBH": 0.34, + "MATH Level 5": 0.0144, + "GPQA": 0.25, + "MUSR": 0.3754, + "MMLU-PRO": 0.1768 + } + }, + { + "model_id": "togethercomputer/Llama-2-7B-32K-Instruct", + "name": "Llama-2-7B-32K-Instruct", + "developer": "togethercomputer", + "scores": { + "IFEval": 0.213, + "BBH": 0.3443, + "MATH Level 5": 0.0159, + "GPQA": 0.2517, + "MUSR": 0.4056, + "MMLU-PRO": 0.1781 + } + }, + { + "model_id": "togethercomputer/RedPajama-INCITE-7B-Base", + "name": "RedPajama-INCITE-7B-Base", + "developer": "togethercomputer", + "scores": { + "IFEval": 0.2082, + "BBH": 0.3195, + "MATH Level 5": 0.0159, + "GPQA": 0.255, + "MUSR": 0.362, + "MMLU-PRO": 0.1197 + } + }, + { + "model_id": "togethercomputer/RedPajama-INCITE-7B-Chat", + "name": "RedPajama-INCITE-7B-Chat", + "developer": "togethercomputer", + "scores": { + "IFEval": 0.1558, + "BBH": 0.3175, + "MATH Level 5": 0.0068, + "GPQA": 0.2525, + "MUSR": 0.3448, + "MMLU-PRO": 0.1121 + } + }, + { + "model_id": "togethercomputer/RedPajama-INCITE-7B-Instruct", + "name": "RedPajama-INCITE-7B-Instruct", + "developer": "togethercomputer", + "scores": { + "IFEval": 0.2055, + "BBH": 0.3377, + "MATH Level 5": 0.0211, + "GPQA": 0.2508, + "MUSR": 0.3685, + "MMLU-PRO": 0.1272 + } + }, + { + "model_id": "togethercomputer/RedPajama-INCITE-Base-3B-v1", + "name": "RedPajama-INCITE-Base-3B-v1", + "developer": "togethercomputer", + "scores": { + "IFEval": 0.2294, + "BBH": 0.306, + "MATH Level 5": 0.0144, + "GPQA": 0.2433, + "MUSR": 0.3739, + "MMLU-PRO": 0.1111 + } + }, + { + "model_id": "togethercomputer/RedPajama-INCITE-Chat-3B-v1", + "name": "RedPajama-INCITE-Chat-3B-v1", + "developer": "togethercomputer", + "scores": { + "IFEval": 0.1652, + "BBH": 0.3217, + "MATH Level 5": 0.0091, + "GPQA": 0.2441, + "MUSR": 0.3684, + "MMLU-PRO": 0.1127 + } + }, + { + "model_id": "togethercomputer/RedPajama-INCITE-Instruct-3B-v1", + "name": "RedPajama-INCITE-Instruct-3B-v1", + "developer": "togethercomputer", + "scores": { + "IFEval": 0.2124, + "BBH": 0.3146, + "MATH Level 5": 0.0128, + "GPQA": 0.2475, + "MUSR": 0.3886, + "MMLU-PRO": 0.111 + } + }, + { + "model_id": "tokyotech-llm/Llama-3-Swallow-8B-Instruct-v0.1", + "name": "Llama-3-Swallow-8B-Instruct-v0.1", + "developer": "tokyotech-llm", + "scores": { + "IFEval": 0.5508, + "BBH": 0.5009, + "MATH Level 5": 0.0748, + "GPQA": 0.2894, + "MUSR": 0.4357, + "MMLU-PRO": 0.3088 + } + }, + { + "model_id": "tomasmcm/sky-t1-coder-32b-flash", + "name": "sky-t1-coder-32b-flash", + "developer": "tomasmcm", + "scores": { + "IFEval": 0.778, + "BBH": 0.6822, + "MATH Level 5": 0.5423, + "GPQA": 0.3683, + "MUSR": 0.4233, + "MMLU-PRO": 0.5782 + } + }, + { + "model_id": "trthminh1112/autotrain-llama32-1b-finetune", + "name": "autotrain-llama32-1b-finetune", + "developer": "trthminh1112", + "scores": { + "IFEval": 0.1769, + "BBH": 0.2996, + "MATH Level 5": 0.0151, + "GPQA": 0.2567, + "MUSR": 0.3513, + "MMLU-PRO": 0.1099 + } + }, + { + "model_id": "tugstugi/Qwen2.5-7B-Instruct-QwQ-v0.1", + "name": "Qwen2.5-7B-Instruct-QwQ-v0.1", + "developer": "tugstugi", + "scores": { + "IFEval": 0.6017, + "BBH": 0.5101, + "MATH Level 5": 0.3814, + "GPQA": 0.2685, + "MUSR": 0.3794, + "MMLU-PRO": 0.4081 + } + }, + { + "model_id": "universalml/NepaliGPT-2.0", + "name": "NepaliGPT-2.0", + "developer": "universalml", + "scores": { + "IFEval": 0.0365, + "BBH": 0.466, + "MATH Level 5": 0.0045, + "GPQA": 0.281, + "MUSR": 0.4657, + "MMLU-PRO": 0.33 + } + }, + { + "model_id": "unsloth/Llama-3.2-1B-Instruct", + "name": "Llama-3.2-1B-Instruct", + "developer": "unsloth", + "scores": { + "IFEval": 0.581, + "BBH": 0.3485, + "MATH Level 5": 0.0823, + "GPQA": 0.2676, + "MUSR": 0.3196, + "MMLU-PRO": 0.1742 + } + }, + { + "model_id": "unsloth/Llama-3.2-1B-Instruct-no-system-message", + "name": "Llama-3.2-1B-Instruct-no-system-message", + "developer": "unsloth", + "scores": { + "IFEval": 0.565, + "BBH": 0.3544, + "MATH Level 5": 0.0755, + "GPQA": 0.2727, + "MUSR": 0.3341, + "MMLU-PRO": 0.1669 + } + }, + { + "model_id": "unsloth/Phi-3-mini-4k-instruct", + "name": "Phi-3-mini-4k-instruct", + "developer": "unsloth", + "scores": { + "IFEval": 0.544, + "BBH": 0.55, + "MATH Level 5": 0.1639, + "GPQA": 0.323, + "MUSR": 0.4284, + "MMLU-PRO": 0.4031 + } + }, + { + "model_id": "unsloth/phi-4", + "name": "phi-4", + "developer": "unsloth", + "scores": { + "IFEval": 0.6882, + "BBH": 0.6886, + "MATH Level 5": 0.5, + "GPQA": 0.3364, + "MUSR": 0.4114, + "MMLU-PRO": 0.5378 + } + }, + { + "model_id": "unsloth/phi-4-bnb-4bit", + "name": "phi-4-bnb-4bit", + "developer": "unsloth", + "scores": { + "IFEval": 0.673, + "BBH": 0.677, + "MATH Level 5": 0.4607, + "GPQA": 0.3381, + "MUSR": 0.4007, + "MMLU-PRO": 0.5256 + } + }, + { + "model_id": "unsloth/phi-4-unsloth-bnb-4bit", + "name": "phi-4-unsloth-bnb-4bit", + "developer": "unsloth", + "scores": { + "IFEval": 0.6794, + "BBH": 0.6791, + "MATH Level 5": 0.4562, + "GPQA": 0.3364, + "MUSR": 0.4034, + "MMLU-PRO": 0.5286 + } + }, + { + "model_id": "upstage/SOLAR-10.7B-Instruct-v1.0", + "name": "SOLAR-10.7B-Instruct-v1.0", + "developer": "upstage", + "scores": { + "IFEval": 0.4737, + "BBH": 0.5162, + "MATH Level 5": 0.0566, + "GPQA": 0.3087, + "MUSR": 0.3899, + "MMLU-PRO": 0.3138 + } + }, + { + "model_id": "upstage/SOLAR-10.7B-v1.0", + "name": "SOLAR-10.7B-v1.0", + "developer": "upstage", + "scores": { + "IFEval": 0.2421, + "BBH": 0.5094, + "MATH Level 5": 0.0264, + "GPQA": 0.281, + "MUSR": 0.4372, + "MMLU-PRO": 0.34 + } + }, + { + "model_id": "upstage/solar-pro-preview-instruct", + "name": "solar-pro-preview-instruct", + "developer": "upstage", + "scores": { + "IFEval": 0.8416, + "BBH": 0.6817, + "MATH Level 5": 0.2205, + "GPQA": 0.3708, + "MUSR": 0.4417, + "MMLU-PRO": 0.5273 + } + }, + { + "model_id": "utkmst/chimera-beta-test2-lora-merged", + "name": "chimera-beta-test2-lora-merged", + "developer": "utkmst", + "scores": { + "IFEval": 0.6054, + "BBH": 0.4796, + "MATH Level 5": 0.0952, + "GPQA": 0.3037, + "MUSR": 0.4118, + "MMLU-PRO": 0.2992 + } + }, + { + "model_id": "uukuguy/speechless-code-mistral-7b-v1.0", + "name": "speechless-code-mistral-7b-v1.0", + "developer": "uukuguy", + "scores": { + "IFEval": 0.3665, + "BBH": 0.4572, + "MATH Level 5": 0.0521, + "GPQA": 0.2844, + "MUSR": 0.4502, + "MMLU-PRO": 0.3146 + } + }, + { + "model_id": "uukuguy/speechless-codellama-34b-v2.0", + "name": "speechless-codellama-34b-v2.0", + "developer": "uukuguy", + "scores": { + "IFEval": 0.4604, + "BBH": 0.4813, + "MATH Level 5": 0.0431, + "GPQA": 0.2693, + "MUSR": 0.3787, + "MMLU-PRO": 0.2542 + } + }, + { + "model_id": "uukuguy/speechless-coder-ds-6.7b", + "name": "speechless-coder-ds-6.7b", + "developer": "uukuguy", + "scores": { + "IFEval": 0.2505, + "BBH": 0.4036, + "MATH Level 5": 0.0211, + "GPQA": 0.2643, + "MUSR": 0.3819, + "MMLU-PRO": 0.1719 + } + }, + { + "model_id": "uukuguy/speechless-instruct-mistral-7b-v0.2", + "name": "speechless-instruct-mistral-7b-v0.2", + "developer": "uukuguy", + "scores": { + "IFEval": 0.3261, + "BBH": 0.4607, + "MATH Level 5": 0.0491, + "GPQA": 0.2819, + "MUSR": 0.4902, + "MMLU-PRO": 0.2902 + } + }, + { + "model_id": "uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b", + "name": "speechless-llama2-hermes-orca-platypus-wizardlm-13b", + "developer": "uukuguy", + "scores": { + "IFEval": 0.4562, + "BBH": 0.4846, + "MATH Level 5": 0.0204, + "GPQA": 0.2701, + "MUSR": 0.4655, + "MMLU-PRO": 0.2559 + } + }, + { + "model_id": "uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b", + "name": "speechless-mistral-dolphin-orca-platypus-samantha-7b", + "developer": "uukuguy", + "scores": { + "IFEval": 0.37, + "BBH": 0.4983, + "MATH Level 5": 0.0295, + "GPQA": 0.2836, + "MUSR": 0.4361, + "MMLU-PRO": 0.299 + } + }, + { + "model_id": "uukuguy/speechless-zephyr-code-functionary-7b", + "name": "speechless-zephyr-code-functionary-7b", + "developer": "uukuguy", + "scores": { + "IFEval": 0.2696, + "BBH": 0.4664, + "MATH Level 5": 0.0423, + "GPQA": 0.3003, + "MUSR": 0.4268, + "MMLU-PRO": 0.3094 + } + }, + { + "model_id": "v000000/L3-8B-Stheno-v3.2-abliterated", + "name": "L3-8B-Stheno-v3.2-abliterated", + "developer": "v000000", + "scores": { + "IFEval": 0.6718, + "BBH": 0.5141, + "MATH Level 5": 0.0695, + "GPQA": 0.3096, + "MUSR": 0.362, + "MMLU-PRO": 0.3604 + } + }, + { + "model_id": "v000000/L3.1-Niitorm-8B-DPO-t0.0001", + "name": "L3.1-Niitorm-8B-DPO-t0.0001", + "developer": "v000000", + "scores": { + "IFEval": 0.7689, + "BBH": 0.5134, + "MATH Level 5": 0.1624, + "GPQA": 0.2945, + "MUSR": 0.388, + "MMLU-PRO": 0.3866 + } + }, + { + "model_id": "v000000/L3.1-Storniitova-8B", + "name": "L3.1-Storniitova-8B", + "developer": "v000000", + "scores": { + "IFEval": 0.7817, + "BBH": 0.5151, + "MATH Level 5": 0.1465, + "GPQA": 0.2894, + "MUSR": 0.4029, + "MMLU-PRO": 0.3776 + } + }, + { + "model_id": "v000000/Qwen2.5-14B-Gutenberg-1e-Delta", + "name": "Qwen2.5-14B-Gutenberg-1e-Delta", + "developer": "v000000", + "scores": { + "IFEval": 0.8045, + "BBH": 0.6398, + "MATH Level 5": 0.5264, + "GPQA": 0.3289, + "MUSR": 0.4073, + "MMLU-PRO": 0.493 + } + }, + { + "model_id": "v000000/Qwen2.5-14B-Gutenberg-Instruct-Slerpeno", + "name": "Qwen2.5-14B-Gutenberg-Instruct-Slerpeno", + "developer": "v000000", + "scores": { + "IFEval": 0.8197, + "BBH": 0.639, + "MATH Level 5": 0.5325, + "GPQA": 0.3314, + "MUSR": 0.4114, + "MMLU-PRO": 0.4924 + } + }, + { + "model_id": "v000000/Qwen2.5-Lumen-14B", + "name": "Qwen2.5-Lumen-14B", + "developer": "v000000", + "scores": { + "IFEval": 0.8064, + "BBH": 0.6391, + "MATH Level 5": 0.5363, + "GPQA": 0.328, + "MUSR": 0.4114, + "MMLU-PRO": 0.4903 + } + }, + { + "model_id": "vhab10/Llama-3.1-8B-Base-Instruct-SLERP", + "name": "Llama-3.1-8B-Base-Instruct-SLERP", + "developer": "vhab10", + "scores": { + "IFEval": 0.2907, + "BBH": 0.5057, + "MATH Level 5": 0.1201, + "GPQA": 0.2961, + "MUSR": 0.4011, + "MMLU-PRO": 0.3621 + } + }, + { + "model_id": "vhab10/Llama-3.2-Instruct-3B-TIES", + "name": "Llama-3.2-Instruct-3B-TIES", + "developer": "vhab10", + "scores": { + "IFEval": 0.4727, + "BBH": 0.4332, + "MATH Level 5": 0.0982, + "GPQA": 0.2693, + "MUSR": 0.3497, + "MMLU-PRO": 0.2916 + } + }, + { + "model_id": "vhab10/llama-3-8b-merged-linear", + "name": "llama-3-8b-merged-linear", + "developer": "vhab10", + "scores": { + "IFEval": 0.5917, + "BBH": 0.4937, + "MATH Level 5": 0.0816, + "GPQA": 0.2995, + "MUSR": 0.4191, + "MMLU-PRO": 0.3704 + } + }, + { + "model_id": "vicgalle/CarbonBeagle-11B", + "name": "CarbonBeagle-11B", + "developer": "vicgalle", + "scores": { + "IFEval": 0.5415, + "BBH": 0.5294, + "MATH Level 5": 0.0619, + "GPQA": 0.302, + "MUSR": 0.402, + "MMLU-PRO": 0.3276 + } + }, + { + "model_id": "vicgalle/CarbonBeagle-11B-truthy", + "name": "CarbonBeagle-11B-truthy", + "developer": "vicgalle", + "scores": { + "IFEval": 0.5212, + "BBH": 0.5348, + "MATH Level 5": 0.0491, + "GPQA": 0.2995, + "MUSR": 0.374, + "MMLU-PRO": 0.3357 + } + }, + { + "model_id": "vicgalle/Configurable-Hermes-2-Pro-Llama-3-8B", + "name": "Configurable-Hermes-2-Pro-Llama-3-8B", + "developer": "vicgalle", + "scores": { + "IFEval": 0.5763, + "BBH": 0.5055, + "MATH Level 5": 0.0763, + "GPQA": 0.297, + "MUSR": 0.4184, + "MMLU-PRO": 0.3098 + } + }, + { + "model_id": "vicgalle/Configurable-Llama-3.1-8B-Instruct", + "name": "Configurable-Llama-3.1-8B-Instruct", + "developer": "vicgalle", + "scores": { + "IFEval": 0.8312, + "BBH": 0.5045, + "MATH Level 5": 0.173, + "GPQA": 0.2743, + "MUSR": 0.3845, + "MMLU-PRO": 0.3592 + } + }, + { + "model_id": "vicgalle/Configurable-Yi-1.5-9B-Chat", + "name": "Configurable-Yi-1.5-9B-Chat", + "developer": "vicgalle", + "scores": { + "IFEval": 0.4323, + "BBH": 0.5452, + "MATH Level 5": 0.2047, + "GPQA": 0.3431, + "MUSR": 0.4271, + "MMLU-PRO": 0.4015 + } + }, + { + "model_id": "vicgalle/ConfigurableBeagle-11B", + "name": "ConfigurableBeagle-11B", + "developer": "vicgalle", + "scores": { + "IFEval": 0.5834, + "BBH": 0.5287, + "MATH Level 5": 0.0431, + "GPQA": 0.302, + "MUSR": 0.3953, + "MMLU-PRO": 0.3374 + } + }, + { + "model_id": "vicgalle/ConfigurableHermes-7B", + "name": "ConfigurableHermes-7B", + "developer": "vicgalle", + "scores": { + "IFEval": 0.5411, + "BBH": 0.4573, + "MATH Level 5": 0.0476, + "GPQA": 0.2768, + "MUSR": 0.4057, + "MMLU-PRO": 0.3025 + } + }, + { + "model_id": "vicgalle/ConfigurableSOLAR-10.7B", + "name": "ConfigurableSOLAR-10.7B", + "developer": "vicgalle", + "scores": { + "IFEval": 0.51, + "BBH": 0.4867, + "MATH Level 5": 0.0665, + "GPQA": 0.2987, + "MUSR": 0.3805, + "MMLU-PRO": 0.3173 + } + }, + { + "model_id": "vicgalle/Humanish-RP-Llama-3.1-8B", + "name": "Humanish-RP-Llama-3.1-8B", + "developer": "vicgalle", + "scores": { + "IFEval": 0.6669, + "BBH": 0.51, + "MATH Level 5": 0.1518, + "GPQA": 0.2869, + "MUSR": 0.3952, + "MMLU-PRO": 0.3477 + } + }, + { + "model_id": "vicgalle/Merge-Mistral-Prometheus-7B", + "name": "Merge-Mistral-Prometheus-7B", + "developer": "vicgalle", + "scores": { + "IFEval": 0.4848, + "BBH": 0.4201, + "MATH Level 5": 0.0181, + "GPQA": 0.2634, + "MUSR": 0.41, + "MMLU-PRO": 0.2717 + } + }, + { + "model_id": "vicgalle/Merge-Mixtral-Prometheus-8x7B", + "name": "Merge-Mixtral-Prometheus-8x7B", + "developer": "vicgalle", + "scores": { + "IFEval": 0.5744, + "BBH": 0.5351, + "MATH Level 5": 0.0929, + "GPQA": 0.3087, + "MUSR": 0.4098, + "MMLU-PRO": 0.3684 + } + }, + { + "model_id": "vicgalle/Roleplay-Llama-3-8B", + "name": "Roleplay-Llama-3-8B", + "developer": "vicgalle", + "scores": { + "IFEval": 0.732, + "BBH": 0.5012, + "MATH Level 5": 0.0914, + "GPQA": 0.2609, + "MUSR": 0.3529, + "MMLU-PRO": 0.3708 + } + }, + { + "model_id": "viettelsecurity-ai/security-llama3.2-3b", + "name": "security-llama3.2-3b", + "developer": "viettelsecurity-ai", + "scores": { + "IFEval": 0.5909, + "BBH": 0.4401, + "MATH Level 5": 0.1261, + "GPQA": 0.2743, + "MUSR": 0.3379, + "MMLU-PRO": 0.2837 + } + }, + { + "model_id": "vihangd/smart-dan-sft-v0.1", + "name": "smart-dan-sft-v0.1", + "developer": "vihangd", + "scores": { + "IFEval": 0.1576, + "BBH": 0.3062, + "MATH Level 5": 0.0098, + "GPQA": 0.255, + "MUSR": 0.3502, + "MMLU-PRO": 0.1142 + } + }, + { + "model_id": "voidful/smol-360m-ft", + "name": "smol-360m-ft", + "developer": "voidful", + "scores": { + "IFEval": 0.2013, + "BBH": 0.3012, + "MATH Level 5": 0.0083, + "GPQA": 0.2458, + "MUSR": 0.3714, + "MMLU-PRO": 0.1087 + } + }, + { + "model_id": "vonjack/MobileLLM-125M-HF", + "name": "MobileLLM-125M-HF", + "developer": "vonjack", + "scores": { + "IFEval": 0.2107, + "BBH": 0.3027, + "MATH Level 5": 0.0091, + "GPQA": 0.2601, + "MUSR": 0.3782, + "MMLU-PRO": 0.1164 + } + }, + { + "model_id": "vonjack/Phi-3-mini-4k-instruct-LLaMAfied", + "name": "Phi-3-mini-4k-instruct-LLaMAfied", + "developer": "vonjack", + "scores": { + "IFEval": 0.5787, + "BBH": 0.5741, + "MATH Level 5": 0.1382, + "GPQA": 0.3305, + "MUSR": 0.3924, + "MMLU-PRO": 0.3885 + } + }, + { + "model_id": "vonjack/Phi-3.5-mini-instruct-hermes-fc-json", + "name": "Phi-3.5-mini-instruct-hermes-fc-json", + "developer": "vonjack", + "scores": { + "IFEval": 0.1416, + "BBH": 0.2975, + "MATH Level 5": 0.0076, + "GPQA": 0.2542, + "MUSR": 0.4041, + "MMLU-PRO": 0.1139 + } + }, + { + "model_id": "vonjack/Qwen2.5-Coder-0.5B-Merged", + "name": "Qwen2.5-Coder-0.5B-Merged", + "developer": "vonjack", + "scores": { + "IFEval": 0.31, + "BBH": 0.3076, + "MATH Level 5": 0.0378, + "GPQA": 0.2534, + "MUSR": 0.3303, + "MMLU-PRO": 0.1202 + } + }, + { + "model_id": "vonjack/SmolLM2-1.7B-Merged", + "name": "SmolLM2-1.7B-Merged", + "developer": "vonjack", + "scores": { + "IFEval": 0.3698, + "BBH": 0.3587, + "MATH Level 5": 0.0627, + "GPQA": 0.2794, + "MUSR": 0.3408, + "MMLU-PRO": 0.2048 + } + }, + { + "model_id": "vonjack/SmolLM2-135M-Merged", + "name": "SmolLM2-135M-Merged", + "developer": "vonjack", + "scores": { + "IFEval": 0.2483, + "BBH": 0.31, + "MATH Level 5": 0.0113, + "GPQA": 0.2383, + "MUSR": 0.3662, + "MMLU-PRO": 0.1112 + } + }, + { + "model_id": "vonjack/SmolLM2-360M-Merged", + "name": "SmolLM2-360M-Merged", + "developer": "vonjack", + "scores": { + "IFEval": 0.3206, + "BBH": 0.3155, + "MATH Level 5": 0.0174, + "GPQA": 0.2559, + "MUSR": 0.3527, + "MMLU-PRO": 0.1098 + } + }, + { + "model_id": "w4r10ck/SOLAR-10.7B-Instruct-v1.0-uncensored", + "name": "SOLAR-10.7B-Instruct-v1.0-uncensored", + "developer": "w4r10ck", + "scores": { + "IFEval": 0.3884, + "BBH": 0.5302, + "MATH Level 5": 0.0657, + "GPQA": 0.2945, + "MUSR": 0.4639, + "MMLU-PRO": 0.3344 + } + }, + { + "model_id": "wanlige/li-14b-v0.4", + "name": "li-14b-v0.4", + "developer": "wanlige", + "scores": { + "IFEval": 0.8133, + "BBH": 0.6544, + "MATH Level 5": 0.5574, + "GPQA": 0.3389, + "MUSR": 0.446, + "MMLU-PRO": 0.5167 + } + }, + { + "model_id": "wanlige/li-14b-v0.4-slerp", + "name": "li-14b-v0.4-slerp", + "developer": "wanlige", + "scores": { + "IFEval": 0.4606, + "BBH": 0.6587, + "MATH Level 5": 0.4192, + "GPQA": 0.4002, + "MUSR": 0.4768, + "MMLU-PRO": 0.5372 + } + }, + { + "model_id": "wanlige/li-14b-v0.4-slerp0.1", + "name": "li-14b-v0.4-slerp0.1", + "developer": "wanlige", + "scores": { + "IFEval": 0.7923, + "BBH": 0.6572, + "MATH Level 5": 0.5332, + "GPQA": 0.3591, + "MUSR": 0.4207, + "MMLU-PRO": 0.5294 + } + }, + { + "model_id": "wannaphong/KhanomTanLLM-Instruct", + "name": "KhanomTanLLM-Instruct", + "developer": "wannaphong", + "scores": { + "IFEval": 0.1621, + "BBH": 0.3093, + "MATH Level 5": 0.0136, + "GPQA": 0.2634, + "MUSR": 0.3701, + "MMLU-PRO": 0.1119 + } + }, + { + "model_id": "waqasali1707/Beast-Soul-new", + "name": "Beast-Soul-new", + "developer": "waqasali1707", + "scores": { + "IFEval": 0.503, + "BBH": 0.5225, + "MATH Level 5": 0.0702, + "GPQA": 0.2827, + "MUSR": 0.4486, + "MMLU-PRO": 0.3108 + } + }, + { + "model_id": "wave-on-discord/qwent-7b", + "name": "qwent-7b", + "developer": "wave-on-discord", + "scores": { + "IFEval": 0.2015, + "BBH": 0.4228, + "MATH Level 5": 0.0038, + "GPQA": 0.2651, + "MUSR": 0.3817, + "MMLU-PRO": 0.1603 + } + }, + { + "model_id": "weathermanj/Menda-3B-500", + "name": "Menda-3B-500", + "developer": "weathermanj", + "scores": { + "IFEval": 0.6353, + "BBH": 0.4766, + "MATH Level 5": 0.3724, + "GPQA": 0.2878, + "MUSR": 0.3968, + "MMLU-PRO": 0.3475 + } + }, + { + "model_id": "weathermanj/Menda-3b-750", + "name": "Menda-3b-750", + "developer": "weathermanj", + "scores": { + "IFEval": 0.6335, + "BBH": 0.4737, + "MATH Level 5": 0.3716, + "GPQA": 0.2878, + "MUSR": 0.3942, + "MMLU-PRO": 0.3506 + } + }, + { + "model_id": "weathermanj/Menda-3b-Optim-100", + "name": "Menda-3b-Optim-100", + "developer": "weathermanj", + "scores": { + "IFEval": 0.6398, + "BBH": 0.4735, + "MATH Level 5": 0.3716, + "GPQA": 0.2894, + "MUSR": 0.3993, + "MMLU-PRO": 0.3461 + } + }, + { + "model_id": "weathermanj/Menda-3b-Optim-200", + "name": "Menda-3b-Optim-200", + "developer": "weathermanj", + "scores": { + "IFEval": 0.6375, + "BBH": 0.4746, + "MATH Level 5": 0.3731, + "GPQA": 0.2827, + "MUSR": 0.4033, + "MMLU-PRO": 0.3484 + } + }, + { + "model_id": "win10/ArliAI-RPMax-v1.3-merge-13.3B", + "name": "ArliAI-RPMax-v1.3-merge-13.3B", + "developer": "win10", + "scores": { + "IFEval": 0.3038, + "BBH": 0.4581, + "MATH Level 5": 0.0393, + "GPQA": 0.2743, + "MUSR": 0.4325, + "MMLU-PRO": 0.32 + } + }, + { + "model_id": "win10/Breeze-13B-32k-Instruct-v1_0", + "name": "Breeze-13B-32k-Instruct-v1_0", + "developer": "win10", + "scores": { + "IFEval": 0.3584, + "BBH": 0.4611, + "MATH Level 5": 0.0128, + "GPQA": 0.2643, + "MUSR": 0.4202, + "MMLU-PRO": 0.2568 + } + }, + { + "model_id": "win10/EVA-Norns-Qwen2.5-v0.1", + "name": "EVA-Norns-Qwen2.5-v0.1", + "developer": "win10", + "scores": { + "IFEval": 0.622, + "BBH": 0.5072, + "MATH Level 5": 0.2613, + "GPQA": 0.2852, + "MUSR": 0.4045, + "MMLU-PRO": 0.3425 + } + }, + { + "model_id": "win10/Llama-3.2-3B-Instruct-24-9-29", + "name": "Llama-3.2-3B-Instruct-24-9-29", + "developer": "win10", + "scores": { + "IFEval": 0.7332, + "BBH": 0.4614, + "MATH Level 5": 0.1707, + "GPQA": 0.2743, + "MUSR": 0.3555, + "MMLU-PRO": 0.3228 + } + }, + { + "model_id": "win10/Norns-Qwen2.5-12B", + "name": "Norns-Qwen2.5-12B", + "developer": "win10", + "scores": { + "IFEval": 0.4897, + "BBH": 0.4619, + "MATH Level 5": 0.0838, + "GPQA": 0.2836, + "MUSR": 0.3555, + "MMLU-PRO": 0.266 + } + }, + { + "model_id": "win10/Norns-Qwen2.5-7B", + "name": "Norns-Qwen2.5-7B", + "developer": "win10", + "scores": { + "IFEval": 0.6122, + "BBH": 0.5073, + "MATH Level 5": 0.2628, + "GPQA": 0.2844, + "MUSR": 0.4085, + "MMLU-PRO": 0.3413 + } + }, + { + "model_id": "win10/Qwen2.5-2B-Instruct", + "name": "Qwen2.5-2B-Instruct", + "developer": "win10", + "scores": { + "IFEval": 0.2273, + "BBH": 0.3706, + "MATH Level 5": 0.0227, + "GPQA": 0.2676, + "MUSR": 0.4378, + "MMLU-PRO": 0.1934 + } + }, + { + "model_id": "win10/llama3-13.45b-Instruct", + "name": "llama3-13.45b-Instruct", + "developer": "win10", + "scores": { + "IFEval": 0.4144, + "BBH": 0.4865, + "MATH Level 5": 0.0242, + "GPQA": 0.2584, + "MUSR": 0.3848, + "MMLU-PRO": 0.3345 + } + }, + { + "model_id": "win10/miscii-14b-1M-0128", + "name": "miscii-14b-1M-0128", + "developer": "win10", + "scores": { + "IFEval": 0.4181, + "BBH": 0.5742, + "MATH Level 5": 0.4773, + "GPQA": 0.3826, + "MUSR": 0.5431, + "MMLU-PRO": 0.4491 + } + }, + { + "model_id": "winglian/Llama-3-8b-64k-PoSE", + "name": "Llama-3-8b-64k-PoSE", + "developer": "winglian", + "scores": { + "IFEval": 0.2857, + "BBH": 0.3702, + "MATH Level 5": 0.0415, + "GPQA": 0.2609, + "MUSR": 0.3396, + "MMLU-PRO": 0.2467 + } + }, + { + "model_id": "winglian/llama-3-8b-256k-PoSE", + "name": "llama-3-8b-256k-PoSE", + "developer": "winglian", + "scores": { + "IFEval": 0.2909, + "BBH": 0.3157, + "MATH Level 5": 0.0196, + "GPQA": 0.2576, + "MUSR": 0.3316, + "MMLU-PRO": 0.1116 + } + }, + { + "model_id": "wzhouad/gemma-2-9b-it-WPO-HB", + "name": "gemma-2-9b-it-WPO-HB", + "developer": "wzhouad", + "scores": { + "IFEval": 0.5437, + "BBH": 0.5629, + "MATH Level 5": 0.1533, + "GPQA": 0.3498, + "MUSR": 0.3675, + "MMLU-PRO": 0.336 + } + }, + { + "model_id": "x0000001/Deepseek-Lumen-R1-Qwen2.5-14B", + "name": "Deepseek-Lumen-R1-Qwen2.5-14B", + "developer": "x0000001", + "scores": { + "IFEval": 0.4436, + "BBH": 0.4569, + "MATH Level 5": 0.2779, + "GPQA": 0.2852, + "MUSR": 0.474, + "MMLU-PRO": 0.4379 + } + }, + { + "model_id": "xMaulana/FinMatcha-3B-Instruct", + "name": "FinMatcha-3B-Instruct", + "developer": "xMaulana", + "scores": { + "IFEval": 0.7548, + "BBH": 0.4536, + "MATH Level 5": 0.1435, + "GPQA": 0.2693, + "MUSR": 0.3633, + "MMLU-PRO": 0.3182 + } + }, + { + "model_id": "xinchen9/Llama3.1_8B_Instruct_CoT", + "name": "Llama3.1_8B_Instruct_CoT", + "developer": "xinchen9", + "scores": { + "IFEval": 0.2974, + "BBH": 0.4398, + "MATH Level 5": 0.0604, + "GPQA": 0.302, + "MUSR": 0.4371, + "MMLU-PRO": 0.2879 + } + }, + { + "model_id": "xinchen9/Llama3.1_CoT", + "name": "Llama3.1_CoT", + "developer": "xinchen9", + "scores": { + "IFEval": 0.2246, + "BBH": 0.4341, + "MATH Level 5": 0.0385, + "GPQA": 0.2886, + "MUSR": 0.4305, + "MMLU-PRO": 0.2739 + } + }, + { + "model_id": "xinchen9/Llama3.1_CoT_V1", + "name": "Llama3.1_CoT_V1", + "developer": "xinchen9", + "scores": { + "IFEval": 0.2453, + "BBH": 0.4376, + "MATH Level 5": 0.0332, + "GPQA": 0.2794, + "MUSR": 0.4572, + "MMLU-PRO": 0.2805 + } + }, + { + "model_id": "xinchen9/Mistral-7B-CoT", + "name": "Mistral-7B-CoT", + "developer": "xinchen9", + "scores": { + "IFEval": 0.2783, + "BBH": 0.3873, + "MATH Level 5": 0.0249, + "GPQA": 0.2492, + "MUSR": 0.3994, + "MMLU-PRO": 0.2284 + } + }, + { + "model_id": "xinchen9/llama3-b8-ft-dis", + "name": "llama3-b8-ft-dis", + "developer": "xinchen9", + "scores": { + "IFEval": 0.1546, + "BBH": 0.4626, + "MATH Level 5": 0.0393, + "GPQA": 0.3129, + "MUSR": 0.3654, + "MMLU-PRO": 0.3244 + } + }, + { + "model_id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table", + "developer": "xkp24", + "scores": { + "IFEval": 0.6375, + "BBH": 0.4912, + "MATH Level 5": 0.0921, + "GPQA": 0.2592, + "MUSR": 0.382, + "MMLU-PRO": 0.3686 + } + }, + { + "model_id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table", + "developer": "xkp24", + "scores": { + "IFEval": 0.7275, + "BBH": 0.5057, + "MATH Level 5": 0.0846, + "GPQA": 0.2601, + "MUSR": 0.3819, + "MMLU-PRO": 0.3697 + } + }, + { + "model_id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table", + "developer": "xkp24", + "scores": { + "IFEval": 0.6569, + "BBH": 0.4952, + "MATH Level 5": 0.0891, + "GPQA": 0.2592, + "MUSR": 0.3594, + "MMLU-PRO": 0.3702 + } + }, + { + "model_id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table", + "developer": "xkp24", + "scores": { + "IFEval": 0.6621, + "BBH": 0.5004, + "MATH Level 5": 0.0861, + "GPQA": 0.2592, + "MUSR": 0.3805, + "MMLU-PRO": 0.36 + } + }, + { + "model_id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001", + "developer": "xkp24", + "scores": { + "IFEval": 0.6042, + "BBH": 0.4936, + "MATH Level 5": 0.0997, + "GPQA": 0.2592, + "MUSR": 0.3793, + "MMLU-PRO": 0.3708 + } + }, + { + "model_id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002", + "developer": "xkp24", + "scores": { + "IFEval": 0.7132, + "BBH": 0.4996, + "MATH Level 5": 0.0853, + "GPQA": 0.2584, + "MUSR": 0.3872, + "MMLU-PRO": 0.3664 + } + }, + { + "model_id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001", + "developer": "xkp24", + "scores": { + "IFEval": 0.5947, + "BBH": 0.4899, + "MATH Level 5": 0.1073, + "GPQA": 0.2592, + "MUSR": 0.3581, + "MMLU-PRO": 0.3704 + } + }, + { + "model_id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002", + "developer": "xkp24", + "scores": { + "IFEval": 0.6453, + "BBH": 0.4951, + "MATH Level 5": 0.0937, + "GPQA": 0.2601, + "MUSR": 0.3939, + "MMLU-PRO": 0.353 + } + }, + { + "model_id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table", + "developer": "xukp20", + "scores": { + "IFEval": 0.5756, + "BBH": 0.4901, + "MATH Level 5": 0.0997, + "GPQA": 0.2592, + "MUSR": 0.366, + "MMLU-PRO": 0.3659 + } + }, + { + "model_id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table", + "developer": "xukp20", + "scores": { + "IFEval": 0.7034, + "BBH": 0.5092, + "MATH Level 5": 0.0967, + "GPQA": 0.2592, + "MUSR": 0.3739, + "MMLU-PRO": 0.3693 + } + }, + { + "model_id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table", + "developer": "xukp20", + "scores": { + "IFEval": 0.6024, + "BBH": 0.497, + "MATH Level 5": 0.1042, + "GPQA": 0.2592, + "MUSR": 0.3674, + "MMLU-PRO": 0.3658 + } + }, + { + "model_id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table", + "developer": "xukp20", + "scores": { + "IFEval": 0.662, + "BBH": 0.5, + "MATH Level 5": 0.0937, + "GPQA": 0.2592, + "MUSR": 0.3818, + "MMLU-PRO": 0.3615 + } + }, + { + "model_id": "xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001", + "developer": "xukp20", + "scores": { + "IFEval": 0.5336, + "BBH": 0.4915, + "MATH Level 5": 0.0982, + "GPQA": 0.2592, + "MUSR": 0.378, + "MMLU-PRO": 0.3625 + } + }, + { + "model_id": "xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002", + "developer": "xukp20", + "scores": { + "IFEval": 0.6852, + "BBH": 0.5075, + "MATH Level 5": 0.0718, + "GPQA": 0.2584, + "MUSR": 0.3832, + "MMLU-PRO": 0.3621 + } + }, + { + "model_id": "xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001", + "developer": "xukp20", + "scores": { + "IFEval": 0.5482, + "BBH": 0.4887, + "MATH Level 5": 0.0891, + "GPQA": 0.2609, + "MUSR": 0.3633, + "MMLU-PRO": 0.3671 + } + }, + { + "model_id": "xukp20/llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table", + "name": "llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table", + "developer": "xukp20", + "scores": { + "IFEval": 0.69, + "BBH": 0.4978, + "MATH Level 5": 0.105, + "GPQA": 0.2592, + "MUSR": 0.3673, + "MMLU-PRO": 0.3716 + } + }, + { + "model_id": "xwen-team/Xwen-7B-Chat", + "name": "Xwen-7B-Chat", + "developer": "xwen-team", + "scores": { + "IFEval": 0.6864, + "BBH": 0.5068, + "MATH Level 5": 0.4509, + "GPQA": 0.2609, + "MUSR": 0.3914, + "MMLU-PRO": 0.429 + } + }, + { + "model_id": "xxx777xxxASD/L3.1-ClaudeMaid-4x8B", + "name": "L3.1-ClaudeMaid-4x8B", + "developer": "xxx777xxxASD", + "scores": { + "IFEval": 0.6696, + "BBH": 0.5071, + "MATH Level 5": 0.1412, + "GPQA": 0.2911, + "MUSR": 0.4289, + "MMLU-PRO": 0.358 + } + }, + { + "model_id": "yam-peleg/Hebrew-Gemma-11B-Instruct", + "name": "Hebrew-Gemma-11B-Instruct", + "developer": "yam-peleg", + "scores": { + "IFEval": 0.3021, + "BBH": 0.4036, + "MATH Level 5": 0.0657, + "GPQA": 0.276, + "MUSR": 0.4089, + "MMLU-PRO": 0.2554 + } + }, + { + "model_id": "yam-peleg/Hebrew-Mistral-7B", + "name": "Hebrew-Mistral-7B", + "developer": "yam-peleg", + "scores": { + "IFEval": 0.2328, + "BBH": 0.4334, + "MATH Level 5": 0.0498, + "GPQA": 0.2794, + "MUSR": 0.3977, + "MMLU-PRO": 0.278 + } + }, + { + "model_id": "yam-peleg/Hebrew-Mistral-7B-200K", + "name": "Hebrew-Mistral-7B-200K", + "developer": "yam-peleg", + "scores": { + "IFEval": 0.177, + "BBH": 0.3411, + "MATH Level 5": 0.031, + "GPQA": 0.2534, + "MUSR": 0.374, + "MMLU-PRO": 0.2529 + } + }, + { + "model_id": "yanng1242/Marcoro14-7B-slerp", + "name": "Marcoro14-7B-slerp", + "developer": "yanng1242", + "scores": { + "IFEval": 0.406, + "BBH": 0.5252, + "MATH Level 5": 0.0748, + "GPQA": 0.3146, + "MUSR": 0.4686, + "MMLU-PRO": 0.3168 + } + }, + { + "model_id": "yasserrmd/Coder-GRPO-3B", + "name": "Coder-GRPO-3B", + "developer": "yasserrmd", + "scores": { + "IFEval": 0.6208, + "BBH": 0.4469, + "MATH Level 5": 0.3202, + "GPQA": 0.2777, + "MUSR": 0.4115, + "MMLU-PRO": 0.3197 + } + }, + { + "model_id": "yasserrmd/Text2SQL-1.5B", + "name": "Text2SQL-1.5B", + "developer": "yasserrmd", + "scores": { + "IFEval": 0.2857, + "BBH": 0.3858, + "MATH Level 5": 0.068, + "GPQA": 0.2878, + "MUSR": 0.3942, + "MMLU-PRO": 0.2363 + } + }, + { + "model_id": "ycros/BagelMIsteryTour-v2-8x7B", + "name": "BagelMIsteryTour-v2-8x7B", + "developer": "ycros", + "scores": { + "IFEval": 0.6262, + "BBH": 0.5142, + "MATH Level 5": 0.0937, + "GPQA": 0.3079, + "MUSR": 0.4138, + "MMLU-PRO": 0.3481 + } + }, + { + "model_id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table", + "developer": "yfzp", + "scores": { + "IFEval": 0.6709, + "BBH": 0.4987, + "MATH Level 5": 0.1118, + "GPQA": 0.2592, + "MUSR": 0.3727, + "MMLU-PRO": 0.3716 + } + }, + { + "model_id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table", + "developer": "yfzp", + "scores": { + "IFEval": 0.7333, + "BBH": 0.508, + "MATH Level 5": 0.1035, + "GPQA": 0.2601, + "MUSR": 0.3806, + "MMLU-PRO": 0.3748 + } + }, + { + "model_id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table", + "developer": "yfzp", + "scores": { + "IFEval": 0.6785, + "BBH": 0.4941, + "MATH Level 5": 0.1125, + "GPQA": 0.2592, + "MUSR": 0.3647, + "MMLU-PRO": 0.3718 + } + }, + { + "model_id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table", + "developer": "yfzp", + "scores": { + "IFEval": 0.7132, + "BBH": 0.5025, + "MATH Level 5": 0.0989, + "GPQA": 0.2592, + "MUSR": 0.3713, + "MMLU-PRO": 0.3683 + } + }, + { + "model_id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001", + "developer": "yfzp", + "scores": { + "IFEval": 0.6496, + "BBH": 0.4979, + "MATH Level 5": 0.1012, + "GPQA": 0.2592, + "MUSR": 0.378, + "MMLU-PRO": 0.372 + } + }, + { + "model_id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002", + "developer": "yfzp", + "scores": { + "IFEval": 0.7196, + "BBH": 0.5045, + "MATH Level 5": 0.0876, + "GPQA": 0.2601, + "MUSR": 0.3831, + "MMLU-PRO": 0.3734 + } + }, + { + "model_id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001", + "developer": "yfzp", + "scores": { + "IFEval": 0.6504, + "BBH": 0.4958, + "MATH Level 5": 0.0937, + "GPQA": 0.2592, + "MUSR": 0.366, + "MMLU-PRO": 0.3703 + } + }, + { + "model_id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002", + "developer": "yfzp", + "scores": { + "IFEval": 0.7016, + "BBH": 0.4992, + "MATH Level 5": 0.0869, + "GPQA": 0.2592, + "MUSR": 0.3779, + "MMLU-PRO": 0.3669 + } + }, + { + "model_id": "yifAI/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002", + "developer": "yifAI", + "scores": { + "IFEval": 0.649, + "BBH": 0.4915, + "MATH Level 5": 0.0755, + "GPQA": 0.2617, + "MUSR": 0.3899, + "MMLU-PRO": 0.352 + } + }, + { + "model_id": "ylalain/ECE-PRYMMAL-YL-1B-SLERP-V8", + "name": "ECE-PRYMMAL-YL-1B-SLERP-V8", + "developer": "ylalain", + "scores": { + "IFEval": 0.1505, + "BBH": 0.3976, + "MATH Level 5": 0.0045, + "GPQA": 0.2894, + "MUSR": 0.3875, + "MMLU-PRO": 0.2384 + } + }, + { + "model_id": "ymcki/Llama-3.1-8B-GRPO-Instruct", + "name": "Llama-3.1-8B-GRPO-Instruct", + "developer": "ymcki", + "scores": { + "IFEval": 0.7445, + "BBH": 0.5132, + "MATH Level 5": 0.2024, + "GPQA": 0.2945, + "MUSR": 0.3817, + "MMLU-PRO": 0.3738 + } + }, + { + "model_id": "ymcki/Llama-3.1-8B-SFT-GRPO-Instruct", + "name": "Llama-3.1-8B-SFT-GRPO-Instruct", + "developer": "ymcki", + "scores": { + "IFEval": 0.3354, + "BBH": 0.3126, + "MATH Level 5": 0.04, + "GPQA": 0.2534, + "MUSR": 0.3526, + "MMLU-PRO": 0.1098 + } + }, + { + "model_id": "ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18", + "name": "gemma-2-2b-ORPO-jpn-it-abliterated-18", + "developer": "ymcki", + "scores": { + "IFEval": 0.4631, + "BBH": 0.4053, + "MATH Level 5": 0.0431, + "GPQA": 0.2886, + "MUSR": 0.3754, + "MMLU-PRO": 0.2345 + } + }, + { + "model_id": "ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18-merge", + "name": "gemma-2-2b-ORPO-jpn-it-abliterated-18-merge", + "developer": "ymcki", + "scores": { + "IFEval": 0.5218, + "BBH": 0.4147, + "MATH Level 5": 0.0544, + "GPQA": 0.2836, + "MUSR": 0.3514, + "MMLU-PRO": 0.2461 + } + }, + { + "model_id": "ymcki/gemma-2-2b-jpn-it-abliterated-17", + "name": "gemma-2-2b-jpn-it-abliterated-17", + "developer": "ymcki", + "scores": { + "IFEval": 0.5082, + "BBH": 0.4076, + "MATH Level 5": 0.0385, + "GPQA": 0.2718, + "MUSR": 0.3701, + "MMLU-PRO": 0.2455 + } + }, + { + "model_id": "ymcki/gemma-2-2b-jpn-it-abliterated-17-18-24", + "name": "gemma-2-2b-jpn-it-abliterated-17-18-24", + "developer": "ymcki", + "scores": { + "IFEval": 0.5055, + "BBH": 0.3812, + "MATH Level 5": 0.0257, + "GPQA": 0.281, + "MUSR": 0.3502, + "MMLU-PRO": 0.2282 + } + }, + { + "model_id": "ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO", + "name": "gemma-2-2b-jpn-it-abliterated-17-ORPO", + "developer": "ymcki", + "scores": { + "IFEval": 0.4748, + "BBH": 0.3898, + "MATH Level 5": 0.0619, + "GPQA": 0.2743, + "MUSR": 0.3768, + "MMLU-PRO": 0.2191 + } + }, + { + "model_id": "ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca", + "name": "gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca", + "developer": "ymcki", + "scores": { + "IFEval": 0.3065, + "BBH": 0.4072, + "MATH Level 5": 0.0325, + "GPQA": 0.2693, + "MUSR": 0.3969, + "MMLU-PRO": 0.2249 + } + }, + { + "model_id": "ymcki/gemma-2-2b-jpn-it-abliterated-18", + "name": "gemma-2-2b-jpn-it-abliterated-18", + "developer": "ymcki", + "scores": { + "IFEval": 0.5175, + "BBH": 0.4132, + "MATH Level 5": 0.0446, + "GPQA": 0.2735, + "MUSR": 0.3742, + "MMLU-PRO": 0.2505 + } + }, + { + "model_id": "ymcki/gemma-2-2b-jpn-it-abliterated-18-ORPO", + "name": "gemma-2-2b-jpn-it-abliterated-18-ORPO", + "developer": "ymcki", + "scores": { + "IFEval": 0.4742, + "BBH": 0.4039, + "MATH Level 5": 0.0468, + "GPQA": 0.2617, + "MUSR": 0.3953, + "MMLU-PRO": 0.2185 + } + }, + { + "model_id": "ymcki/gemma-2-2b-jpn-it-abliterated-24", + "name": "gemma-2-2b-jpn-it-abliterated-24", + "developer": "ymcki", + "scores": { + "IFEval": 0.4979, + "BBH": 0.411, + "MATH Level 5": 0.0438, + "GPQA": 0.2777, + "MUSR": 0.3915, + "MMLU-PRO": 0.2473 + } + }, + { + "model_id": "yuchenxie/ArlowGPT-3B-Multilingual", + "name": "ArlowGPT-3B-Multilingual", + "developer": "yuchenxie", + "scores": { + "IFEval": 0.6395, + "BBH": 0.4301, + "MATH Level 5": 0.1125, + "GPQA": 0.2802, + "MUSR": 0.3727, + "MMLU-PRO": 0.2817 + } + }, + { + "model_id": "yuchenxie/ArlowGPT-8B", + "name": "ArlowGPT-8B", + "developer": "yuchenxie", + "scores": { + "IFEval": 0.7847, + "BBH": 0.508, + "MATH Level 5": 0.2039, + "GPQA": 0.2936, + "MUSR": 0.3882, + "MMLU-PRO": 0.3787 + } + }, + { + "model_id": "yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO", + "name": "Llama3-8B-SuperNova-Spectrum-Hermes-DPO", + "developer": "yuvraj17", + "scores": { + "IFEval": 0.4691, + "BBH": 0.44, + "MATH Level 5": 0.0566, + "GPQA": 0.302, + "MUSR": 0.4012, + "MMLU-PRO": 0.2635 + } + }, + { + "model_id": "yuvraj17/Llama3-8B-SuperNova-Spectrum-dare_ties", + "name": "Llama3-8B-SuperNova-Spectrum-dare_ties", + "developer": "yuvraj17", + "scores": { + "IFEval": 0.4013, + "BBH": 0.4616, + "MATH Level 5": 0.0846, + "GPQA": 0.2752, + "MUSR": 0.4211, + "MMLU-PRO": 0.3574 + } + }, + { + "model_id": "yuvraj17/Llama3-8B-abliterated-Spectrum-slerp", + "name": "Llama3-8B-abliterated-Spectrum-slerp", + "developer": "yuvraj17", + "scores": { + "IFEval": 0.2885, + "BBH": 0.4978, + "MATH Level 5": 0.0604, + "GPQA": 0.3012, + "MUSR": 0.3998, + "MMLU-PRO": 0.3257 + } + }, + { + "model_id": "zake7749/gemma-2-2b-it-chinese-kyara-dpo", + "name": "gemma-2-2b-it-chinese-kyara-dpo", + "developer": "zake7749", + "scores": { + "IFEval": 0.5382, + "BBH": 0.4257, + "MATH Level 5": 0.0838, + "GPQA": 0.2668, + "MUSR": 0.4576, + "MMLU-PRO": 0.2573 + } + }, + { + "model_id": "zake7749/gemma-2-9b-it-chinese-kyara", + "name": "gemma-2-9b-it-chinese-kyara", + "developer": "zake7749", + "scores": { + "IFEval": 0.1764, + "BBH": 0.5954, + "MATH Level 5": 0.105, + "GPQA": 0.3381, + "MUSR": 0.4242, + "MMLU-PRO": 0.4179 + } + }, + { + "model_id": "zelk12/Gemma-2-TM-9B", + "name": "Gemma-2-TM-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.8045, + "BBH": 0.5987, + "MATH Level 5": 0.2024, + "GPQA": 0.3465, + "MUSR": 0.4152, + "MMLU-PRO": 0.4088 + } + }, + { + "model_id": "zelk12/MT-Gen1-gemma-2-9B", + "name": "MT-Gen1-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7886, + "BBH": 0.61, + "MATH Level 5": 0.2221, + "GPQA": 0.3465, + "MUSR": 0.4217, + "MMLU-PRO": 0.4381 + } + }, + { + "model_id": "zelk12/MT-Gen2-GI-gemma-2-9B", + "name": "MT-Gen2-GI-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7914, + "BBH": 0.6096, + "MATH Level 5": 0.2205, + "GPQA": 0.3507, + "MUSR": 0.4283, + "MMLU-PRO": 0.4356 + } + }, + { + "model_id": "zelk12/MT-Gen2-gemma-2-9B", + "name": "MT-Gen2-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7907, + "BBH": 0.61, + "MATH Level 5": 0.219, + "GPQA": 0.3465, + "MUSR": 0.4323, + "MMLU-PRO": 0.4387 + } + }, + { + "model_id": "zelk12/MT-Gen3-gemma-2-9B", + "name": "MT-Gen3-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.802, + "BBH": 0.6097, + "MATH Level 5": 0.2296, + "GPQA": 0.349, + "MUSR": 0.4217, + "MMLU-PRO": 0.4356 + } + }, + { + "model_id": "zelk12/MT-Gen4-gemma-2-9B", + "name": "MT-Gen4-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7883, + "BBH": 0.611, + "MATH Level 5": 0.2236, + "GPQA": 0.3549, + "MUSR": 0.4228, + "MMLU-PRO": 0.4387 + } + }, + { + "model_id": "zelk12/MT-Gen5-gemma-2-9B", + "name": "MT-Gen5-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7923, + "BBH": 0.6133, + "MATH Level 5": 0.2153, + "GPQA": 0.3515, + "MUSR": 0.4202, + "MMLU-PRO": 0.4402 + } + }, + { + "model_id": "zelk12/MT-Gen6-gemma-2-9B", + "name": "MT-Gen6-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.1616, + "BBH": 0.5845, + "MATH Level 5": 0.0823, + "GPQA": 0.3331, + "MUSR": 0.4069, + "MMLU-PRO": 0.4166 + } + }, + { + "model_id": "zelk12/MT-Gen6fix-gemma-2-9B", + "name": "MT-Gen6fix-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.1576, + "BBH": 0.5917, + "MATH Level 5": 0.0816, + "GPQA": 0.3372, + "MUSR": 0.4084, + "MMLU-PRO": 0.412 + } + }, + { + "model_id": "zelk12/MT-Gen7-gemma-2-9B", + "name": "MT-Gen7-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.1664, + "BBH": 0.5935, + "MATH Level 5": 0.0891, + "GPQA": 0.3356, + "MUSR": 0.4098, + "MMLU-PRO": 0.4122 + } + }, + { + "model_id": "zelk12/MT-Max-Merge_02012025163610-gemma-2-9B", + "name": "MT-Max-Merge_02012025163610-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7907, + "BBH": 0.6142, + "MATH Level 5": 0.2213, + "GPQA": 0.3515, + "MUSR": 0.4228, + "MMLU-PRO": 0.4396 + } + }, + { + "model_id": "zelk12/MT-Merge-gemma-2-9B", + "name": "MT-Merge-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.8035, + "BBH": 0.6118, + "MATH Level 5": 0.2205, + "GPQA": 0.3482, + "MUSR": 0.4256, + "MMLU-PRO": 0.4362 + } + }, + { + "model_id": "zelk12/MT-Merge1-gemma-2-9B", + "name": "MT-Merge1-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7901, + "BBH": 0.61, + "MATH Level 5": 0.2289, + "GPQA": 0.3515, + "MUSR": 0.4244, + "MMLU-PRO": 0.4374 + } + }, + { + "model_id": "zelk12/MT-Merge2-MU-gemma-2-MTg2MT1g2-9B", + "name": "MT-Merge2-MU-gemma-2-MTg2MT1g2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7956, + "BBH": 0.6084, + "MATH Level 5": 0.2183, + "GPQA": 0.3507, + "MUSR": 0.4322, + "MMLU-PRO": 0.4373 + } + }, + { + "model_id": "zelk12/MT-Merge2-gemma-2-9B", + "name": "MT-Merge2-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7877, + "BBH": 0.6107, + "MATH Level 5": 0.2349, + "GPQA": 0.3507, + "MUSR": 0.4217, + "MMLU-PRO": 0.4382 + } + }, + { + "model_id": "zelk12/MT-Merge3-gemma-2-9B", + "name": "MT-Merge3-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7859, + "BBH": 0.6102, + "MATH Level 5": 0.2205, + "GPQA": 0.349, + "MUSR": 0.4258, + "MMLU-PRO": 0.4373 + } + }, + { + "model_id": "zelk12/MT-Merge4-gemma-2-9B", + "name": "MT-Merge4-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7807, + "BBH": 0.6118, + "MATH Level 5": 0.2168, + "GPQA": 0.3523, + "MUSR": 0.4294, + "MMLU-PRO": 0.439 + } + }, + { + "model_id": "zelk12/MT-Merge5-gemma-2-9B", + "name": "MT-Merge5-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7844, + "BBH": 0.6123, + "MATH Level 5": 0.2183, + "GPQA": 0.3532, + "MUSR": 0.4281, + "MMLU-PRO": 0.4387 + } + }, + { + "model_id": "zelk12/MT-Merge6-gemma-2-9B", + "name": "MT-Merge6-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.1695, + "BBH": 0.5949, + "MATH Level 5": 0.0801, + "GPQA": 0.3289, + "MUSR": 0.4098, + "MMLU-PRO": 0.4115 + } + }, + { + "model_id": "zelk12/MT-gemma-2-9B", + "name": "MT-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7968, + "BBH": 0.6064, + "MATH Level 5": 0.2054, + "GPQA": 0.3456, + "MUSR": 0.4071, + "MMLU-PRO": 0.4224 + } + }, + { + "model_id": "zelk12/MT1-Gen1-gemma-2-9B", + "name": "MT1-Gen1-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7974, + "BBH": 0.6118, + "MATH Level 5": 0.2243, + "GPQA": 0.344, + "MUSR": 0.431, + "MMLU-PRO": 0.4376 + } + }, + { + "model_id": "zelk12/MT1-Gen2-gemma-2-9B", + "name": "MT1-Gen2-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7984, + "BBH": 0.6096, + "MATH Level 5": 0.2251, + "GPQA": 0.3523, + "MUSR": 0.4284, + "MMLU-PRO": 0.4355 + } + }, + { + "model_id": "zelk12/MT1-Gen3-gemma-2-9B", + "name": "MT1-Gen3-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.796, + "BBH": 0.6102, + "MATH Level 5": 0.2243, + "GPQA": 0.349, + "MUSR": 0.4243, + "MMLU-PRO": 0.4349 + } + }, + { + "model_id": "zelk12/MT1-Gen4-gemma-2-9B", + "name": "MT1-Gen4-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7941, + "BBH": 0.6058, + "MATH Level 5": 0.216, + "GPQA": 0.3473, + "MUSR": 0.4231, + "MMLU-PRO": 0.4286 + } + }, + { + "model_id": "zelk12/MT1-Gen5-IF-gemma-2-S2DMv1-9B", + "name": "MT1-Gen5-IF-gemma-2-S2DMv1-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7929, + "BBH": 0.6, + "MATH Level 5": 0.2032, + "GPQA": 0.344, + "MUSR": 0.4245, + "MMLU-PRO": 0.4218 + } + }, + { + "model_id": "zelk12/MT1-Gen5-gemma-2-9B", + "name": "MT1-Gen5-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7795, + "BBH": 0.6017, + "MATH Level 5": 0.2077, + "GPQA": 0.3465, + "MUSR": 0.4191, + "MMLU-PRO": 0.4222 + } + }, + { + "model_id": "zelk12/MT1-Gen6-gemma-2-9B", + "name": "MT1-Gen6-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.1634, + "BBH": 0.5944, + "MATH Level 5": 0.0808, + "GPQA": 0.328, + "MUSR": 0.4044, + "MMLU-PRO": 0.4133 + } + }, + { + "model_id": "zelk12/MT1-Gen7-gemma-2-9B", + "name": "MT1-Gen7-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.1634, + "BBH": 0.5938, + "MATH Level 5": 0.0831, + "GPQA": 0.328, + "MUSR": 0.4111, + "MMLU-PRO": 0.4145 + } + }, + { + "model_id": "zelk12/MT1-Max-Merge_02012025163610-gemma-2-9B", + "name": "MT1-Max-Merge_02012025163610-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7929, + "BBH": 0.6123, + "MATH Level 5": 0.2228, + "GPQA": 0.3549, + "MUSR": 0.4255, + "MMLU-PRO": 0.4382 + } + }, + { + "model_id": "zelk12/MT1-gemma-2-9B", + "name": "MT1-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7947, + "BBH": 0.6109, + "MATH Level 5": 0.2236, + "GPQA": 0.3456, + "MUSR": 0.4322, + "MMLU-PRO": 0.4358 + } + }, + { + "model_id": "zelk12/MT2-Gen1-gemma-2-9B", + "name": "MT2-Gen1-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7856, + "BBH": 0.6101, + "MATH Level 5": 0.2213, + "GPQA": 0.3431, + "MUSR": 0.4243, + "MMLU-PRO": 0.4377 + } + }, + { + "model_id": "zelk12/MT2-Gen2-gemma-2-9B", + "name": "MT2-Gen2-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7889, + "BBH": 0.6093, + "MATH Level 5": 0.2183, + "GPQA": 0.3465, + "MUSR": 0.427, + "MMLU-PRO": 0.4388 + } + }, + { + "model_id": "zelk12/MT2-Gen3-gemma-2-9B", + "name": "MT2-Gen3-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.781, + "BBH": 0.6105, + "MATH Level 5": 0.2107, + "GPQA": 0.3465, + "MUSR": 0.4231, + "MMLU-PRO": 0.4374 + } + }, + { + "model_id": "zelk12/MT2-Gen4-gemma-2-9B", + "name": "MT2-Gen4-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7896, + "BBH": 0.6097, + "MATH Level 5": 0.2236, + "GPQA": 0.3456, + "MUSR": 0.4125, + "MMLU-PRO": 0.4321 + } + }, + { + "model_id": "zelk12/MT2-Gen5-gemma-2-9B", + "name": "MT2-Gen5-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7749, + "BBH": 0.6064, + "MATH Level 5": 0.2107, + "GPQA": 0.3515, + "MUSR": 0.4244, + "MMLU-PRO": 0.4302 + } + }, + { + "model_id": "zelk12/MT2-Gen6-gemma-2-9B", + "name": "MT2-Gen6-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.1664, + "BBH": 0.596, + "MATH Level 5": 0.0846, + "GPQA": 0.3381, + "MUSR": 0.4137, + "MMLU-PRO": 0.421 + } + }, + { + "model_id": "zelk12/MT2-Gen7-gemma-2-9B", + "name": "MT2-Gen7-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.1762, + "BBH": 0.6079, + "MATH Level 5": 0.102, + "GPQA": 0.3549, + "MUSR": 0.4203, + "MMLU-PRO": 0.4311 + } + }, + { + "model_id": "zelk12/MT2-Max-Merge_02012025163610-gemma-2-9B", + "name": "MT2-Max-Merge_02012025163610-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7901, + "BBH": 0.6108, + "MATH Level 5": 0.2243, + "GPQA": 0.3515, + "MUSR": 0.4228, + "MMLU-PRO": 0.4391 + } + }, + { + "model_id": "zelk12/MT2-gemma-2-9B", + "name": "MT2-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7886, + "BBH": 0.6115, + "MATH Level 5": 0.2213, + "GPQA": 0.3473, + "MUSR": 0.4217, + "MMLU-PRO": 0.4368 + } + }, + { + "model_id": "zelk12/MT3-Gen1-gemma-2-9B", + "name": "MT3-Gen1-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7838, + "BBH": 0.6107, + "MATH Level 5": 0.2145, + "GPQA": 0.3465, + "MUSR": 0.4151, + "MMLU-PRO": 0.4327 + } + }, + { + "model_id": "zelk12/MT3-Gen2-gemma-2-9B", + "name": "MT3-Gen2-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7843, + "BBH": 0.6091, + "MATH Level 5": 0.2236, + "GPQA": 0.3574, + "MUSR": 0.4111, + "MMLU-PRO": 0.4333 + } + }, + { + "model_id": "zelk12/MT3-Gen3-gemma-2-9B", + "name": "MT3-Gen3-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7856, + "BBH": 0.6089, + "MATH Level 5": 0.2153, + "GPQA": 0.3515, + "MUSR": 0.4258, + "MMLU-PRO": 0.4303 + } + }, + { + "model_id": "zelk12/MT3-Gen4-gemma-2-9B", + "name": "MT3-Gen4-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7737, + "BBH": 0.6101, + "MATH Level 5": 0.2062, + "GPQA": 0.3473, + "MUSR": 0.4476, + "MMLU-PRO": 0.4387 + } + }, + { + "model_id": "zelk12/MT3-Gen5-gemma-2-9B", + "name": "MT3-Gen5-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.799, + "BBH": 0.6099, + "MATH Level 5": 0.2266, + "GPQA": 0.3532, + "MUSR": 0.4191, + "MMLU-PRO": 0.4317 + } + }, + { + "model_id": "zelk12/MT3-Gen5-gemma-2-9B_v1", + "name": "MT3-Gen5-gemma-2-9B_v1", + "developer": "zelk12", + "scores": { + "IFEval": 0.7996, + "BBH": 0.6113, + "MATH Level 5": 0.2228, + "GPQA": 0.349, + "MUSR": 0.4204, + "MMLU-PRO": 0.4359 + } + }, + { + "model_id": "zelk12/MT3-Gen6-gemma-2-9B", + "name": "MT3-Gen6-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.1762, + "BBH": 0.602, + "MATH Level 5": 0.0884, + "GPQA": 0.3431, + "MUSR": 0.4126, + "MMLU-PRO": 0.4102 + } + }, + { + "model_id": "zelk12/MT3-Max-Merge_02012025163610-gemma-2-9B", + "name": "MT3-Max-Merge_02012025163610-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.1762, + "BBH": 0.6123, + "MATH Level 5": 0.1012, + "GPQA": 0.3507, + "MUSR": 0.4255, + "MMLU-PRO": 0.4389 + } + }, + { + "model_id": "zelk12/MT3-gemma-2-9B", + "name": "MT3-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7786, + "BBH": 0.6131, + "MATH Level 5": 0.2168, + "GPQA": 0.3448, + "MUSR": 0.4243, + "MMLU-PRO": 0.4327 + } + }, + { + "model_id": "zelk12/MT4-Gen1-gemma-2-9B", + "name": "MT4-Gen1-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7895, + "BBH": 0.6094, + "MATH Level 5": 0.2198, + "GPQA": 0.344, + "MUSR": 0.4322, + "MMLU-PRO": 0.4389 + } + }, + { + "model_id": "zelk12/MT4-Gen2-gemma-2-9B", + "name": "MT4-Gen2-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.8051, + "BBH": 0.6108, + "MATH Level 5": 0.2326, + "GPQA": 0.3456, + "MUSR": 0.4257, + "MMLU-PRO": 0.4368 + } + }, + { + "model_id": "zelk12/MT4-Gen3-gemma-2-9B", + "name": "MT4-Gen3-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7841, + "BBH": 0.6087, + "MATH Level 5": 0.219, + "GPQA": 0.344, + "MUSR": 0.4243, + "MMLU-PRO": 0.4381 + } + }, + { + "model_id": "zelk12/MT4-Gen4-gemma-2-9B", + "name": "MT4-Gen4-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7874, + "BBH": 0.6076, + "MATH Level 5": 0.2145, + "GPQA": 0.3523, + "MUSR": 0.4244, + "MMLU-PRO": 0.4323 + } + }, + { + "model_id": "zelk12/MT4-Gen5-gemma-2-9B", + "name": "MT4-Gen5-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7789, + "BBH": 0.6107, + "MATH Level 5": 0.2266, + "GPQA": 0.3565, + "MUSR": 0.4268, + "MMLU-PRO": 0.4384 + } + }, + { + "model_id": "zelk12/MT4-Max-Merge_02012025163610-gemma-2-9B", + "name": "MT4-Max-Merge_02012025163610-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.1771, + "BBH": 0.612, + "MATH Level 5": 0.0952, + "GPQA": 0.3515, + "MUSR": 0.4228, + "MMLU-PRO": 0.4391 + } + }, + { + "model_id": "zelk12/MT4-gemma-2-9B", + "name": "MT4-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7762, + "BBH": 0.6073, + "MATH Level 5": 0.2085, + "GPQA": 0.3381, + "MUSR": 0.4309, + "MMLU-PRO": 0.4366 + } + }, + { + "model_id": "zelk12/MT5-Gen1-gemma-2-9B", + "name": "MT5-Gen1-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7831, + "BBH": 0.611, + "MATH Level 5": 0.2213, + "GPQA": 0.3473, + "MUSR": 0.4204, + "MMLU-PRO": 0.4368 + } + }, + { + "model_id": "zelk12/MT5-Gen2-gemma-2-9B", + "name": "MT5-Gen2-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7962, + "BBH": 0.6105, + "MATH Level 5": 0.2205, + "GPQA": 0.3515, + "MUSR": 0.4163, + "MMLU-PRO": 0.4379 + } + }, + { + "model_id": "zelk12/MT5-Gen3-gemma-2-9B", + "name": "MT5-Gen3-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7825, + "BBH": 0.609, + "MATH Level 5": 0.2168, + "GPQA": 0.3515, + "MUSR": 0.4231, + "MMLU-PRO": 0.4375 + } + }, + { + "model_id": "zelk12/MT5-Gen4-gemma-2-9B", + "name": "MT5-Gen4-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7835, + "BBH": 0.6131, + "MATH Level 5": 0.2243, + "GPQA": 0.3532, + "MUSR": 0.4228, + "MMLU-PRO": 0.4397 + } + }, + { + "model_id": "zelk12/MT5-Gen5-gemma-2-9B", + "name": "MT5-Gen5-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7947, + "BBH": 0.6112, + "MATH Level 5": 0.2258, + "GPQA": 0.3482, + "MUSR": 0.4191, + "MMLU-PRO": 0.4329 + } + }, + { + "model_id": "zelk12/MT5-Max-Merge_02012025163610-gemma-2-9B", + "name": "MT5-Max-Merge_02012025163610-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.1762, + "BBH": 0.6127, + "MATH Level 5": 0.0982, + "GPQA": 0.3515, + "MUSR": 0.4228, + "MMLU-PRO": 0.439 + } + }, + { + "model_id": "zelk12/MT5-gemma-2-9B", + "name": "MT5-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.8048, + "BBH": 0.6112, + "MATH Level 5": 0.2258, + "GPQA": 0.3431, + "MUSR": 0.4204, + "MMLU-PRO": 0.4367 + } + }, + { + "model_id": "zelk12/MTM-Merge-gemma-2-9B", + "name": "MTM-Merge-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7798, + "BBH": 0.6133, + "MATH Level 5": 0.2175, + "GPQA": 0.3549, + "MUSR": 0.4268, + "MMLU-PRO": 0.4388 + } + }, + { + "model_id": "zelk12/MTMaMe-Merge_02012025163610-gemma-2-9B", + "name": "MTMaMe-Merge_02012025163610-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.1786, + "BBH": 0.6117, + "MATH Level 5": 0.0959, + "GPQA": 0.3523, + "MUSR": 0.4241, + "MMLU-PRO": 0.4382 + } + }, + { + "model_id": "zelk12/Rv0.4DMv1t0.25-gemma-2-9B", + "name": "Rv0.4DMv1t0.25-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7497, + "BBH": 0.607, + "MATH Level 5": 0.2258, + "GPQA": 0.3456, + "MUSR": 0.4309, + "MMLU-PRO": 0.4401 + } + }, + { + "model_id": "zelk12/Rv0.4DMv1t0.25Tt0.25-gemma-2-9B", + "name": "Rv0.4DMv1t0.25Tt0.25-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7646, + "BBH": 0.6098, + "MATH Level 5": 0.2069, + "GPQA": 0.3423, + "MUSR": 0.4283, + "MMLU-PRO": 0.4347 + } + }, + { + "model_id": "zelk12/Rv0.4MT4g2-gemma-2-9B", + "name": "Rv0.4MT4g2-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.732, + "BBH": 0.6041, + "MATH Level 5": 0.1949, + "GPQA": 0.3532, + "MUSR": 0.4231, + "MMLU-PRO": 0.4417 + } + }, + { + "model_id": "zelk12/T31122024203920-gemma-2-9B", + "name": "T31122024203920-gemma-2-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7676, + "BBH": 0.6096, + "MATH Level 5": 0.2054, + "GPQA": 0.3507, + "MUSR": 0.4322, + "MMLU-PRO": 0.4373 + } + }, + { + "model_id": "zelk12/Test01012025155054", + "name": "Test01012025155054", + "developer": "zelk12", + "scores": { + "IFEval": 0.1555, + "BBH": 0.283, + "MATH Level 5": 0.0, + "GPQA": 0.2416, + "MUSR": 0.367, + "MMLU-PRO": 0.109 + } + }, + { + "model_id": "zelk12/Test01012025155054t0.5_gemma-2", + "name": "Test01012025155054t0.5_gemma-2", + "developer": "zelk12", + "scores": { + "IFEval": 0.1555, + "BBH": 0.283, + "MATH Level 5": 0.0, + "GPQA": 0.2416, + "MUSR": 0.367, + "MMLU-PRO": 0.109 + } + }, + { + "model_id": "zelk12/gemma-2-S2MTM-9B", + "name": "gemma-2-S2MTM-9B", + "developer": "zelk12", + "scores": { + "IFEval": 0.7823, + "BBH": 0.6061, + "MATH Level 5": 0.2047, + "GPQA": 0.3456, + "MUSR": 0.4218, + "MMLU-PRO": 0.4297 + } + }, + { + "model_id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1", + "name": "recoilme-gemma-2-Ataraxy-9B-v0.1", + "developer": "zelk12", + "scores": { + "IFEval": 0.7649, + "BBH": 0.6075, + "MATH Level 5": 0.2281, + "GPQA": 0.3498, + "MUSR": 0.4136, + "MMLU-PRO": 0.4321 + } + }, + { + "model_id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25", + "name": "recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25", + "developer": "zelk12", + "scores": { + "IFEval": 0.7707, + "BBH": 0.6075, + "MATH Level 5": 0.2145, + "GPQA": 0.3431, + "MUSR": 0.4323, + "MMLU-PRO": 0.44 + } + }, + { + "model_id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75", + "name": "recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75", + "developer": "zelk12", + "scores": { + "IFEval": 0.7208, + "BBH": 0.5995, + "MATH Level 5": 0.2017, + "GPQA": 0.3498, + "MUSR": 0.3951, + "MMLU-PRO": 0.4141 + } + }, + { + "model_id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.2", + "name": "recoilme-gemma-2-Ataraxy-9B-v0.2", + "developer": "zelk12", + "scores": { + "IFEval": 0.76, + "BBH": 0.6066, + "MATH Level 5": 0.2228, + "GPQA": 0.3482, + "MUSR": 0.411, + "MMLU-PRO": 0.4323 + } + }, + { + "model_id": "zelk12/recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1", + "name": "recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1", + "developer": "zelk12", + "scores": { + "IFEval": 0.7615, + "BBH": 0.6099, + "MATH Level 5": 0.21, + "GPQA": 0.3414, + "MUSR": 0.431, + "MMLU-PRO": 0.4315 + } + }, + { + "model_id": "zelk12/recoilme-gemma-2-Ifable-9B-v0.1", + "name": "recoilme-gemma-2-Ifable-9B-v0.1", + "developer": "zelk12", + "scores": { + "IFEval": 0.7944, + "BBH": 0.6064, + "MATH Level 5": 0.2205, + "GPQA": 0.3515, + "MUSR": 0.4202, + "MMLU-PRO": 0.4323 + } + }, + { + "model_id": "zelk12/recoilme-gemma-2-psy10k-mental_healt-9B-v0.1", + "name": "recoilme-gemma-2-psy10k-mental_healt-9B-v0.1", + "developer": "zelk12", + "scores": { + "IFEval": 0.7445, + "BBH": 0.5978, + "MATH Level 5": 0.1888, + "GPQA": 0.344, + "MUSR": 0.4295, + "MMLU-PRO": 0.4181 + } + }, + { + "model_id": "zetasepic/Qwen2.5-32B-Instruct-abliterated-v2", + "name": "Qwen2.5-32B-Instruct-abliterated-v2", + "developer": "zetasepic", + "scores": { + "IFEval": 0.8334, + "BBH": 0.6934, + "MATH Level 5": 0.5952, + "GPQA": 0.3674, + "MUSR": 0.4354, + "MMLU-PRO": 0.5622 + } + }, + { + "model_id": "zetasepic/Qwen2.5-72B-Instruct-abliterated", + "name": "Qwen2.5-72B-Instruct-abliterated", + "developer": "zetasepic", + "scores": { + "IFEval": 0.7153, + "BBH": 0.7152, + "MATH Level 5": 0.5242, + "GPQA": 0.4069, + "MUSR": 0.4719, + "MMLU-PRO": 0.5872 + } + }, + { + "model_id": "zhengr/MixTAO-7Bx2-MoE-v8.1", + "name": "MixTAO-7Bx2-MoE-v8.1", + "developer": "zhengr", + "scores": { + "IFEval": 0.4188, + "BBH": 0.4202, + "MATH Level 5": 0.0604, + "GPQA": 0.2987, + "MUSR": 0.3976, + "MMLU-PRO": 0.2847 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/livecodebenchpro.json b/data/benchmarks/livecodebenchpro.json new file mode 100644 index 0000000000000000000000000000000000000000..6792c24758cd5b97dfe6d07eea5a498bc4080cbe --- /dev/null +++ b/data/benchmarks/livecodebenchpro.json @@ -0,0 +1,274 @@ +{ + "models": [ + { + "model_id": "alibaba/qwen3-235b-a22b-thinking-2507", + "name": "qwen3-235b-a22b-thinking-2507", + "developer": "Alibaba", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.1267605633802817, + "Easy Problems": 0.7605633802816901 + } + }, + { + "model_id": "alibaba/qwen3-30b-a3b", + "name": "qwen3-30b-a3b", + "developer": "Alibaba", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.028169014084507043, + "Easy Problems": 0.5774647887323944 + } + }, + { + "model_id": "alibaba/qwen3-max", + "name": "alibaba/qwen3-max", + "developer": "Alibaba", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.04225352112676056, + "Easy Problems": 0.36619718309859156 + } + }, + { + "model_id": "alibaba/qwen3-next-80b-a3b-thinking", + "name": "qwen3-next-80b-a3b-thinking", + "developer": "Alibaba", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.14084507042253522, + "Easy Problems": 0.7464788732394366 + } + }, + { + "model_id": "aliyun/qwen3-next-80b-a3b-thinking", + "name": "qwen3-next-80b-a3b-thinking", + "developer": "aliyun", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.0704, + "Easy Problems": 0.6901 + } + }, + { + "model_id": "anthropic/claude-3-7-sonnet-20250219", + "name": "claude-3-7-sonnet-20250219", + "developer": "anthropic", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.0, + "Easy Problems": 0.28169014084507044 + } + }, + { + "model_id": "anthropic/claude-3.7-sonnet", + "name": "anthropic/claude-3.7-sonnet", + "developer": "Anthropic", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.014084507042253521, + "Easy Problems": 0.15492957746478872 + } + }, + { + "model_id": "anthropic/claude-sonnet-4-5-20250929", + "name": "claude-sonnet-4-5-20250929", + "developer": "anthropic", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.0, + "Easy Problems": 0.5352 + } + }, + { + "model_id": "ark/ep-20250603132404-cgpjm", + "name": "ep-20250603132404-cgpjm", + "developer": "ark", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.0141, + "Easy Problems": 0.507 + } + }, + { + "model_id": "bytedance/doubao-seed-1-6-thinking-250615", + "name": "doubao-seed-1-6-thinking-250615", + "developer": "ByteDance", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.07042253521126761, + "Easy Problems": 0.5774647887323944 + } + }, + { + "model_id": "deepseek/chat-v3-0324", + "name": "deepseek/chat-v3-0324", + "developer": "DeepSeek", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.0, + "Easy Problems": 0.19718309859154928 + } + }, + { + "model_id": "deepseek/ep-20250214004308-p7n89", + "name": "ep-20250214004308-p7n89", + "developer": "DeepSeek", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.014084507042253521, + "Easy Problems": 0.4225352112676056 + } + }, + { + "model_id": "deepseek/ep-20250228232227-z44x5", + "name": "ep-20250228232227-z44x5", + "developer": "DeepSeek", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.0, + "Easy Problems": 0.1267605633802817 + } + }, + { + "model_id": "deepseek/ep-20250603132404-cgpjm", + "name": "ep-20250603132404-cgpjm", + "developer": "DeepSeek", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.08450704225352113, + "Easy Problems": 0.5774647887323944 + } + }, + { + "model_id": "google/gemini-2.5-flash", + "name": "gemini-2.5-flash", + "developer": "google", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.028169014084507043, + "Easy Problems": 0.38028169014084506 + } + }, + { + "model_id": "google/gemini-2.5-pro", + "name": "gemini-2.5-pro", + "developer": "google", + "scores": { + "Hard Problems": 0.014084507042253521, + "Medium Problems": 0.2112676056338028, + "Easy Problems": 0.7183098591549296 + } + }, + { + "model_id": "kuaishou/kwaipilot-40b-0604", + "name": "kwaipilot-40b-0604", + "developer": "Kuaishou", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.07042253521126761, + "Easy Problems": 0.056338028169014086 + } + }, + { + "model_id": "meta/llama-4-maverick", + "name": "meta/llama-4-maverick", + "developer": "Meta", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.0, + "Easy Problems": 0.09859154929577464 + } + }, + { + "model_id": "openai/gpt-4.1", + "name": "openai/gpt-4.1", + "developer": "OpenAI", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.0, + "Easy Problems": 0.19718309859154928 + } + }, + { + "model_id": "openai/gpt-4o-2024-11-20", + "name": "GPT-4o 2024-11-20", + "developer": "openai", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.0, + "Easy Problems": 0.07042253521126761 + } + }, + { + "model_id": "openai/gpt-5-2025-08-07", + "name": "gpt-5-2025-08-07", + "developer": "openai", + "scores": { + "Hard Problems": 0.0423, + "Medium Problems": 0.4085, + "Easy Problems": 0.9014 + } + }, + { + "model_id": "openai/gpt-5.2-2025-12-11", + "name": "gpt-5.2-2025-12-11", + "developer": "OpenAI", + "scores": { + "Hard Problems": 0.1594, + "Medium Problems": 0.5211, + "Easy Problems": 0.9014 + } + }, + { + "model_id": "openai/gpt-oss-120b", + "name": "gpt-oss-120b", + "developer": "openai", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.11267605633802817, + "Easy Problems": 0.6619718309859155 + } + }, + { + "model_id": "openai/gpt-oss-20b", + "name": "gpt-oss-20b", + "developer": "openai", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.056338028169014086, + "Easy Problems": 0.5070422535211268 + } + }, + { + "model_id": "openai/o3-2025-04-16", + "name": "o3 2025-04-16", + "developer": "openai", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.22535211267605634, + "Easy Problems": 0.7183098591549296 + } + }, + { + "model_id": "openai/o4-mini-2025-04-16", + "name": "o4-mini-2025-04-16", + "developer": "openai", + "scores": { + "Hard Problems": 0.014084507042253521, + "Medium Problems": 0.30985915492957744, + "Easy Problems": 0.8873239436619719 + } + }, + { + "model_id": "z-ai/glm-4.5", + "name": "z-ai/glm-4.5", + "developer": "Z.AI", + "scores": { + "Hard Problems": 0.0, + "Medium Problems": 0.028169014084507043, + "Easy Problems": 0.1267605633802817 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/reward-bench.json b/data/benchmarks/reward-bench.json new file mode 100644 index 0000000000000000000000000000000000000000..e88892a000e33069b0f27e7d091b5b5b1b11b456 --- /dev/null +++ b/data/benchmarks/reward-bench.json @@ -0,0 +1,4421 @@ +{ + "models": [ + { + "model_id": "0-hero/Matter-0.1-7B-DPO-preview", + "name": "0-hero/Matter-0.1-7B-DPO-preview", + "developer": "0-hero", + "scores": { + "Score": 0.7247, + "Chat": 0.8939, + "Chat Hard": 0.5768, + "Safety": 0.6378, + "Reasoning": 0.8854, + "Prior Sets (0.5 weight)": 0.5348 + } + }, + { + "model_id": "0-hero/Matter-0.1-7B-boost-DPO-preview", + "name": "0-hero/Matter-0.1-7B-boost-DPO-preview", + "developer": "0-hero", + "scores": { + "Score": 0.7448, + "Chat": 0.9106, + "Chat Hard": 0.6096, + "Safety": 0.7135, + "Reasoning": 0.8395, + "Prior Sets (0.5 weight)": 0.5566 + } + }, + { + "model_id": "Ahjeong/MMPO_Gemma_7b", + "name": "Ahjeong/MMPO_Gemma_7b", + "developer": "Ahjeong", + "scores": { + "Score": 0.7587, + "Chat": 0.9693, + "Chat Hard": 0.614, + "Safety": 0.7135, + "Reasoning": 0.7756, + "Prior Sets (0.5 weight)": 0.6831 + } + }, + { + "model_id": "Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3", + "name": "Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3", + "developer": "Ahjeong", + "scores": { + "Score": 0.7652, + "Chat": 0.9721, + "Chat Hard": 0.6338, + "Safety": 0.7635, + "Reasoning": 0.7284, + "Prior Sets (0.5 weight)": 0.6913 + } + }, + { + "model_id": "Anthropic/claude-3-5-sonnet-20240620", + "name": "Anthropic/claude-3-5-sonnet-20240620", + "developer": "Anthropic", + "scores": { + "Score": 0.8417, + "Chat": 0.9637, + "Chat Hard": 0.7401, + "Safety": 0.8162, + "Reasoning": 0.8469 + } + }, + { + "model_id": "Anthropic/claude-3-haiku-20240307", + "name": "Anthropic/claude-3-haiku-20240307", + "developer": "Anthropic", + "scores": { + "Score": 0.7289, + "Chat": 0.9274, + "Chat Hard": 0.5197, + "Safety": 0.7953, + "Reasoning": 0.706, + "Prior Sets (0.5 weight)": 0.6635 + } + }, + { + "model_id": "Anthropic/claude-3-opus-20240229", + "name": "Anthropic/claude-3-opus-20240229", + "developer": "Anthropic", + "scores": { + "Score": 0.8008, + "Chat": 0.9469, + "Chat Hard": 0.6031, + "Safety": 0.8662, + "Reasoning": 0.7868 + } + }, + { + "model_id": "Anthropic/claude-3-sonnet-20240229", + "name": "Anthropic/claude-3-sonnet-20240229", + "developer": "Anthropic", + "scores": { + "Score": 0.7458, + "Chat": 0.9344, + "Chat Hard": 0.5658, + "Safety": 0.8169, + "Reasoning": 0.6907, + "Prior Sets (0.5 weight)": 0.6963 + } + }, + { + "model_id": "AtlaAI/Selene-1", + "name": "AtlaAI/Selene-1", + "developer": "AtlaAI", + "scores": { + "Score": 0.9241, + "Chat": 0.9777, + "Chat Hard": 0.8399, + "Safety": 0.9216, + "Reasoning": 0.9572 + } + }, + { + "model_id": "AtlaAI/Selene-1-Mini-Llama-3.1-8B", + "name": "AtlaAI/Selene-1-Mini-Llama-3.1-8B", + "developer": "AtlaAI", + "scores": { + "Score": 0.8913, + "Chat": 0.9358, + "Chat Hard": 0.7939, + "Safety": 0.8926, + "Reasoning": 0.9429 + } + }, + { + "model_id": "CIR-AMS/BTRM_Qwen2_7b_0613", + "name": "CIR-AMS/BTRM_Qwen2_7b_0613", + "developer": "CIR-AMS", + "scores": { + "Score": 0.5736, + "Factuality": 0.5347, + "Precise IF": 0.3563, + "Math": 0.6066, + "Safety": 0.7178, + "Focus": 0.5737, + "Ties": 0.6527 + } + }, + { + "model_id": "Cohere March 2024", + "name": "Cohere March 2024", + "developer": "unknown", + "scores": { + "Score": 0.8511, + "Chat": 0.9469, + "Chat Hard": 0.6513, + "Safety": 0.877, + "Reasoning": 0.9817, + "Prior Sets (0.5 weight)": 0.7458 + } + }, + { + "model_id": "Cohere May 2024", + "name": "Cohere May 2024", + "developer": "unknown", + "scores": { + "Score": 0.8816, + "Chat": 0.9637, + "Chat Hard": 0.7127, + "Safety": 0.923, + "Reasoning": 0.9768, + "Prior Sets (0.5 weight)": 0.782 + } + }, + { + "model_id": "CohereForAI/c4ai-command-r-plus", + "name": "c4ai-command-r-plus", + "developer": "CohereForAI", + "scores": { + "Score": 0.7057, + "Chat": 0.9511, + "Chat Hard": 0.5757, + "Safety": 0.5986, + "Reasoning": 0.704, + "Prior Sets (0.5 weight)": 0.6924 + } + }, + { + "model_id": "ContextualAI/LMUnit-llama3.1-70b", + "name": "ContextualAI/LMUnit-llama3.1-70b", + "developer": "ContextualAI", + "scores": { + "Score": 0.8054, + "Factuality": 0.8463, + "Precise IF": 0.4875, + "Math": 0.7158, + "Safety": 0.9067, + "Focus": 0.9697, + "Ties": 0.9063 + } + }, + { + "model_id": "ContextualAI/LMUnit-qwen2.5-72b", + "name": "ContextualAI/LMUnit-qwen2.5-72b", + "developer": "ContextualAI", + "scores": { + "Score": 0.8208, + "Factuality": 0.8716, + "Precise IF": 0.5437, + "Math": 0.7268, + "Safety": 0.9133, + "Focus": 0.9677, + "Ties": 0.9014 + } + }, + { + "model_id": "ContextualAI/archangel_sft-dpo_llama13b", + "name": "ContextualAI/archangel_sft-dpo_llama13b", + "developer": "ContextualAI", + "scores": { + "Score": 0.54, + "Chat": 0.7123, + "Chat Hard": 0.4298, + "Safety": 0.5649, + "Reasoning": 0.4401, + "Prior Sets (0.5 weight)": 0.5656 + } + }, + { + "model_id": "ContextualAI/archangel_sft-dpo_llama30b", + "name": "ContextualAI/archangel_sft-dpo_llama30b", + "developer": "ContextualAI", + "scores": { + "Score": 0.5618, + "Chat": 0.6927, + "Chat Hard": 0.4474, + "Safety": 0.6284, + "Reasoning": 0.4745, + "Prior Sets (0.5 weight)": 0.5705 + } + }, + { + "model_id": "ContextualAI/archangel_sft-dpo_llama7b", + "name": "ContextualAI/archangel_sft-dpo_llama7b", + "developer": "ContextualAI", + "scores": { + "Score": 0.5304, + "Chat": 0.5782, + "Chat Hard": 0.4452, + "Safety": 0.5203, + "Reasoning": 0.5658, + "Prior Sets (0.5 weight)": 0.5544 + } + }, + { + "model_id": "ContextualAI/archangel_sft-dpo_pythia1-4b", + "name": "ContextualAI/archangel_sft-dpo_pythia1-4b", + "developer": "ContextualAI", + "scores": { + "Score": 0.5233, + "Chat": 0.6397, + "Chat Hard": 0.3728, + "Safety": 0.5041, + "Reasoning": 0.5672, + "Prior Sets (0.5 weight)": 0.5427 + } + }, + { + "model_id": "ContextualAI/archangel_sft-dpo_pythia12-0b", + "name": "ContextualAI/archangel_sft-dpo_pythia12-0b", + "developer": "ContextualAI", + "scores": { + "Score": 0.5009, + "Chat": 0.6676, + "Chat Hard": 0.364, + "Safety": 0.5432, + "Reasoning": 0.4139, + "Prior Sets (0.5 weight)": 0.5303 + } + }, + { + "model_id": "ContextualAI/archangel_sft-dpo_pythia2-8b", + "name": "ContextualAI/archangel_sft-dpo_pythia2-8b", + "developer": "ContextualAI", + "scores": { + "Score": 0.5286, + "Chat": 0.8073, + "Chat Hard": 0.3355, + "Safety": 0.4473, + "Reasoning": 0.5135, + "Prior Sets (0.5 weight)": 0.5501 + } + }, + { + "model_id": "ContextualAI/archangel_sft-dpo_pythia6-9b", + "name": "ContextualAI/archangel_sft-dpo_pythia6-9b", + "developer": "ContextualAI", + "scores": { + "Score": 0.5263, + "Chat": 0.7486, + "Chat Hard": 0.3421, + "Safety": 0.5176, + "Reasoning": 0.4847, + "Prior Sets (0.5 weight)": 0.551 + } + }, + { + "model_id": "ContextualAI/archangel_sft-kto_llama13b", + "name": "ContextualAI/archangel_sft-kto_llama13b", + "developer": "ContextualAI", + "scores": { + "Score": 0.5952, + "Chat": 0.8408, + "Chat Hard": 0.3772, + "Safety": 0.4649, + "Reasoning": 0.7077, + "Prior Sets (0.5 weight)": 0.576 + } + }, + { + "model_id": "ContextualAI/archangel_sft-kto_llama30b", + "name": "ContextualAI/archangel_sft-kto_llama30b", + "developer": "ContextualAI", + "scores": { + "Score": 0.5901, + "Chat": 0.8436, + "Chat Hard": 0.4057, + "Safety": 0.6054, + "Reasoning": 0.5075, + "Prior Sets (0.5 weight)": 0.5862 + } + }, + { + "model_id": "ContextualAI/archangel_sft-kto_llama7b", + "name": "ContextualAI/archangel_sft-kto_llama7b", + "developer": "ContextualAI", + "scores": { + "Score": 0.5388, + "Chat": 0.5587, + "Chat Hard": 0.4364, + "Safety": 0.4568, + "Reasoning": 0.6941, + "Prior Sets (0.5 weight)": 0.5575 + } + }, + { + "model_id": "ContextualAI/archangel_sft-kto_pythia1-4b", + "name": "ContextualAI/archangel_sft-kto_pythia1-4b", + "developer": "ContextualAI", + "scores": { + "Score": 0.5581, + "Chat": 0.6844, + "Chat Hard": 0.3794, + "Safety": 0.5257, + "Reasoning": 0.6447, + "Prior Sets (0.5 weight)": 0.5546 + } + }, + { + "model_id": "ContextualAI/archangel_sft-kto_pythia12-0b", + "name": "ContextualAI/archangel_sft-kto_pythia12-0b", + "developer": "ContextualAI", + "scores": { + "Score": 0.5053, + "Chat": 0.7486, + "Chat Hard": 0.3618, + "Safety": 0.4757, + "Reasoning": 0.4127, + "Prior Sets (0.5 weight)": 0.55 + } + }, + { + "model_id": "ContextualAI/archangel_sft-kto_pythia2-8b", + "name": "ContextualAI/archangel_sft-kto_pythia2-8b", + "developer": "ContextualAI", + "scores": { + "Score": 0.5497, + "Chat": 0.757, + "Chat Hard": 0.3421, + "Safety": 0.4743, + "Reasoning": 0.6216, + "Prior Sets (0.5 weight)": 0.557 + } + }, + { + "model_id": "ContextualAI/archangel_sft-kto_pythia6-9b", + "name": "ContextualAI/archangel_sft-kto_pythia6-9b", + "developer": "ContextualAI", + "scores": { + "Score": 0.5561, + "Chat": 0.7765, + "Chat Hard": 0.3618, + "Safety": 0.5365, + "Reasoning": 0.5415, + "Prior Sets (0.5 weight)": 0.5723 + } + }, + { + "model_id": "Databricks-Mosaic-Research/PGRM", + "name": "Databricks-Mosaic-Research/PGRM", + "developer": "Databricks-Mosaic-Research", + "scores": { + "Score": 0.8002, + "Factuality": 0.7937, + "Precise IF": 0.5062, + "Math": 0.7404, + "Safety": 0.9289, + "Focus": 0.9424, + "Ties": 0.8893 + } + }, + { + "model_id": "HFXM/RAMO-Llama3.1-8B", + "name": "HFXM/RAMO-Llama3.1-8B", + "developer": "HFXM", + "scores": { + "Score": 0.6917, + "Factuality": 0.6547, + "Precise IF": 0.375, + "Math": 0.5628, + "Safety": 0.9756, + "Focus": 0.9071, + "Ties": 0.6752 + } + }, + { + "model_id": "HuggingFaceH4/starchat2-15b-v0.1", + "name": "HuggingFaceH4/starchat2-15b-v0.1", + "developer": "HuggingFaceH4", + "scores": { + "Score": 0.7322, + "Chat": 0.9385, + "Chat Hard": 0.5548, + "Safety": 0.7095, + "Reasoning": 0.8159, + "Prior Sets (0.5 weight)": 0.5525 + } + }, + { + "model_id": "HuggingFaceH4/zephyr-7b-alpha", + "name": "zephyr-7b-alpha", + "developer": "HuggingFaceH4", + "scores": { + "Score": 0.7392, + "Chat": 0.9162, + "Chat Hard": 0.625, + "Safety": 0.7662, + "Reasoning": 0.7514, + "Prior Sets (0.5 weight)": 0.5353 + } + }, + { + "model_id": "HuggingFaceH4/zephyr-7b-beta", + "name": "zephyr-7b-beta", + "developer": "HuggingFaceH4", + "scores": { + "Score": 0.7281, + "Chat": 0.9525, + "Chat Hard": 0.6272, + "Safety": 0.6568, + "Reasoning": 0.7789, + "Prior Sets (0.5 weight)": 0.5216 + } + }, + { + "model_id": "HuggingFaceH4/zephyr-7b-gemma-v0.1", + "name": "zephyr-7b-gemma-v0.1", + "developer": "HuggingFaceH4", + "scores": { + "Score": 0.6758, + "Chat": 0.9581, + "Chat Hard": 0.4956, + "Safety": 0.5824, + "Reasoning": 0.7463, + "Prior Sets (0.5 weight)": 0.5171 + } + }, + { + "model_id": "IDEA-CCNL/Ziya-LLaMA-7B-Reward", + "name": "IDEA-CCNL/Ziya-LLaMA-7B-Reward", + "developer": "IDEA-CCNL", + "scores": { + "Score": 0.6378, + "Chat": 0.8687, + "Chat Hard": 0.4605, + "Safety": 0.6405, + "Reasoning": 0.5775, + "Prior Sets (0.5 weight)": 0.6461 + } + }, + { + "model_id": "LxzGordon/URM-LLaMa-3-8B", + "name": "LxzGordon/URM-LLaMa-3-8B", + "developer": "LxzGordon", + "scores": { + "Score": 0.8991, + "Chat": 0.9693, + "Chat Hard": 0.7873, + "Safety": 0.8824, + "Reasoning": 0.9574 + } + }, + { + "model_id": "LxzGordon/URM-LLaMa-3.1-8B", + "name": "LxzGordon/URM-LLaMa-3.1-8B", + "developer": "LxzGordon", + "scores": { + "Score": 0.7394, + "Factuality": 0.6884, + "Precise IF": 0.45, + "Math": 0.6393, + "Safety": 0.9178, + "Focus": 0.9758, + "Ties": 0.7653 + } + }, + { + "model_id": "NCSOFT/Llama-3-OffsetBias-8B", + "name": "NCSOFT/Llama-3-OffsetBias-8B", + "developer": "NCSOFT", + "scores": { + "Score": 0.8397, + "Chat": 0.9246, + "Chat Hard": 0.8026, + "Safety": 0.8676, + "Reasoning": 0.7639 + } + }, + { + "model_id": "NCSOFT/Llama-3-OffsetBias-RM-8B", + "name": "NCSOFT/Llama-3-OffsetBias-RM-8B", + "developer": "NCSOFT", + "scores": { + "Score": 0.8942, + "Chat": 0.9721, + "Chat Hard": 0.818, + "Safety": 0.8676, + "Reasoning": 0.9192 + } + }, + { + "model_id": "Nexusflow/Starling-RM-34B", + "name": "Nexusflow/Starling-RM-34B", + "developer": "Nexusflow", + "scores": { + "Score": 0.4553, + "Factuality": 0.4589, + "Precise IF": 0.3187, + "Math": 0.6175, + "Safety": 0.7556, + "Focus": 0.4808, + "Ties": 0.1004 + } + }, + { + "model_id": "NousResearch/Hermes-3-Llama-3.1-70B", + "name": "Hermes-3-Llama-3.1-70B", + "developer": "NousResearch", + "scores": { + "Score": 0.7847, + "Chat": 0.9623, + "Chat Hard": 0.5669, + "Safety": 0.823, + "Reasoning": 0.7867 + } + }, + { + "model_id": "NousResearch/Nous-Hermes-2-Mistral-7B-DPO", + "name": "Nous-Hermes-2-Mistral-7B-DPO", + "developer": "NousResearch", + "scores": { + "Score": 0.7481, + "Chat": 0.9218, + "Chat Hard": 0.6053, + "Safety": 0.8243, + "Reasoning": 0.7375, + "Prior Sets (0.5 weight)": 0.555 + } + }, + { + "model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO", + "name": "Nous-Hermes-2-Mixtral-8x7B-DPO", + "developer": "NousResearch", + "scores": { + "Score": 0.7138, + "Chat": 0.9162, + "Chat Hard": 0.6053, + "Safety": 0.8149, + "Reasoning": 0.6126, + "Prior Sets (0.5 weight)": 0.5266 + } + }, + { + "model_id": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1", + "name": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1", + "developer": "OpenAssistant", + "scores": { + "Score": 0.2653, + "Factuality": 0.3979, + "Precise IF": 0.2875, + "Math": 0.377, + "Safety": 0.3289, + "Focus": 0.1535, + "Ties": 0.047 + } + }, + { + "model_id": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5", + "name": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5", + "developer": "OpenAssistant", + "scores": { + "Score": 0.6901, + "Chat": 0.8855, + "Chat Hard": 0.4868, + "Safety": 0.6311, + "Reasoning": 0.7752, + "Prior Sets (0.5 weight)": 0.6533 + } + }, + { + "model_id": "OpenAssistant/reward-model-deberta-v3-large-v2", + "name": "OpenAssistant/reward-model-deberta-v3-large-v2", + "developer": "OpenAssistant", + "scores": { + "Score": 0.6126, + "Chat": 0.8939, + "Chat Hard": 0.4518, + "Safety": 0.7338, + "Reasoning": 0.3855, + "Prior Sets (0.5 weight)": 0.5836 + } + }, + { + "model_id": "PKU-Alignment/beaver-7b-v1.0-cost", + "name": "PKU-Alignment/beaver-7b-v1.0-cost", + "developer": "PKU-Alignment", + "scores": { + "Score": 0.5798, + "Chat": 0.6173, + "Chat Hard": 0.4232, + "Safety": 0.7351, + "Reasoning": 0.5482, + "Prior Sets (0.5 weight)": 0.57 + } + }, + { + "model_id": "PKU-Alignment/beaver-7b-v1.0-reward", + "name": "PKU-Alignment/beaver-7b-v1.0-reward", + "developer": "PKU-Alignment", + "scores": { + "Score": 0.1606, + "Factuality": 0.2105, + "Precise IF": 0.2938, + "Math": 0.2623, + "Safety": 0.1422, + "Focus": 0.0646, + "Ties": -0.01 + } + }, + { + "model_id": "PKU-Alignment/beaver-7b-v2.0-cost", + "name": "PKU-Alignment/beaver-7b-v2.0-cost", + "developer": "PKU-Alignment", + "scores": { + "Score": 0.5957, + "Chat": 0.5726, + "Chat Hard": 0.4561, + "Safety": 0.7608, + "Reasoning": 0.6211, + "Prior Sets (0.5 weight)": 0.5397 + } + }, + { + "model_id": "PKU-Alignment/beaver-7b-v2.0-reward", + "name": "PKU-Alignment/beaver-7b-v2.0-reward", + "developer": "PKU-Alignment", + "scores": { + "Score": 0.6366, + "Chat": 0.8994, + "Chat Hard": 0.364, + "Safety": 0.6041, + "Reasoning": 0.6887, + "Prior Sets (0.5 weight)": 0.6171 + } + }, + { + "model_id": "PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022...", + "name": "PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022...", + "developer": "PoLL", + "scores": { + "Score": 0.7578, + "Chat": 0.9525, + "Chat Hard": 0.5406, + "Safety": 0.8034, + "Reasoning": 0.7346 + } + }, + { + "model_id": "Qwen/Qwen1.5-0.5B-Chat", + "name": "Qwen1.5-0.5B-Chat", + "developer": "Qwen", + "scores": { + "Score": 0.5298, + "Chat": 0.3547, + "Chat Hard": 0.6294, + "Safety": 0.5703, + "Reasoning": 0.5984, + "Prior Sets (0.5 weight)": 0.4629 + } + }, + { + "model_id": "Qwen/Qwen1.5-1.8B-Chat", + "name": "Qwen1.5-1.8B-Chat", + "developer": "Qwen", + "scores": { + "Score": 0.589, + "Chat": 0.5615, + "Chat Hard": 0.6031, + "Safety": 0.4838, + "Reasoning": 0.7793, + "Prior Sets (0.5 weight)": 0.4453 + } + }, + { + "model_id": "Qwen/Qwen1.5-14B-Chat", + "name": "Qwen1.5-14B-Chat", + "developer": "Qwen", + "scores": { + "Score": 0.6864, + "Chat": 0.5726, + "Chat Hard": 0.7018, + "Safety": 0.7122, + "Reasoning": 0.8961, + "Prior Sets (0.5 weight)": 0.4123 + } + }, + { + "model_id": "Qwen/Qwen1.5-4B-Chat", + "name": "Qwen1.5-4B-Chat", + "developer": "Qwen", + "scores": { + "Score": 0.5477, + "Chat": 0.3883, + "Chat Hard": 0.6272, + "Safety": 0.5568, + "Reasoning": 0.6689, + "Prior Sets (0.5 weight)": 0.447 + } + }, + { + "model_id": "Qwen/Qwen1.5-72B-Chat", + "name": "Qwen/Qwen1.5-72B-Chat", + "developer": "Qwen", + "scores": { + "Score": 0.6723, + "Chat": 0.6229, + "Chat Hard": 0.6601, + "Safety": 0.6757, + "Reasoning": 0.8554, + "Prior Sets (0.5 weight)": 0.4226 + } + }, + { + "model_id": "Qwen/Qwen1.5-7B-Chat", + "name": "Qwen1.5-7B-Chat", + "developer": "Qwen", + "scores": { + "Score": 0.675, + "Chat": 0.5363, + "Chat Hard": 0.6908, + "Safety": 0.6919, + "Reasoning": 0.9041, + "Prior Sets (0.5 weight)": 0.4288 + } + }, + { + "model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat", + "name": "Qwen1.5-MoE-A2.7B-Chat", + "developer": "Qwen", + "scores": { + "Score": 0.6644, + "Chat": 0.7291, + "Chat Hard": 0.6316, + "Safety": 0.6284, + "Reasoning": 0.774, + "Prior Sets (0.5 weight)": 0.4536 + } + }, + { + "model_id": "Qwen/WorldPM-72B", + "name": "Qwen/WorldPM-72B", + "developer": "Qwen", + "scores": { + "Score": 0.6333, + "Factuality": 0.7074, + "Precise IF": 0.3125, + "Math": 0.6557, + "Safety": 0.8533, + "Focus": 0.9172, + "Ties": 0.3535 + } + }, + { + "model_id": "R-I-S-E/RISE-Judge-Qwen2.5-32B", + "name": "R-I-S-E/RISE-Judge-Qwen2.5-32B", + "developer": "R-I-S-E", + "scores": { + "Score": 0.9266, + "Chat": 0.9665, + "Chat Hard": 0.8333, + "Safety": 0.9189, + "Reasoning": 0.9877 + } + }, + { + "model_id": "R-I-S-E/RISE-Judge-Qwen2.5-7B", + "name": "R-I-S-E/RISE-Judge-Qwen2.5-7B", + "developer": "R-I-S-E", + "scores": { + "Score": 0.8819, + "Chat": 0.9218, + "Chat Hard": 0.7654, + "Safety": 0.8797, + "Reasoning": 0.9608 + } + }, + { + "model_id": "RLHFlow/ArmoRM-Llama3-8B-v0.1", + "name": "ArmoRM-Llama3-8B-v0.1", + "developer": "RLHFlow", + "scores": { + "Score": 0.6646, + "Factuality": 0.6568, + "Precise IF": 0.4188, + "Math": 0.6612, + "Safety": 0.8222, + "Focus": 0.7657, + "Ties": 0.6629 + } + }, + { + "model_id": "RLHFlow/LLaMA3-iterative-DPO-final", + "name": "LLaMA3-iterative-DPO-final", + "developer": "RLHFlow", + "scores": { + "Score": 0.6783, + "Chat": 0.838, + "Chat Hard": 0.5921, + "Safety": 0.7865, + "Reasoning": 0.6161, + "Prior Sets (0.5 weight)": 0.4392 + } + }, + { + "model_id": "RLHFlow/RewardModel-Mistral-7B-for-DPA-v1", + "name": "RLHFlow/RewardModel-Mistral-7B-for-DPA-v1", + "developer": "RLHFlow", + "scores": { + "Score": 0.6633, + "Chat": 0.8799, + "Chat Hard": 0.4978, + "Safety": 0.7068, + "Reasoning": 0.5971, + "Prior Sets (0.5 weight)": 0.6068 + } + }, + { + "model_id": "RLHFlow/pair-preference-model-LLaMA3-8B", + "name": "RLHFlow/pair-preference-model-LLaMA3-8B", + "developer": "RLHFlow", + "scores": { + "Score": 0.8575, + "Chat": 0.9832, + "Chat Hard": 0.6579, + "Safety": 0.8973, + "Reasoning": 0.9473, + "Prior Sets (0.5 weight)": 0.7458 + } + }, + { + "model_id": "Ray2333/GRM-Gemma-2B-rewardmodel-ft", + "name": "Ray2333/GRM-Gemma-2B-rewardmodel-ft", + "developer": "Ray2333", + "scores": { + "Score": 0.8447, + "Chat": 0.8939, + "Chat Hard": 0.7522, + "Safety": 0.8446, + "Reasoning": 0.8881 + } + }, + { + "model_id": "Ray2333/GRM-Gemma-2B-sftreg", + "name": "Ray2333/GRM-Gemma-2B-sftreg", + "developer": "Ray2333", + "scores": { + "Score": 0.7451, + "Chat": 0.9553, + "Chat Hard": 0.4868, + "Safety": 0.7932, + "Reasoning": 0.7684, + "Prior Sets (0.5 weight)": 0.6983 + } + }, + { + "model_id": "Ray2333/GRM-Llama3-8B-rewardmodel-ft", + "name": "Ray2333/GRM-Llama3-8B-rewardmodel-ft", + "developer": "Ray2333", + "scores": { + "Score": 0.9154, + "Chat": 0.9553, + "Chat Hard": 0.8618, + "Safety": 0.9081, + "Reasoning": 0.9362 + } + }, + { + "model_id": "Ray2333/GRM-gemma2-2B-rewardmodel-ft", + "name": "Ray2333/GRM-gemma2-2B-rewardmodel-ft", + "developer": "Ray2333", + "scores": { + "Score": 0.5966, + "Factuality": 0.5305, + "Precise IF": 0.3125, + "Math": 0.5902, + "Safety": 0.9222, + "Focus": 0.7455, + "Ties": 0.4788 + } + }, + { + "model_id": "Ray2333/GRM-llama3-8B-distill", + "name": "Ray2333/GRM-llama3-8B-distill", + "developer": "Ray2333", + "scores": { + "Score": 0.8464, + "Chat": 0.9832, + "Chat Hard": 0.6842, + "Safety": 0.8676, + "Reasoning": 0.9133, + "Prior Sets (0.5 weight)": 0.7209 + } + }, + { + "model_id": "Ray2333/GRM-llama3-8B-sftreg", + "name": "Ray2333/GRM-llama3-8B-sftreg", + "developer": "Ray2333", + "scores": { + "Score": 0.8542, + "Chat": 0.986, + "Chat Hard": 0.6776, + "Safety": 0.8919, + "Reasoning": 0.9229, + "Prior Sets (0.5 weight)": 0.7309 + } + }, + { + "model_id": "Ray2333/GRM-llama3.2-3B-rewardmodel-ft", + "name": "Ray2333/GRM-llama3.2-3B-rewardmodel-ft", + "developer": "Ray2333", + "scores": { + "Score": 0.9092, + "Chat": 0.9162, + "Chat Hard": 0.8487, + "Safety": 0.927, + "Reasoning": 0.945 + } + }, + { + "model_id": "Ray2333/Gemma-2B-rewardmodel-baseline", + "name": "Ray2333/Gemma-2B-rewardmodel-baseline", + "developer": "Ray2333", + "scores": { + "Score": 0.729, + "Chat": 0.9413, + "Chat Hard": 0.4693, + "Safety": 0.7865, + "Reasoning": 0.7384, + "Prior Sets (0.5 weight)": 0.6897 + } + }, + { + "model_id": "Ray2333/Gemma-2B-rewardmodel-ft", + "name": "Ray2333/Gemma-2B-rewardmodel-ft", + "developer": "Ray2333", + "scores": { + "Score": 0.8048, + "Chat": 0.7793, + "Chat Hard": 0.7478, + "Safety": 0.8527, + "Reasoning": 0.8393 + } + }, + { + "model_id": "Ray2333/reward-model-Mistral-7B-instruct-Unifie...", + "name": "Ray2333/reward-model-Mistral-7B-instruct-Unifie...", + "developer": "Ray2333", + "scores": { + "Score": 0.7661, + "Chat": 0.9777, + "Chat Hard": 0.5066, + "Safety": 0.8527, + "Reasoning": 0.7389, + "Prior Sets (0.5 weight)": 0.7434 + } + }, + { + "model_id": "SF-Foundation/TextEval-Llama3.1-70B", + "name": "SF-Foundation/TextEval-Llama3.1-70B", + "developer": "SF-Foundation", + "scores": { + "Score": 0.9348, + "Chat": 0.9413, + "Chat Hard": 0.9013, + "Safety": 0.9324, + "Reasoning": 0.9641 + } + }, + { + "model_id": "SF-Foundation/TextEval-OffsetBias-12B", + "name": "SF-Foundation/TextEval-OffsetBias-12B", + "developer": "SF-Foundation", + "scores": { + "Score": 0.9105, + "Chat": 0.919, + "Chat Hard": 0.8662, + "Safety": 0.9203, + "Reasoning": 0.9365 + } + }, + { + "model_id": "Salesforce/SFR-LLaMa-3.1-70B-Judge-r", + "name": "Salesforce/SFR-LLaMa-3.1-70B-Judge-r", + "developer": "Salesforce", + "scores": { + "Score": 0.9272, + "Chat": 0.9693, + "Chat Hard": 0.8476, + "Safety": 0.9162, + "Reasoning": 0.9757 + } + }, + { + "model_id": "Salesforce/SFR-LLaMa-3.1-8B-Judge-r", + "name": "Salesforce/SFR-LLaMa-3.1-8B-Judge-r", + "developer": "Salesforce", + "scores": { + "Score": 0.8865, + "Chat": 0.9553, + "Chat Hard": 0.7774, + "Safety": 0.8622, + "Reasoning": 0.9513 + } + }, + { + "model_id": "Salesforce/SFR-nemo-12B-Judge-r", + "name": "Salesforce/SFR-nemo-12B-Judge-r", + "developer": "Salesforce", + "scores": { + "Score": 0.9027, + "Chat": 0.9721, + "Chat Hard": 0.8224, + "Safety": 0.8649, + "Reasoning": 0.9513 + } + }, + { + "model_id": "Schrieffer/Llama-SARM-4B", + "name": "Schrieffer/Llama-SARM-4B", + "developer": "Schrieffer", + "scores": { + "Score": 0.7379, + "Factuality": 0.6874, + "Precise IF": 0.4281, + "Math": 0.6448, + "Safety": 0.9178, + "Focus": 0.9556, + "Ties": 0.7939 + } + }, + { + "model_id": "ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1", + "name": "ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1", + "developer": "ShikaiChen", + "scores": { + "Score": 0.7249, + "Factuality": 0.7558, + "Precise IF": 0.35, + "Math": 0.6448, + "Safety": 0.9222, + "Focus": 0.9131, + "Ties": 0.7633 + } + }, + { + "model_id": "Skywork/Skywork-Critic-Llama-3.1-70B", + "name": "Skywork/Skywork-Critic-Llama-3.1-70B", + "developer": "Skywork", + "scores": { + "Score": 0.9331, + "Chat": 0.9665, + "Chat Hard": 0.8794, + "Safety": 0.9311, + "Reasoning": 0.9554 + } + }, + { + "model_id": "Skywork/Skywork-Critic-Llama-3.1-8B", + "name": "Skywork/Skywork-Critic-Llama-3.1-8B", + "developer": "Skywork", + "scores": { + "Score": 0.8896, + "Chat": 0.9358, + "Chat Hard": 0.8136, + "Safety": 0.9108, + "Reasoning": 0.898 + } + }, + { + "model_id": "Skywork/Skywork-Reward-Gemma-2-27B", + "name": "Skywork/Skywork-Reward-Gemma-2-27B", + "developer": "Skywork", + "scores": { + "Score": 0.7576, + "Factuality": 0.7368, + "Precise IF": 0.4031, + "Math": 0.7049, + "Safety": 0.9422, + "Focus": 0.9323, + "Ties": 0.8261 + } + }, + { + "model_id": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2", + "name": "Skywork-Reward-Gemma-2-27B-v0.2", + "developer": "Skywork", + "scores": { + "Score": 0.7531, + "Factuality": 0.7674, + "Precise IF": 0.375, + "Math": 0.6721, + "Safety": 0.9689, + "Focus": 0.9172, + "Ties": 0.8182 + } + }, + { + "model_id": "Skywork/Skywork-Reward-Llama-3.1-8B", + "name": "Skywork/Skywork-Reward-Llama-3.1-8B", + "developer": "Skywork", + "scores": { + "Score": 0.7314, + "Factuality": 0.6989, + "Precise IF": 0.425, + "Math": 0.6284, + "Safety": 0.9333, + "Focus": 0.9616, + "Ties": 0.741 + } + }, + { + "model_id": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", + "name": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", + "developer": "Skywork", + "scores": { + "Score": 0.7175, + "Factuality": 0.6968, + "Precise IF": 0.4062, + "Math": 0.6011, + "Safety": 0.9422, + "Focus": 0.9414, + "Ties": 0.7169 + } + }, + { + "model_id": "Skywork/Skywork-Reward-V2-Llama-3.1-8B", + "name": "Skywork/Skywork-Reward-V2-Llama-3.1-8B", + "developer": "Skywork", + "scores": { + "Score": 0.8413, + "Factuality": 0.8463, + "Precise IF": 0.6625, + "Math": 0.776, + "Safety": 0.9667, + "Focus": 0.9838, + "Ties": 0.8124 + } + }, + { + "model_id": "Skywork/Skywork-Reward-V2-Llama-3.2-1B", + "name": "Skywork/Skywork-Reward-V2-Llama-3.2-1B", + "developer": "Skywork", + "scores": { + "Score": 0.6438, + "Factuality": 0.6084, + "Precise IF": 0.4562, + "Math": 0.6011, + "Safety": 0.8733, + "Focus": 0.8929, + "Ties": 0.4306 + } + }, + { + "model_id": "Skywork/Skywork-Reward-V2-Llama-3.2-3B", + "name": "Skywork/Skywork-Reward-V2-Llama-3.2-3B", + "developer": "Skywork", + "scores": { + "Score": 0.7466, + "Factuality": 0.7621, + "Precise IF": 0.4562, + "Math": 0.694, + "Safety": 0.9311, + "Focus": 0.9596, + "Ties": 0.6768 + } + }, + { + "model_id": "Skywork/Skywork-Reward-V2-Qwen3-0.6B", + "name": "Skywork/Skywork-Reward-V2-Qwen3-0.6B", + "developer": "Skywork", + "scores": { + "Score": 0.6125, + "Factuality": 0.58, + "Precise IF": 0.4, + "Math": 0.7158, + "Safety": 0.8444, + "Focus": 0.7949, + "Ties": 0.3397 + } + }, + { + "model_id": "Skywork/Skywork-Reward-V2-Qwen3-1.7B", + "name": "Skywork/Skywork-Reward-V2-Qwen3-1.7B", + "developer": "Skywork", + "scores": { + "Score": 0.6818, + "Factuality": 0.6568, + "Precise IF": 0.4437, + "Math": 0.7268, + "Safety": 0.8911, + "Focus": 0.8848, + "Ties": 0.4872 + } + }, + { + "model_id": "Skywork/Skywork-Reward-V2-Qwen3-4B", + "name": "Skywork/Skywork-Reward-V2-Qwen3-4B", + "developer": "Skywork", + "scores": { + "Score": 0.7551, + "Factuality": 0.7737, + "Precise IF": 0.4625, + "Math": 0.7322, + "Safety": 0.9222, + "Focus": 0.9657, + "Ties": 0.6743 + } + }, + { + "model_id": "Skywork/Skywork-Reward-V2-Qwen3-8B", + "name": "Skywork/Skywork-Reward-V2-Qwen3-8B", + "developer": "Skywork", + "scores": { + "Score": 0.7837, + "Factuality": 0.7989, + "Precise IF": 0.5, + "Math": 0.7705, + "Safety": 0.94, + "Focus": 0.9636, + "Ties": 0.7294 + } + }, + { + "model_id": "Skywork/Skywork-VL-Reward-7B", + "name": "Skywork/Skywork-VL-Reward-7B", + "developer": "Skywork", + "scores": { + "Score": 0.6885, + "Factuality": 0.6063, + "Precise IF": 0.35, + "Math": 0.6339, + "Safety": 0.8911, + "Focus": 0.8909, + "Ties": 0.7586 + } + }, + { + "model_id": "SultanR/SmolTulu-1.7b-RM", + "name": "SultanR/SmolTulu-1.7b-RM", + "developer": "SultanR", + "scores": { + "Score": 0.5094, + "Chat": 0.743, + "Chat Hard": 0.4408, + "Safety": 0.5716, + "Reasoning": 0.2821 + } + }, + { + "model_id": "ZiyiYe/Con-J-Qwen2-7B", + "name": "ZiyiYe/Con-J-Qwen2-7B", + "developer": "ZiyiYe", + "scores": { + "Score": 0.8712, + "Chat": 0.919, + "Chat Hard": 0.8026, + "Safety": 0.8824, + "Reasoning": 0.8808 + } + }, + { + "model_id": "ai2/llama-2-chat-7b-nectar-3.8m.json", + "name": "ai2/llama-2-chat-7b-nectar-3.8m.json", + "developer": "ai2", + "scores": { + "Score": 0.5843, + "Chat": 0.8631, + "Chat Hard": 0.2654, + "Safety": 0.6243 + } + }, + { + "model_id": "ai2/llama-2-chat-nectar-180k.json", + "name": "ai2/llama-2-chat-nectar-180k.json", + "developer": "ai2", + "scores": { + "Score": 0.5235, + "Chat": 0.8827, + "Chat Hard": 0.2851, + "Safety": 0.4027 + } + }, + { + "model_id": "ai2/llama-2-chat-ultrafeedback-60k.jsonl", + "name": "ai2/llama-2-chat-ultrafeedback-60k.jsonl", + "developer": "ai2", + "scores": { + "Score": 0.644, + "Chat": 0.9441, + "Chat Hard": 0.4539, + "Safety": 0.5338 + } + }, + { + "model_id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", + "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", + "developer": "ai2", + "scores": { + "Score": 0.7058, + "Chat": 0.9525, + "Chat Hard": 0.3947, + "Safety": 0.7703 + } + }, + { + "model_id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json", + "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json", + "developer": "ai2", + "scores": { + "Score": 0.7127, + "Chat": 0.9358, + "Chat Hard": 0.4079, + "Safety": 0.7946 + } + }, + { + "model_id": "ai2/tulu-2-7b-rm-v0-nectar-binarized.json", + "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized.json", + "developer": "ai2", + "scores": { + "Score": 0.6756, + "Chat": 0.9134, + "Chat Hard": 0.3904, + "Safety": 0.723 + } + }, + { + "model_id": "ai2/tulu-2-7b-rm-v0.json", + "name": "ai2/tulu-2-7b-rm-v0.json", + "developer": "ai2", + "scores": { + "Score": 0.6655, + "Chat": 0.933, + "Chat Hard": 0.4539, + "Safety": 0.6095 + } + }, + { + "model_id": "allenai/Llama-3.1-70B-Instruct-RM-RB2", + "name": "allenai/Llama-3.1-70B-Instruct-RM-RB2", + "developer": "allenai", + "scores": { + "Score": 0.7606, + "Factuality": 0.8126, + "Precise IF": 0.4188, + "Math": 0.6995, + "Safety": 0.8844, + "Focus": 0.8646, + "Ties": 0.8835 + } + }, + { + "model_id": "allenai/Llama-3.1-8B-Base-RM-RB2", + "name": "allenai/Llama-3.1-8B-Base-RM-RB2", + "developer": "allenai", + "scores": { + "Score": 0.8463, + "Chat": 0.933, + "Chat Hard": 0.7785, + "Safety": 0.8851, + "Reasoning": 0.7886, + "Prior Sets (0.5 weight)": 0.0 + } + }, + { + "model_id": "allenai/Llama-3.1-8B-Instruct-RM-RB2", + "name": "allenai/Llama-3.1-8B-Instruct-RM-RB2", + "developer": "allenai", + "scores": { + "Score": 0.7285, + "Factuality": 0.7432, + "Precise IF": 0.4437, + "Math": 0.6175, + "Safety": 0.8956, + "Focus": 0.9071, + "Ties": 0.7638 + } + }, + { + "model_id": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", + "name": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", + "developer": "allenai", + "scores": { + "Score": 0.722, + "Factuality": 0.8084, + "Precise IF": 0.3688, + "Math": 0.6776, + "Safety": 0.8689, + "Focus": 0.7778, + "Ties": 0.8308 + } + }, + { + "model_id": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", + "name": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", + "developer": "allenai", + "scores": { + "Score": 0.8431, + "Chat": 0.9553, + "Chat Hard": 0.761, + "Safety": 0.8662, + "Reasoning": 0.7898, + "Prior Sets (0.5 weight)": 0.0 + } + }, + { + "model_id": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", + "name": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", + "developer": "allenai", + "scores": { + "Score": 0.8369, + "Chat": 0.9469, + "Chat Hard": 0.7588, + "Safety": 0.8703, + "Reasoning": 0.7715, + "Prior Sets (0.5 weight)": 0.0 + } + }, + { + "model_id": "allenai/Llama-3.1-Tulu-3-8B-RM", + "name": "Llama-3.1-Tulu-3-8B-RM", + "developer": "allenai", + "scores": { + "Score": 0.59, + "Factuality": 0.7453, + "Precise IF": 0.3469, + "Math": 0.6448, + "Safety": 0.7422, + "Focus": 0.5364, + "Ties": 0.5243 + } + }, + { + "model_id": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", + "name": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", + "developer": "allenai", + "scores": { + "Score": 0.6821, + "Factuality": 0.7326, + "Precise IF": 0.3875, + "Math": 0.5792, + "Safety": 0.8978, + "Focus": 0.8889, + "Ties": 0.6063 + } + }, + { + "model_id": "allenai/OLMo-7B-Instruct", + "name": "allenai/OLMo-7B-Instruct", + "developer": "allenai", + "scores": { + "Score": 0.6727, + "Chat": 0.8966, + "Chat Hard": 0.5066, + "Safety": 0.6486, + "Reasoning": 0.7168, + "Prior Sets (0.5 weight)": 0.5173 + } + }, + { + "model_id": "allenai/llama-3-tulu-2-70b-uf-mean-rm", + "name": "allenai/llama-3-tulu-2-70b-uf-mean-rm", + "developer": "allenai", + "scores": { + "Score": 0.7019, + "Chat": 0.8631, + "Chat Hard": 0.5614, + "Safety": 0.6095, + "Reasoning": 0.8268, + "Prior Sets (0.5 weight)": 0.5957 + } + }, + { + "model_id": "allenai/llama-3-tulu-2-8b-uf-mean-rm", + "name": "allenai/llama-3-tulu-2-8b-uf-mean-rm", + "developer": "allenai", + "scores": { + "Score": 0.7342, + "Chat": 0.9525, + "Chat Hard": 0.5921, + "Safety": 0.6162, + "Reasoning": 0.8212, + "Prior Sets (0.5 weight)": 0.6434 + } + }, + { + "model_id": "allenai/llama-3-tulu-2-dpo-70b", + "name": "allenai/llama-3-tulu-2-dpo-70b", + "developer": "allenai", + "scores": { + "Score": 0.7496, + "Chat": 0.9637, + "Chat Hard": 0.5746, + "Safety": 0.7486, + "Reasoning": 0.802, + "Prior Sets (0.5 weight)": 0.5687 + } + }, + { + "model_id": "allenai/llama-3-tulu-2-dpo-8b", + "name": "allenai/llama-3-tulu-2-dpo-8b", + "developer": "allenai", + "scores": { + "Score": 0.7275, + "Chat": 0.9525, + "Chat Hard": 0.5351, + "Safety": 0.6649, + "Reasoning": 0.8663, + "Prior Sets (0.5 weight)": 0.5097 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1739590997", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739590997", + "developer": "allenai", + "scores": { + "Score": 0.6004, + "Factuality": 0.7032, + "Precise IF": 0.375, + "Math": 0.623, + "Safety": 0.7867, + "Focus": 0.598, + "Ties": 0.5165 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1739871066", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739871066", + "developer": "allenai", + "scores": { + "Score": 0.6012, + "Factuality": 0.6989, + "Precise IF": 0.425, + "Math": 0.6284, + "Safety": 0.7978, + "Focus": 0.604, + "Ties": 0.4527 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1739925892", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739925892", + "developer": "allenai", + "scores": { + "Score": 0.6345, + "Factuality": 0.7432, + "Precise IF": 0.3563, + "Math": 0.623, + "Safety": 0.8111, + "Focus": 0.7131, + "Ties": 0.5606 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1739943850", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739943850", + "developer": "allenai", + "scores": { + "Score": 0.4978, + "Factuality": 0.5726, + "Precise IF": 0.3125, + "Math": 0.5191, + "Safety": 0.6489, + "Focus": 0.6222, + "Ties": 0.3114 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1739943881", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739943881", + "developer": "allenai", + "scores": { + "Score": 0.5998, + "Factuality": 0.7032, + "Precise IF": 0.3187, + "Math": 0.5792, + "Safety": 0.8222, + "Focus": 0.6727, + "Ties": 0.5025 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1739943972", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739943972", + "developer": "allenai", + "scores": { + "Score": 0.5289, + "Factuality": 0.6168, + "Precise IF": 0.375, + "Math": 0.5738, + "Safety": 0.6844, + "Focus": 0.5657, + "Ties": 0.3577 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1739957701", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739957701", + "developer": "allenai", + "scores": { + "Score": 0.6194, + "Factuality": 0.6779, + "Precise IF": 0.3563, + "Math": 0.6011, + "Safety": 0.8022, + "Focus": 0.697, + "Ties": 0.5822 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1739971507", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739971507", + "developer": "allenai", + "scores": { + "Score": 0.5717, + "Factuality": 0.68, + "Precise IF": 0.375, + "Math": 0.6066, + "Safety": 0.7667, + "Focus": 0.5475, + "Ties": 0.4545 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1739971529", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739971529", + "developer": "allenai", + "scores": { + "Score": 0.5564, + "Factuality": 0.6568, + "Precise IF": 0.3563, + "Math": 0.5956, + "Safety": 0.7533, + "Focus": 0.5737, + "Ties": 0.4027 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1739998765", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739998765", + "developer": "allenai", + "scores": { + "Score": 0.6008, + "Factuality": 0.7095, + "Precise IF": 0.4125, + "Math": 0.6066, + "Safety": 0.8022, + "Focus": 0.5859, + "Ties": 0.4883 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1740005072", + "name": "allenai/open_instruct_dev-reward_modeling__1__1740005072", + "developer": "allenai", + "scores": { + "Score": 0.6097, + "Factuality": 0.7137, + "Precise IF": 0.3937, + "Math": 0.6339, + "Safety": 0.7778, + "Focus": 0.6343, + "Ties": 0.5047 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1740129284", + "name": "allenai/open_instruct_dev-reward_modeling__1__1740129284", + "developer": "allenai", + "scores": { + "Score": 0.6129, + "Factuality": 0.7116, + "Precise IF": 0.4437, + "Math": 0.6448, + "Safety": 0.8022, + "Focus": 0.6101, + "Ties": 0.4652 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1741286813", + "name": "allenai/open_instruct_dev-reward_modeling__1__1741286813", + "developer": "allenai", + "scores": { + "Score": 0.6557, + "Factuality": 0.6295, + "Precise IF": 0.4188, + "Math": 0.612, + "Safety": 0.9111, + "Focus": 0.8263, + "Ties": 0.5365 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1741287363", + "name": "allenai/open_instruct_dev-reward_modeling__1__1741287363", + "developer": "allenai", + "scores": { + "Score": 0.6672, + "Factuality": 0.6295, + "Precise IF": 0.375, + "Math": 0.6066, + "Safety": 0.88, + "Focus": 0.9374, + "Ties": 0.5748 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1741292911", + "name": "allenai/open_instruct_dev-reward_modeling__1__1741292911", + "developer": "allenai", + "scores": { + "Score": 0.6607, + "Factuality": 0.6589, + "Precise IF": 0.4, + "Math": 0.6066, + "Safety": 0.9089, + "Focus": 0.8869, + "Ties": 0.5028 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1742338142", + "name": "allenai/open_instruct_dev-reward_modeling__1__1742338142", + "developer": "allenai", + "scores": { + "Score": 0.6344, + "Factuality": 0.7326, + "Precise IF": 0.3812, + "Math": 0.7049, + "Safety": 0.88, + "Focus": 0.6323, + "Ties": 0.475 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1742519610", + "name": "allenai/open_instruct_dev-reward_modeling__1__1742519610", + "developer": "allenai", + "scores": { + "Score": 0.6361, + "Factuality": 0.7074, + "Precise IF": 0.3812, + "Math": 0.6721, + "Safety": 0.82, + "Focus": 0.6444, + "Ties": 0.5915 + } + }, + { + "model_id": "allenai/open_instruct_dev-reward_modeling__1__1742519628", + "name": "allenai/open_instruct_dev-reward_modeling__1__1742519628", + "developer": "allenai", + "scores": { + "Score": 0.5609, + "Factuality": 0.5179, + "Precise IF": 0.3563, + "Math": 0.623, + "Safety": 0.8356, + "Focus": 0.5071, + "Ties": 0.5254 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455", + "name": "allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455", + "developer": "allenai", + "scores": { + "Score": 0.0576, + "Factuality": 0.04, + "Precise IF": 0.1313, + "Math": 0.0546, + "Safety": 0.0489, + "Focus": 0.0808, + "Ties": -0.01 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511", + "name": "allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511", + "developer": "allenai", + "scores": { + "Score": 0.5499, + "Factuality": 0.6821, + "Precise IF": 0.3937, + "Math": 0.5956, + "Safety": 0.7356, + "Focus": 0.5212, + "Ties": 0.3711 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406", + "name": "allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406", + "developer": "allenai", + "scores": { + "Score": 0.5054, + "Factuality": 0.6358, + "Precise IF": 0.3688, + "Math": 0.6066, + "Safety": 0.6867, + "Focus": 0.4424, + "Ties": 0.2922 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136", + "name": "allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136", + "developer": "allenai", + "scores": { + "Score": 0.478, + "Factuality": 0.6442, + "Precise IF": 0.3563, + "Math": 0.612, + "Safety": 0.6356, + "Focus": 0.2707, + "Ties": 0.3496 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398", + "name": "allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398", + "developer": "allenai", + "scores": { + "Score": 0.219, + "Factuality": 0.2484, + "Precise IF": 0.2812, + "Math": 0.2623, + "Safety": 0.3422, + "Focus": 0.1717, + "Ties": 0.008 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535", + "name": "allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535", + "developer": "allenai", + "scores": { + "Score": 0.5625, + "Factuality": 0.6821, + "Precise IF": 0.4062, + "Math": 0.6011, + "Safety": 0.7511, + "Focus": 0.5313, + "Ties": 0.403 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054", + "name": "allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054", + "developer": "allenai", + "scores": { + "Score": 0.5759, + "Factuality": 0.7074, + "Precise IF": 0.375, + "Math": 0.623, + "Safety": 0.7578, + "Focus": 0.5333, + "Ties": 0.459 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271", + "name": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271", + "developer": "allenai", + "scores": { + "Score": 0.6057, + "Factuality": 0.5053, + "Precise IF": 0.375, + "Math": 0.5902, + "Safety": 0.8422, + "Focus": 0.7798, + "Ties": 0.5419 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181", + "name": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181", + "developer": "allenai", + "scores": { + "Score": 0.6535, + "Factuality": 0.7137, + "Precise IF": 0.3812, + "Math": 0.6175, + "Safety": 0.8244, + "Focus": 0.7737, + "Ties": 0.6101 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221", + "name": "allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221", + "developer": "allenai", + "scores": { + "Score": 0.5799, + "Factuality": 0.7116, + "Precise IF": 0.3812, + "Math": 0.6284, + "Safety": 0.76, + "Focus": 0.5374, + "Ties": 0.461 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262", + "name": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262", + "developer": "allenai", + "scores": { + "Score": 0.5903, + "Factuality": 0.4863, + "Precise IF": 0.3625, + "Math": 0.5738, + "Safety": 0.8489, + "Focus": 0.7778, + "Ties": 0.4926 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523", + "name": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523", + "developer": "allenai", + "scores": { + "Score": 0.6483, + "Factuality": 0.7074, + "Precise IF": 0.3625, + "Math": 0.6175, + "Safety": 0.8222, + "Focus": 0.7758, + "Ties": 0.6044 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750", + "name": "allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750", + "developer": "allenai", + "scores": { + "Score": 0.5157, + "Factuality": 0.6084, + "Precise IF": 0.3688, + "Math": 0.6066, + "Safety": 0.7089, + "Focus": 0.4222, + "Ties": 0.3791 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427", + "name": "allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427", + "developer": "allenai", + "scores": { + "Score": 0.6009, + "Factuality": 0.7263, + "Precise IF": 0.375, + "Math": 0.5902, + "Safety": 0.7933, + "Focus": 0.7273, + "Ties": 0.3931 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446", + "name": "allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446", + "developer": "allenai", + "scores": { + "Score": 0.5716, + "Factuality": 0.6779, + "Precise IF": 0.3937, + "Math": 0.5464, + "Safety": 0.7533, + "Focus": 0.7051, + "Ties": 0.3534 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094", + "name": "allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094", + "developer": "allenai", + "scores": { + "Score": 0.5151, + "Factuality": 0.6484, + "Precise IF": 0.3312, + "Math": 0.5574, + "Safety": 0.7289, + "Focus": 0.4889, + "Ties": 0.3357 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636", + "name": "allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636", + "developer": "allenai", + "scores": { + "Score": 0.6119, + "Factuality": 0.72, + "Precise IF": 0.4062, + "Math": 0.6284, + "Safety": 0.8067, + "Focus": 0.6889, + "Ties": 0.421 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325", + "name": "allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325", + "developer": "allenai", + "scores": { + "Score": 0.6008, + "Factuality": 0.7179, + "Precise IF": 0.35, + "Math": 0.5956, + "Safety": 0.8, + "Focus": 0.6707, + "Ties": 0.4707 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238", + "name": "allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238", + "developer": "allenai", + "scores": { + "Score": 0.5965, + "Factuality": 0.7095, + "Precise IF": 0.3438, + "Math": 0.612, + "Safety": 0.8044, + "Focus": 0.6566, + "Ties": 0.453 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906", + "name": "allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906", + "developer": "allenai", + "scores": { + "Score": 0.5574, + "Factuality": 0.6526, + "Precise IF": 0.3937, + "Math": 0.6011, + "Safety": 0.7711, + "Focus": 0.5051, + "Ties": 0.4208 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529", + "name": "allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529", + "developer": "allenai", + "scores": { + "Score": 0.0719, + "Factuality": 0.0421, + "Precise IF": 0.2062, + "Math": 0.0601, + "Safety": 0.0378, + "Focus": 0.0949, + "Ties": -0.01 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305", + "name": "allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305", + "developer": "allenai", + "scores": { + "Score": 0.553, + "Factuality": 0.6674, + "Precise IF": 0.3563, + "Math": 0.6284, + "Safety": 0.6733, + "Focus": 0.5697, + "Ties": 0.4227 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778", + "name": "allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778", + "developer": "allenai", + "scores": { + "Score": 0.4955, + "Factuality": 0.6189, + "Precise IF": 0.325, + "Math": 0.5792, + "Safety": 0.6378, + "Focus": 0.5657, + "Ties": 0.2466 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459", + "name": "allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459", + "developer": "allenai", + "scores": { + "Score": 0.4198, + "Factuality": 0.5747, + "Precise IF": 0.3375, + "Math": 0.5464, + "Safety": 0.4933, + "Focus": 0.3596, + "Ties": 0.2073 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747", + "name": "allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747", + "developer": "allenai", + "scores": { + "Score": 0.5465, + "Factuality": 0.6821, + "Precise IF": 0.375, + "Math": 0.612, + "Safety": 0.7333, + "Focus": 0.5051, + "Ties": 0.3713 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935", + "name": "allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935", + "developer": "allenai", + "scores": { + "Score": 0.5197, + "Factuality": 0.6126, + "Precise IF": 0.3375, + "Math": 0.5847, + "Safety": 0.7333, + "Focus": 0.4646, + "Ties": 0.3855 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360", + "name": "allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360", + "developer": "allenai", + "scores": { + "Score": 0.4555, + "Factuality": 0.5495, + "Precise IF": 0.3063, + "Math": 0.4262, + "Safety": 0.5711, + "Focus": 0.6101, + "Ties": 0.2696 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366", + "name": "allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366", + "developer": "allenai", + "scores": { + "Score": 0.4422, + "Factuality": 0.5053, + "Precise IF": 0.3375, + "Math": 0.4044, + "Safety": 0.5422, + "Focus": 0.6646, + "Ties": 0.1991 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352", + "name": "allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352", + "developer": "allenai", + "scores": { + "Score": 0.341, + "Factuality": 0.4674, + "Precise IF": 0.2875, + "Math": 0.3333, + "Safety": 0.3711, + "Focus": 0.3919, + "Ties": 0.195 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634", + "name": "allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634", + "developer": "allenai", + "scores": { + "Score": 0.4698, + "Factuality": 0.5853, + "Precise IF": 0.2562, + "Math": 0.5027, + "Safety": 0.6489, + "Focus": 0.5697, + "Ties": 0.2562 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988", + "name": "allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988", + "developer": "allenai", + "scores": { + "Score": 0.4791, + "Factuality": 0.6421, + "Precise IF": 0.3125, + "Math": 0.541, + "Safety": 0.6911, + "Focus": 0.4182, + "Ties": 0.27 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103", + "name": "allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103", + "developer": "allenai", + "scores": { + "Score": 0.0607, + "Factuality": 0.0274, + "Precise IF": 0.1625, + "Math": 0.0656, + "Safety": 0.04, + "Focus": 0.0788, + "Ties": -0.01 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835", + "name": "allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835", + "developer": "allenai", + "scores": { + "Score": 0.6089, + "Factuality": 0.7284, + "Precise IF": 0.4375, + "Math": 0.612, + "Safety": 0.7622, + "Focus": 0.6444, + "Ties": 0.4686 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221", + "name": "allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221", + "developer": "allenai", + "scores": { + "Score": 0.6032, + "Factuality": 0.7158, + "Precise IF": 0.4062, + "Math": 0.6284, + "Safety": 0.7778, + "Focus": 0.5859, + "Ties": 0.5051 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826", + "name": "allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826", + "developer": "allenai", + "scores": { + "Score": 0.5831, + "Factuality": 0.6947, + "Precise IF": 0.4188, + "Math": 0.623, + "Safety": 0.74, + "Focus": 0.5758, + "Ties": 0.4465 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363", + "name": "allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363", + "developer": "allenai", + "scores": { + "Score": 0.5268, + "Factuality": 0.68, + "Precise IF": 0.3688, + "Math": 0.5792, + "Safety": 0.7178, + "Focus": 0.4343, + "Ties": 0.3809 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498", + "name": "allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498", + "developer": "allenai", + "scores": { + "Score": 0.6093, + "Factuality": 0.7326, + "Precise IF": 0.4313, + "Math": 0.6339, + "Safety": 0.7578, + "Focus": 0.5859, + "Ties": 0.5143 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1__2__1743897475", + "name": "allenai/open_instruct_dev-rm_3e-6_1__2__1743897475", + "developer": "allenai", + "scores": { + "Score": 0.6122, + "Factuality": 0.7368, + "Precise IF": 0.4, + "Math": 0.623, + "Safety": 0.8044, + "Focus": 0.602, + "Ties": 0.5071 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1__3__1744311421", + "name": "allenai/open_instruct_dev-rm_3e-6_1__3__1744311421", + "developer": "allenai", + "scores": { + "Score": 0.5995, + "Factuality": 0.7179, + "Precise IF": 0.3375, + "Math": 0.6066, + "Safety": 0.8, + "Focus": 0.6323, + "Ties": 0.503 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903", + "name": "allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903", + "developer": "allenai", + "scores": { + "Score": 0.6154, + "Factuality": 0.7326, + "Precise IF": 0.4375, + "Math": 0.6339, + "Safety": 0.7778, + "Focus": 0.6061, + "Ties": 0.5043 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368", + "name": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368", + "developer": "allenai", + "scores": { + "Score": 0.6604, + "Factuality": 0.6316, + "Precise IF": 0.3937, + "Math": 0.5792, + "Safety": 0.9044, + "Focus": 0.8929, + "Ties": 0.5604 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182", + "name": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182", + "developer": "allenai", + "scores": { + "Score": 0.6783, + "Factuality": 0.7705, + "Precise IF": 0.4, + "Math": 0.6066, + "Safety": 0.84, + "Focus": 0.8101, + "Ties": 0.6427 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012", + "name": "allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012", + "developer": "allenai", + "scores": { + "Score": 0.5911, + "Factuality": 0.7347, + "Precise IF": 0.4, + "Math": 0.6284, + "Safety": 0.74, + "Focus": 0.604, + "Ties": 0.4392 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765", + "name": "allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765", + "developer": "allenai", + "scores": { + "Score": 0.5926, + "Factuality": 0.7263, + "Precise IF": 0.3563, + "Math": 0.623, + "Safety": 0.7889, + "Focus": 0.5879, + "Ties": 0.4733 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527", + "name": "allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527", + "developer": "allenai", + "scores": { + "Score": 0.6126, + "Factuality": 0.7411, + "Precise IF": 0.425, + "Math": 0.623, + "Safety": 0.7822, + "Focus": 0.5939, + "Ties": 0.5104 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236", + "name": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236", + "developer": "allenai", + "scores": { + "Score": 0.6525, + "Factuality": 0.6021, + "Precise IF": 0.3875, + "Math": 0.5792, + "Safety": 0.8933, + "Focus": 0.8626, + "Ties": 0.59 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530", + "name": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530", + "developer": "allenai", + "scores": { + "Score": 0.6849, + "Factuality": 0.7453, + "Precise IF": 0.3812, + "Math": 0.612, + "Safety": 0.8422, + "Focus": 0.8404, + "Ties": 0.6885 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417", + "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417", + "developer": "allenai", + "scores": { + "Score": 0.586, + "Factuality": 0.6632, + "Precise IF": 0.425, + "Math": 0.6557, + "Safety": 0.7778, + "Focus": 0.5172, + "Ties": 0.477 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486", + "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486", + "developer": "allenai", + "scores": { + "Score": 0.6773, + "Factuality": 0.7432, + "Precise IF": 0.4, + "Math": 0.612, + "Safety": 0.8422, + "Focus": 0.804, + "Ties": 0.6626 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745", + "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745", + "developer": "allenai", + "scores": { + "Score": 0.6793, + "Factuality": 0.7558, + "Precise IF": 0.4062, + "Math": 0.6284, + "Safety": 0.8311, + "Focus": 0.8061, + "Ties": 0.6485 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661", + "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661", + "developer": "allenai", + "scores": { + "Score": 0.6611, + "Factuality": 0.72, + "Precise IF": 0.3563, + "Math": 0.6393, + "Safety": 0.8444, + "Focus": 0.7636, + "Ties": 0.6428 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472", + "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472", + "developer": "allenai", + "scores": { + "Score": 0.5778, + "Factuality": 0.6674, + "Precise IF": 0.3875, + "Math": 0.6011, + "Safety": 0.7933, + "Focus": 0.5172, + "Ties": 0.5003 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267", + "name": "allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267", + "developer": "allenai", + "scores": { + "Score": 0.5746, + "Factuality": 0.6505, + "Precise IF": 0.35, + "Math": 0.5082, + "Safety": 0.7844, + "Focus": 0.7414, + "Ties": 0.4128 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759", + "name": "allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759", + "developer": "allenai", + "scores": { + "Score": 0.6065, + "Factuality": 0.7116, + "Precise IF": 0.35, + "Math": 0.5792, + "Safety": 0.8178, + "Focus": 0.7152, + "Ties": 0.465 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905", + "name": "allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905", + "developer": "allenai", + "scores": { + "Score": 0.5305, + "Factuality": 0.5832, + "Precise IF": 0.3312, + "Math": 0.459, + "Safety": 0.7178, + "Focus": 0.7071, + "Ties": 0.3849 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363", + "name": "allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363", + "developer": "allenai", + "scores": { + "Score": 0.4436, + "Factuality": 0.5411, + "Precise IF": 0.3312, + "Math": 0.3115, + "Safety": 0.6267, + "Focus": 0.5414, + "Ties": 0.31 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505", + "name": "allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505", + "developer": "allenai", + "scores": { + "Score": 0.5925, + "Factuality": 0.68, + "Precise IF": 0.3688, + "Math": 0.5519, + "Safety": 0.78, + "Focus": 0.7434, + "Ties": 0.431 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180", + "name": "allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180", + "developer": "allenai", + "scores": { + "Score": 0.6198, + "Factuality": 0.7263, + "Precise IF": 0.3312, + "Math": 0.6339, + "Safety": 0.8133, + "Focus": 0.7232, + "Ties": 0.4908 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187", + "name": "allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187", + "developer": "allenai", + "scores": { + "Score": 0.6763, + "Factuality": 0.7411, + "Precise IF": 0.375, + "Math": 0.612, + "Safety": 0.8844, + "Focus": 0.8545, + "Ties": 0.5908 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509", + "name": "allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509", + "developer": "allenai", + "scores": { + "Score": 0.6245, + "Factuality": 0.7242, + "Precise IF": 0.35, + "Math": 0.6175, + "Safety": 0.8178, + "Focus": 0.7253, + "Ties": 0.5124 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498", + "name": "allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498", + "developer": "allenai", + "scores": { + "Score": 0.6673, + "Factuality": 0.7326, + "Precise IF": 0.3438, + "Math": 0.6175, + "Safety": 0.8622, + "Focus": 0.8566, + "Ties": 0.5911 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926", + "name": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926", + "developer": "allenai", + "scores": { + "Score": 0.5863, + "Factuality": 0.6674, + "Precise IF": 0.3937, + "Math": 0.6284, + "Safety": 0.8, + "Focus": 0.5515, + "Ties": 0.4768 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661", + "name": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661", + "developer": "allenai", + "scores": { + "Score": 0.589, + "Factuality": 0.6842, + "Precise IF": 0.3688, + "Math": 0.6393, + "Safety": 0.7867, + "Focus": 0.6081, + "Ties": 0.447 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598", + "name": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598", + "developer": "allenai", + "scores": { + "Score": 0.7306, + "Factuality": 0.7474, + "Precise IF": 0.375, + "Math": 0.694, + "Safety": 0.8622, + "Focus": 0.8061, + "Ties": 0.8992 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923", + "name": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923", + "developer": "allenai", + "scores": { + "Score": 0.7573, + "Factuality": 0.8168, + "Precise IF": 0.4125, + "Math": 0.7049, + "Safety": 0.8733, + "Focus": 0.8545, + "Ties": 0.8814 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628", + "name": "allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628", + "developer": "allenai", + "scores": { + "Score": 0.6637, + "Factuality": 0.6947, + "Precise IF": 0.4062, + "Math": 0.6284, + "Safety": 0.8422, + "Focus": 0.7273, + "Ties": 0.6834 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999", + "name": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999", + "developer": "allenai", + "scores": { + "Score": 0.6665, + "Factuality": 0.5979, + "Precise IF": 0.3688, + "Math": 0.6339, + "Safety": 0.8956, + "Focus": 0.8606, + "Ties": 0.6422 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777", + "name": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777", + "developer": "allenai", + "scores": { + "Score": 0.7038, + "Factuality": 0.6947, + "Precise IF": 0.3937, + "Math": 0.6557, + "Safety": 0.8867, + "Focus": 0.8586, + "Ties": 0.7331 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638", + "name": "allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638", + "developer": "allenai", + "scores": { + "Score": 0.6754, + "Factuality": 0.6716, + "Precise IF": 0.4, + "Math": 0.6339, + "Safety": 0.8756, + "Focus": 0.7737, + "Ties": 0.6976 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938", + "name": "allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938", + "developer": "allenai", + "scores": { + "Score": 0.7241, + "Factuality": 0.7305, + "Precise IF": 0.4, + "Math": 0.6667, + "Safety": 0.9422, + "Focus": 0.9414, + "Ties": 0.6635 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885", + "name": "allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885", + "developer": "allenai", + "scores": { + "Score": 0.6716, + "Factuality": 0.6632, + "Precise IF": 0.3688, + "Math": 0.6284, + "Safety": 0.82, + "Focus": 0.8303, + "Ties": 0.719 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773", + "name": "allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773", + "developer": "allenai", + "scores": { + "Score": 0.6207, + "Factuality": 0.6358, + "Precise IF": 0.375, + "Math": 0.5902, + "Safety": 0.8267, + "Focus": 0.802, + "Ties": 0.4948 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867", + "name": "allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867", + "developer": "allenai", + "scores": { + "Score": 0.719, + "Factuality": 0.7263, + "Precise IF": 0.3875, + "Math": 0.6393, + "Safety": 0.8956, + "Focus": 0.9273, + "Ties": 0.738 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424", + "developer": "allenai", + "scores": { + "Score": 0.6572, + "Factuality": 0.7305, + "Precise IF": 0.3688, + "Math": 0.6284, + "Safety": 0.8289, + "Focus": 0.703, + "Ties": 0.6837 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395", + "developer": "allenai", + "scores": { + "Score": 0.6938, + "Factuality": 0.7537, + "Precise IF": 0.45, + "Math": 0.6393, + "Safety": 0.8667, + "Focus": 0.7616, + "Ties": 0.6913 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491", + "developer": "allenai", + "scores": { + "Score": 0.6754, + "Factuality": 0.7242, + "Precise IF": 0.4062, + "Math": 0.6284, + "Safety": 0.8422, + "Focus": 0.7535, + "Ties": 0.6976 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787", + "developer": "allenai", + "scores": { + "Score": 0.7045, + "Factuality": 0.6253, + "Precise IF": 0.3812, + "Math": 0.6667, + "Safety": 0.92, + "Focus": 0.9232, + "Ties": 0.7109 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461", + "developer": "allenai", + "scores": { + "Score": 0.7189, + "Factuality": 0.7305, + "Precise IF": 0.3937, + "Math": 0.6066, + "Safety": 0.8978, + "Focus": 0.9374, + "Ties": 0.7475 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780", + "developer": "allenai", + "scores": { + "Score": 0.7172, + "Factuality": 0.7242, + "Precise IF": 0.4313, + "Math": 0.6175, + "Safety": 0.8778, + "Focus": 0.897, + "Ties": 0.7555 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489", + "developer": "allenai", + "scores": { + "Score": 0.6813, + "Factuality": 0.7137, + "Precise IF": 0.4437, + "Math": 0.6284, + "Safety": 0.8644, + "Focus": 0.7596, + "Ties": 0.6781 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713", + "developer": "allenai", + "scores": { + "Score": 0.7209, + "Factuality": 0.7116, + "Precise IF": 0.3875, + "Math": 0.6612, + "Safety": 0.9067, + "Focus": 0.9172, + "Ties": 0.7414 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911", + "name": "allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911", + "developer": "allenai", + "scores": { + "Score": 0.7266, + "Factuality": 0.7347, + "Precise IF": 0.4313, + "Math": 0.6339, + "Safety": 0.8933, + "Focus": 0.897, + "Ties": 0.7697 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412", + "name": "allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412", + "developer": "allenai", + "scores": { + "Score": 0.5342, + "Factuality": 0.6042, + "Precise IF": 0.275, + "Math": 0.6284, + "Safety": 0.7222, + "Focus": 0.5818, + "Ties": 0.3935 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922", + "name": "allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922", + "developer": "allenai", + "scores": { + "Score": 0.6111, + "Factuality": 0.6884, + "Precise IF": 0.3063, + "Math": 0.623, + "Safety": 0.8289, + "Focus": 0.7576, + "Ties": 0.4628 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495", + "name": "allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495", + "developer": "allenai", + "scores": { + "Score": 0.5825, + "Factuality": 0.6379, + "Precise IF": 0.325, + "Math": 0.5355, + "Safety": 0.8222, + "Focus": 0.7051, + "Ties": 0.4691 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507", + "name": "allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507", + "developer": "allenai", + "scores": { + "Score": 0.5598, + "Factuality": 0.5495, + "Precise IF": 0.3563, + "Math": 0.5902, + "Safety": 0.76, + "Focus": 0.7273, + "Ties": 0.3754 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507", + "name": "allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507", + "developer": "allenai", + "scores": { + "Score": 0.6101, + "Factuality": 0.6632, + "Precise IF": 0.35, + "Math": 0.6175, + "Safety": 0.7778, + "Focus": 0.7111, + "Ties": 0.5408 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917", + "name": "allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917", + "developer": "allenai", + "scores": { + "Score": 0.7185, + "Factuality": 0.7305, + "Precise IF": 0.4125, + "Math": 0.7158, + "Safety": 0.7933, + "Focus": 0.8545, + "Ties": 0.804 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961", + "name": "allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961", + "developer": "allenai", + "scores": { + "Score": 0.7325, + "Factuality": 0.7474, + "Precise IF": 0.4437, + "Math": 0.7158, + "Safety": 0.7978, + "Focus": 0.8141, + "Ties": 0.8763 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830", + "name": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830", + "developer": "allenai", + "scores": { + "Score": 0.6022, + "Factuality": 0.5284, + "Precise IF": 0.325, + "Math": 0.694, + "Safety": 0.7556, + "Focus": 0.7616, + "Ties": 0.5486 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024", + "name": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024", + "developer": "allenai", + "scores": { + "Score": 0.5948, + "Factuality": 0.5579, + "Precise IF": 0.2875, + "Math": 0.6776, + "Safety": 0.72, + "Focus": 0.7394, + "Ties": 0.5863 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914", + "name": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914", + "developer": "allenai", + "scores": { + "Score": 0.6492, + "Factuality": 0.6084, + "Precise IF": 0.35, + "Math": 0.6776, + "Safety": 0.76, + "Focus": 0.8, + "Ties": 0.699 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091", + "name": "allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091", + "developer": "allenai", + "scores": { + "Score": 0.6764, + "Factuality": 0.7074, + "Precise IF": 0.3, + "Math": 0.6885, + "Safety": 0.8622, + "Focus": 0.802, + "Ties": 0.6984 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829", + "name": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829", + "developer": "allenai", + "scores": { + "Score": 0.6408, + "Factuality": 0.6337, + "Precise IF": 0.3063, + "Math": 0.6831, + "Safety": 0.8467, + "Focus": 0.8222, + "Ties": 0.5529 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050", + "name": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050", + "developer": "allenai", + "scores": { + "Score": 0.6452, + "Factuality": 0.6063, + "Precise IF": 0.3187, + "Math": 0.7158, + "Safety": 0.8356, + "Focus": 0.8343, + "Ties": 0.5603 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916", + "name": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916", + "developer": "allenai", + "scores": { + "Score": 0.7013, + "Factuality": 0.7263, + "Precise IF": 0.3438, + "Math": 0.6995, + "Safety": 0.8222, + "Focus": 0.8444, + "Ties": 0.7714 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576", + "name": "allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576", + "developer": "allenai", + "scores": { + "Score": 0.6369, + "Factuality": 0.6905, + "Precise IF": 0.3187, + "Math": 0.6448, + "Safety": 0.7844, + "Focus": 0.7596, + "Ties": 0.6236 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619", + "name": "allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619", + "developer": "allenai", + "scores": { + "Score": 0.6221, + "Factuality": 0.6674, + "Precise IF": 0.325, + "Math": 0.612, + "Safety": 0.7978, + "Focus": 0.7455, + "Ties": 0.5852 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583", + "name": "allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583", + "developer": "allenai", + "scores": { + "Score": 0.5735, + "Factuality": 0.5895, + "Precise IF": 0.2625, + "Math": 0.6448, + "Safety": 0.6889, + "Focus": 0.6727, + "Ties": 0.5823 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604", + "name": "allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604", + "developer": "allenai", + "scores": { + "Score": 0.6336, + "Factuality": 0.6337, + "Precise IF": 0.3063, + "Math": 0.6885, + "Safety": 0.7244, + "Focus": 0.802, + "Ties": 0.6465 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738", + "name": "allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738", + "developer": "allenai", + "scores": { + "Score": 0.6824, + "Factuality": 0.6989, + "Precise IF": 0.3625, + "Math": 0.6831, + "Safety": 0.8311, + "Focus": 0.8081, + "Ties": 0.7107 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191", + "name": "allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191", + "developer": "allenai", + "scores": { + "Score": 0.6392, + "Factuality": 0.6589, + "Precise IF": 0.3312, + "Math": 0.6995, + "Safety": 0.7933, + "Focus": 0.7717, + "Ties": 0.5804 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737", + "name": "allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737", + "developer": "allenai", + "scores": { + "Score": 0.664, + "Factuality": 0.6821, + "Precise IF": 0.3312, + "Math": 0.6448, + "Safety": 0.8133, + "Focus": 0.8061, + "Ties": 0.7066 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138", + "name": "allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138", + "developer": "allenai", + "scores": { + "Score": 0.6678, + "Factuality": 0.6505, + "Precise IF": 0.3312, + "Math": 0.6831, + "Safety": 0.7978, + "Focus": 0.8808, + "Ties": 0.6632 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455", + "name": "allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455", + "developer": "allenai", + "scores": { + "Score": 0.6618, + "Factuality": 0.7958, + "Precise IF": 0.325, + "Math": 0.6557, + "Safety": 0.8311, + "Focus": 0.6323, + "Ties": 0.7311 + } + }, + { + "model_id": "allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964", + "name": "allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964", + "developer": "allenai", + "scores": { + "Score": 0.6605, + "Factuality": 0.7789, + "Precise IF": 0.3688, + "Math": 0.6448, + "Safety": 0.8844, + "Focus": 0.6667, + "Ties": 0.6195 + } + }, + { + "model_id": "allenai/tulu-2-dpo-13b", + "name": "allenai/tulu-2-dpo-13b", + "developer": "allenai", + "scores": { + "Score": 0.7368, + "Chat": 0.9581, + "Chat Hard": 0.5833, + "Safety": 0.7946, + "Reasoning": 0.7323, + "Prior Sets (0.5 weight)": 0.4947 + } + }, + { + "model_id": "allenai/tulu-2-dpo-70b", + "name": "allenai/tulu-2-dpo-70b", + "developer": "allenai", + "scores": { + "Score": 0.7621, + "Chat": 0.9749, + "Chat Hard": 0.6053, + "Safety": 0.8446, + "Reasoning": 0.7407, + "Prior Sets (0.5 weight)": 0.5278 + } + }, + { + "model_id": "allenai/tulu-2-dpo-7b", + "name": "allenai/tulu-2-dpo-7b", + "developer": "allenai", + "scores": { + "Score": 0.7212, + "Chat": 0.9749, + "Chat Hard": 0.5614, + "Safety": 0.7527, + "Reasoning": 0.7176, + "Prior Sets (0.5 weight)": 0.4774 + } + }, + { + "model_id": "allenai/tulu-v2.5-13b-preference-mix-rm", + "name": "allenai/tulu-v2.5-13b-preference-mix-rm", + "developer": "allenai", + "scores": { + "Score": 0.8027, + "Chat": 0.9358, + "Chat Hard": 0.682, + "Safety": 0.773, + "Reasoning": 0.885, + "Prior Sets (0.5 weight)": 0.6724 + } + }, + { + "model_id": "allenai/tulu-v2.5-13b-uf-rm", + "name": "allenai/tulu-v2.5-13b-uf-rm", + "developer": "allenai", + "scores": { + "Score": 0.4806, + "Chat": 0.3939, + "Chat Hard": 0.4232, + "Safety": 0.5554, + "Reasoning": 0.4737, + "Prior Sets (0.5 weight)": 0.6326 + } + }, + { + "model_id": "allenai/tulu-v2.5-70b-preference-mix-rm", + "name": "allenai/tulu-v2.5-70b-preference-mix-rm", + "developer": "allenai", + "scores": { + "Score": 0.6516, + "Chat": 0.7737, + "Chat Hard": 0.5921, + "Safety": 0.8486, + "Reasoning": 0.4138, + "Prior Sets (0.5 weight)": 0.6079 + } + }, + { + "model_id": "allenai/tulu-v2.5-70b-uf-rm", + "name": "allenai/tulu-v2.5-70b-uf-rm", + "developer": "allenai", + "scores": { + "Score": 0.7398, + "Chat": 0.8659, + "Chat Hard": 0.7171, + "Safety": 0.7014, + "Reasoning": 0.757, + "Prior Sets (0.5 weight)": 0.5757 + } + }, + { + "model_id": "anthropic/claude-3-5-sonnet-20240620", + "name": "Claude 3.5 Sonnet 20240620", + "developer": "anthropic", + "scores": { + "Score": 0.6466, + "Factuality": 0.5284, + "Precise IF": 0.3875, + "Math": 0.5683, + "Safety": 0.8519, + "Focus": 0.8697, + "Ties": 0.674 + } + }, + { + "model_id": "anthropic/claude-3-7-sonnet-20250219", + "name": "claude-3-7-sonnet-20250219", + "developer": "anthropic", + "scores": { + "Score": 0.7539, + "Factuality": 0.7326, + "Precise IF": 0.5437, + "Math": 0.75, + "Safety": 0.9033, + "Focus": 0.9212, + "Ties": 0.6723 + } + }, + { + "model_id": "anthropic/claude-3-haiku-20240307", + "name": "Claude 3 Haiku 20240307", + "developer": "anthropic", + "scores": { + "Score": 0.3711, + "Factuality": 0.4042, + "Precise IF": 0.2812, + "Math": 0.3552, + "Safety": 0.595, + "Focus": 0.501, + "Ties": 0.0899 + } + }, + { + "model_id": "anthropic/claude-3-opus-20240229", + "name": "Claude 3 Opus 20240229", + "developer": "anthropic", + "scores": { + "Score": 0.5744, + "Factuality": 0.5389, + "Precise IF": 0.3312, + "Math": 0.5137, + "Safety": 0.8378, + "Focus": 0.6646, + "Ties": 0.5601 + } + }, + { + "model_id": "anthropic/claude-opus-4-20250514", + "name": "Claude 4 Opus 20250514", + "developer": "anthropic", + "scores": { + "Score": 0.7648, + "Factuality": 0.8267, + "Precise IF": 0.4188, + "Math": 0.7491, + "Safety": 0.8954, + "Focus": 0.8616, + "Ties": 0.8375 + } + }, + { + "model_id": "anthropic/claude-sonnet-4-20250514", + "name": "claude-sonnet-4-20250514", + "developer": "anthropic", + "scores": { + "Score": 0.7117, + "Factuality": 0.7612, + "Precise IF": 0.3594, + "Math": 0.7049, + "Safety": 0.8909, + "Focus": 0.7596, + "Ties": 0.7939 + } + }, + { + "model_id": "berkeley-nest/Starling-RM-7B-alpha", + "name": "berkeley-nest/Starling-RM-7B-alpha", + "developer": "berkeley-nest", + "scores": { + "Score": 0.7113, + "Chat": 0.9804, + "Chat Hard": 0.4561, + "Safety": 0.8446, + "Reasoning": 0.58, + "Prior Sets (0.5 weight)": 0.6794 + } + }, + { + "model_id": "facebook/Self-taught-Llama-3-70B", + "name": "facebook/Self-taught-Llama-3-70B", + "developer": "facebook", + "scores": { + "Score": 0.8863, + "Chat": 0.9693, + "Chat Hard": 0.8399, + "Safety": 0.9108, + "Reasoning": 0.8251 + } + }, + { + "model_id": "facebook/Self-taught-evaluator-llama3.1-70B", + "name": "facebook/Self-taught-evaluator-llama3.1-70B", + "developer": "facebook", + "scores": { + "Score": 0.9001, + "Chat": 0.9693, + "Chat Hard": 0.8509, + "Safety": 0.8959, + "Reasoning": 0.8844 + } + }, + { + "model_id": "gemini-1.5-flash-8b", + "name": "gemini-1.5-flash-8b", + "developer": "unknown", + "scores": { + "Score": 0.7601, + "Chat": 0.9441, + "Chat Hard": 0.5987, + "Safety": 0.7399, + "Reasoning": 0.7575 + } + }, + { + "model_id": "general-preference/GPM-Gemma-2B", + "name": "general-preference/GPM-Gemma-2B", + "developer": "general-preference", + "scores": { + "Score": 0.7449, + "Chat": 0.7151, + "Chat Hard": 0.6974, + "Safety": 0.8122, + "Reasoning": 0.755 + } + }, + { + "model_id": "general-preference/GPM-Llama-3.1-8B", + "name": "general-preference/GPM-Llama-3.1-8B", + "developer": "general-preference", + "scores": { + "Score": 0.9224, + "Chat": 0.933, + "Chat Hard": 0.886, + "Safety": 0.9108, + "Reasoning": 0.9597 + } + }, + { + "model_id": "google/flame-1.0-24B-july-2024", + "name": "google/flame-1.0-24B-july-2024", + "developer": "google", + "scores": { + "Score": 0.8781, + "Chat": 0.9218, + "Chat Hard": 0.7566, + "Safety": 0.8959, + "Reasoning": 0.938 + } + }, + { + "model_id": "google/gemini-1.5-flash-001", + "name": "Gemini 1.5 Flash 001", + "developer": "google", + "scores": { + "Score": 0.8054, + "Chat": 0.9218, + "Chat Hard": 0.6349, + "Safety": 0.8696, + "Reasoning": 0.8512, + "Prior Sets (0.5 weight)": 0.6937 + } + }, + { + "model_id": "google/gemini-1.5-flash-8b", + "name": "google/gemini-1.5-flash-8b", + "developer": "google", + "scores": { + "Score": 0.4851, + "Factuality": 0.4611, + "Precise IF": 0.3625, + "Math": 0.5082, + "Safety": 0.6622, + "Focus": 0.6747, + "Ties": 0.2421 + } + }, + { + "model_id": "google/gemini-1.5-pro-0514", + "name": "google/gemini-1.5-pro-0514", + "developer": "google", + "scores": { + "Score": 0.882, + "Chat": 0.9232, + "Chat Hard": 0.8059, + "Safety": 0.8791, + "Reasoning": 0.9199 + } + }, + { + "model_id": "google/gemini-1.5-pro-0924", + "name": "google/gemini-1.5-pro-0924", + "developer": "google", + "scores": { + "Score": 0.8678, + "Chat": 0.9413, + "Chat Hard": 0.7697, + "Safety": 0.8581, + "Reasoning": 0.9022 + } + }, + { + "model_id": "google/gemini-2.5-flash", + "name": "gemini-2.5-flash", + "developer": "google", + "scores": { + "Score": 0.7767, + "Factuality": 0.674, + "Precise IF": 0.575, + "Math": 0.852, + "Safety": 0.909, + "Focus": 0.841, + "Ties": 0.809 + } + }, + { + "model_id": "google/gemini-2.5-flash-preview-04-17", + "name": "Gemini 2.5 Flash 04-17 preview", + "developer": "google", + "scores": { + "Score": 0.7721, + "Factuality": 0.6574, + "Precise IF": 0.5531, + "Math": 0.8115, + "Safety": 0.9094, + "Focus": 0.8672, + "Ties": 0.8341 + } + }, + { + "model_id": "google/gemini-2.5-pro", + "name": "gemini-2.5-pro", + "developer": "google", + "scores": { + "Score": 0.7948, + "Factuality": 0.755, + "Precise IF": 0.619, + "Math": 0.898, + "Safety": 0.881, + "Focus": 0.805, + "Ties": 0.811 + } + }, + { + "model_id": "google/gemini-2.5-pro-preview-05-06", + "name": "google/gemini-2.5-pro-preview-05-06", + "developer": "google", + "scores": { + "Score": 0.6775, + "Factuality": 0.6532, + "Precise IF": 0.4688, + "Math": 0.5342, + "Safety": 0.8806, + "Focus": 0.8308, + "Ties": 0.6973 + } + }, + { + "model_id": "google/gemma-2-27b-it", + "name": "Gemma 2 Instruct 27B", + "developer": "google", + "scores": { + "Score": 0.809, + "Chat": 0.9483, + "Chat Hard": 0.591, + "Safety": 0.8635, + "Reasoning": 0.833 + } + }, + { + "model_id": "hendrydong/Mistral-RM-for-RAFT-GSHF-v0", + "name": "hendrydong/Mistral-RM-for-RAFT-GSHF-v0", + "developer": "hendrydong", + "scores": { + "Score": 0.5851, + "Factuality": 0.5779, + "Precise IF": 0.3625, + "Math": 0.6011, + "Safety": 0.6956, + "Focus": 0.6747, + "Ties": 0.5988 + } + }, + { + "model_id": "infly/INF-ORM-Llama3.1-70B", + "name": "infly/INF-ORM-Llama3.1-70B", + "developer": "infly", + "scores": { + "Score": 0.7648, + "Factuality": 0.7411, + "Precise IF": 0.4188, + "Math": 0.6995, + "Safety": 0.9644, + "Focus": 0.903, + "Ties": 0.8622 + } + }, + { + "model_id": "internlm/internlm2-1_8b-reward", + "name": "internlm/internlm2-1_8b-reward", + "developer": "internlm", + "scores": { + "Score": 0.8217, + "Chat": 0.9358, + "Chat Hard": 0.6623, + "Safety": 0.8162, + "Reasoning": 0.8724 + } + }, + { + "model_id": "internlm/internlm2-20b-reward", + "name": "internlm/internlm2-20b-reward", + "developer": "internlm", + "scores": { + "Score": 0.9016, + "Chat": 0.9888, + "Chat Hard": 0.7654, + "Safety": 0.8946, + "Reasoning": 0.9576 + } + }, + { + "model_id": "internlm/internlm2-7b-reward", + "name": "internlm/internlm2-7b-reward", + "developer": "internlm", + "scores": { + "Score": 0.5335, + "Factuality": 0.4211, + "Precise IF": 0.4, + "Math": 0.5628, + "Safety": 0.5956, + "Focus": 0.7051, + "Ties": 0.5164 + } + }, + { + "model_id": "jondurbin/bagel-dpo-34b-v0.5", + "name": "jondurbin/bagel-dpo-34b-v0.5", + "developer": "jondurbin", + "scores": { + "Score": 0.7215, + "Chat": 0.9385, + "Chat Hard": 0.5504, + "Safety": 0.6446, + "Reasoning": 0.8889, + "Prior Sets (0.5 weight)": 0.4487 + } + }, + { + "model_id": "llm-blender/PairRM-hf", + "name": "llm-blender/PairRM-hf", + "developer": "llm-blender", + "scores": { + "Score": 0.6087, + "Chat": 0.9022, + "Chat Hard": 0.5219, + "Safety": 0.477, + "Reasoning": 0.4898, + "Prior Sets (0.5 weight)": 0.6961 + } + }, + { + "model_id": "mattshumer/Reflection-70B", + "name": "mattshumer/Reflection-70B", + "developer": "mattshumer", + "scores": { + "Score": 0.8422, + "Chat": 0.9749, + "Chat Hard": 0.7061, + "Safety": 0.8318, + "Reasoning": 0.8562 + } + }, + { + "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", + "name": "Meta-Llama-3-70B-Instruct", + "developer": "meta-llama", + "scores": { + "Score": 0.7627, + "Chat": 0.9763, + "Chat Hard": 0.5888, + "Safety": 0.7297, + "Reasoning": 0.7854, + "Prior Sets (0.5 weight)": 0.7035 + } + }, + { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "name": "Meta-Llama-3-8B-Instruct", + "developer": "meta-llama", + "scores": { + "Score": 0.645, + "Chat": 0.8547, + "Chat Hard": 0.4156, + "Safety": 0.6797, + "Reasoning": 0.6482, + "Prior Sets (0.5 weight)": 0.6082 + } + }, + { + "model_id": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", + "name": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", + "developer": "meta-llama", + "scores": { + "Score": 0.8412, + "Chat": 0.9721, + "Chat Hard": 0.7456, + "Safety": 0.7757, + "Reasoning": 0.8715 + } + }, + { + "model_id": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "name": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "developer": "meta-llama", + "scores": { + "Score": 0.8405, + "Chat": 0.9721, + "Chat Hard": 0.7018, + "Safety": 0.8284, + "Reasoning": 0.8599 + } + }, + { + "model_id": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", + "name": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", + "developer": "meta-llama", + "scores": { + "Score": 0.7808, + "Chat": 0.8757, + "Chat Hard": 0.6689, + "Safety": 0.7507, + "Reasoning": 0.828 + } + }, + { + "model_id": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "name": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "developer": "meta-llama", + "scores": { + "Score": 0.6565, + "Chat": 0.8073, + "Chat Hard": 0.4978, + "Safety": 0.6399, + "Reasoning": 0.6811 + } + }, + { + "model_id": "meta-metrics/MetaMetrics-RM-v1.0", + "name": "meta-metrics/MetaMetrics-RM-v1.0", + "developer": "meta-metrics", + "scores": { + "Score": 0.9342, + "Chat": 0.9832, + "Chat Hard": 0.864, + "Safety": 0.9081, + "Reasoning": 0.9816 + } + }, + { + "model_id": "mightbe/Better-PairRM", + "name": "mightbe/Better-PairRM", + "developer": "mightbe", + "scores": { + "Score": 0.673, + "Chat": 0.9553, + "Chat Hard": 0.3925, + "Safety": 0.8203, + "Reasoning": 0.4983, + "Prior Sets (0.5 weight)": 0.724 + } + }, + { + "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "name": "Mixtral-8x7B-Instruct-v0.1", + "developer": "mistralai", + "scores": { + "Score": 0.7455, + "Chat": 0.9497, + "Chat Hard": 0.6404, + "Safety": 0.7257, + "Reasoning": 0.7872, + "Prior Sets (0.5 weight)": 0.5033 + } + }, + { + "model_id": "my_model/", + "name": "my_model/", + "developer": "my_model", + "scores": { + "Score": 0.5267, + "Chat": 0.4553, + "Chat Hard": 0.5592, + "Safety": 0.4392, + "Reasoning": 0.6532 + } + }, + { + "model_id": "nicolinho/QRM-Gemma-2-27B", + "name": "nicolinho/QRM-Gemma-2-27B", + "developer": "nicolinho", + "scores": { + "Score": 0.7667, + "Factuality": 0.7853, + "Precise IF": 0.3719, + "Math": 0.6995, + "Safety": 0.9578, + "Focus": 0.9535, + "Ties": 0.8321 + } + }, + { + "model_id": "nicolinho/QRM-Llama3-8B", + "name": "nicolinho/QRM-Llama3-8B", + "developer": "nicolinho", + "scores": { + "Score": 0.911, + "Chat": 0.9581, + "Chat Hard": 0.8114, + "Safety": 0.8986, + "Reasoning": 0.9758 + } + }, + { + "model_id": "nicolinho/QRM-Llama3.1-8B", + "name": "nicolinho/QRM-Llama3.1-8B", + "developer": "nicolinho", + "scores": { + "Score": 0.9306, + "Chat": 0.9441, + "Chat Hard": 0.8969, + "Safety": 0.923, + "Reasoning": 0.9583 + } + }, + { + "model_id": "nicolinho/QRM-Llama3.1-8B-v2", + "name": "nicolinho/QRM-Llama3.1-8B-v2", + "developer": "nicolinho", + "scores": { + "Score": 0.7074, + "Factuality": 0.6653, + "Precise IF": 0.4062, + "Math": 0.612, + "Safety": 0.9467, + "Focus": 0.8909, + "Ties": 0.7234 + } + }, + { + "model_id": "nvidia/Llama-3.1-Nemotron-70B-Reward", + "name": "nvidia/Llama-3.1-Nemotron-70B-Reward", + "developer": "nvidia", + "scores": { + "Score": 0.9411, + "Chat": 0.9749, + "Chat Hard": 0.8575, + "Safety": 0.9514, + "Reasoning": 0.9807 + } + }, + { + "model_id": "nvidia/Llama3-70B-SteerLM-RM", + "name": "nvidia/Llama3-70B-SteerLM-RM", + "developer": "nvidia", + "scores": { + "Score": 0.8877, + "Chat": 0.9134, + "Chat Hard": 0.8026, + "Safety": 0.9284, + "Reasoning": 0.9064 + } + }, + { + "model_id": "nvidia/Nemotron-4-340B-Reward", + "name": "nvidia/Nemotron-4-340B-Reward", + "developer": "nvidia", + "scores": { + "Score": 0.92, + "Chat": 0.9581, + "Chat Hard": 0.8706, + "Safety": 0.9149, + "Reasoning": 0.9363 + } + }, + { + "model_id": "openai/gpt-3.5-turbo-0125", + "name": "GPT-3.5 Turbo 0125", + "developer": "openai", + "scores": { + "Score": 0.6534, + "Chat": 0.9218, + "Chat Hard": 0.4452, + "Safety": 0.6547, + "Reasoning": 0.5912, + "Prior Sets (0.5 weight)": 0.6548 + } + }, + { + "model_id": "openai/gpt-4-0125-preview", + "name": "openai/gpt-4-0125-preview", + "developer": "openai", + "scores": { + "Score": 0.8434, + "Chat": 0.9525, + "Chat Hard": 0.7434, + "Safety": 0.8757, + "Reasoning": 0.8692, + "Prior Sets (0.5 weight)": 0.7085 + } + }, + { + "model_id": "openai/gpt-4-turbo-2024-04-09", + "name": "GPT-4 Turbo 2024-04-09", + "developer": "openai", + "scores": { + "Score": 0.8395, + "Chat": 0.9525, + "Chat Hard": 0.7544, + "Safety": 0.8757, + "Reasoning": 0.827, + "Prior Sets (0.5 weight)": 0.7363 + } + }, + { + "model_id": "openai/gpt-4.1-2025-04-14", + "name": "gpt-4.1-2025-04-14", + "developer": "openai", + "scores": { + "Score": 0.7232, + "Factuality": 0.8289, + "Precise IF": 0.3974, + "Math": 0.6521, + "Safety": 0.8726, + "Focus": 0.7338, + "Ties": 0.8542 + } + }, + { + "model_id": "openai/gpt-4.1-mini-2025-04-14", + "name": "GPT-4.1 mini 2025-04-14", + "developer": "openai", + "scores": { + "Score": 0.6573, + "Factuality": 0.6084, + "Precise IF": 0.4125, + "Math": 0.7213, + "Safety": 0.7265, + "Focus": 0.7354, + "Ties": 0.74 + } + }, + { + "model_id": "openai/gpt-4.1-nano-2025-04-14", + "name": "GPT-4.1 nano 2025-04-14", + "developer": "openai", + "scores": { + "Score": 0.4849, + "Factuality": 0.4646, + "Precise IF": 0.2578, + "Math": 0.5041, + "Safety": 0.7156, + "Focus": 0.466, + "Ties": 0.5015 + } + }, + { + "model_id": "openai/gpt-4o-2024-05-13", + "name": "GPT-4o 2024-05-13", + "developer": "openai", + "scores": { + "Score": 0.8327, + "Chat": 0.9665, + "Chat Hard": 0.7039, + "Safety": 0.8649, + "Reasoning": 0.8487, + "Prior Sets (0.5 weight)": 0.7262 + } + }, + { + "model_id": "openai/gpt-4o-2024-08-06", + "name": "GPT-4o 2024-08-06", + "developer": "openai", + "scores": { + "Score": 0.6493, + "Factuality": 0.5684, + "Precise IF": 0.3312, + "Math": 0.623, + "Safety": 0.8619, + "Focus": 0.7293, + "Ties": 0.7819 + } + }, + { + "model_id": "openai/gpt-4o-mini-2024-07-18", + "name": "GPT-4o mini 2024-07-18", + "developer": "openai", + "scores": { + "Score": 0.5796, + "Factuality": 0.4105, + "Precise IF": 0.3438, + "Math": 0.5191, + "Safety": 0.7667, + "Focus": 0.7414, + "Ties": 0.6962 + } + }, + { + "model_id": "openbmb/Eurus-7b-kto", + "name": "openbmb/Eurus-7b-kto", + "developer": "openbmb", + "scores": { + "Score": 0.69, + "Chat": 0.9525, + "Chat Hard": 0.5373, + "Safety": 0.6054, + "Reasoning": 0.7467, + "Prior Sets (0.5 weight)": 0.5261 + } + }, + { + "model_id": "openbmb/Eurus-RM-7b", + "name": "openbmb/Eurus-RM-7b", + "developer": "openbmb", + "scores": { + "Score": 0.8159, + "Chat": 0.9804, + "Chat Hard": 0.6557, + "Safety": 0.8135, + "Reasoning": 0.8633, + "Prior Sets (0.5 weight)": 0.7172 + } + }, + { + "model_id": "openbmb/MiniCPM-2B-dpo-fp32", + "name": "openbmb/MiniCPM-2B-dpo-fp32", + "developer": "openbmb", + "scores": { + "Score": 0.673, + "Chat": 0.8911, + "Chat Hard": 0.4934, + "Safety": 0.573, + "Reasoning": 0.8233, + "Prior Sets (0.5 weight)": 0.4958 + } + }, + { + "model_id": "openbmb/UltraRM-13b", + "name": "openbmb/UltraRM-13b", + "developer": "openbmb", + "scores": { + "Score": 0.4683, + "Factuality": 0.5063, + "Precise IF": 0.3312, + "Math": 0.5519, + "Safety": 0.5089, + "Focus": 0.6081, + "Ties": 0.3036 + } + }, + { + "model_id": "opencompass/CompassJudger-1-1.5B-Instruct", + "name": "opencompass/CompassJudger-1-1.5B-Instruct", + "developer": "opencompass", + "scores": { + "Score": 0.7344, + "Chat": 0.9637, + "Chat Hard": 0.4923, + "Safety": 0.7818, + "Reasoning": 0.6999 + } + }, + { + "model_id": "opencompass/CompassJudger-1-14B-Instruct", + "name": "opencompass/CompassJudger-1-14B-Instruct", + "developer": "opencompass", + "scores": { + "Score": 0.8409, + "Chat": 0.9749, + "Chat Hard": 0.6228, + "Safety": 0.8392, + "Reasoning": 0.9268 + } + }, + { + "model_id": "opencompass/CompassJudger-1-32B-Instruct", + "name": "opencompass/CompassJudger-1-32B-Instruct", + "developer": "opencompass", + "scores": { + "Score": 0.8522, + "Chat": 0.9804, + "Chat Hard": 0.6513, + "Safety": 0.8527, + "Reasoning": 0.9244 + } + }, + { + "model_id": "opencompass/CompassJudger-1-7B-Instruct", + "name": "opencompass/CompassJudger-1-7B-Instruct", + "developer": "opencompass", + "scores": { + "Score": 0.8317, + "Chat": 0.9777, + "Chat Hard": 0.6096, + "Safety": 0.8446, + "Reasoning": 0.8948 + } + }, + { + "model_id": "prometheus-eval/prometheus-7b-v2.0", + "name": "prometheus-eval/prometheus-7b-v2.0", + "developer": "prometheus-eval", + "scores": { + "Score": 0.7204, + "Chat": 0.8547, + "Chat Hard": 0.4912, + "Safety": 0.7709, + "Reasoning": 0.7648 + } + }, + { + "model_id": "prometheus-eval/prometheus-8x7b-v2.0", + "name": "prometheus-eval/prometheus-8x7b-v2.0", + "developer": "prometheus-eval", + "scores": { + "Score": 0.7451, + "Chat": 0.9302, + "Chat Hard": 0.4715, + "Safety": 0.8047, + "Reasoning": 0.774 + } + }, + { + "model_id": "sfairXC/FsfairX-LLaMA3-RM-v0.1", + "name": "sfairXC/FsfairX-LLaMA3-RM-v0.1", + "developer": "sfairXC", + "scores": { + "Score": 0.6292, + "Factuality": 0.5916, + "Precise IF": 0.4188, + "Math": 0.6284, + "Safety": 0.7667, + "Focus": 0.7051, + "Ties": 0.6647 + } + }, + { + "model_id": "stabilityai/stable-code-instruct-3b", + "name": "stabilityai/stable-code-instruct-3b", + "developer": "stabilityai", + "scores": { + "Score": 0.6216, + "Chat": 0.5782, + "Chat Hard": 0.5855, + "Safety": 0.6554, + "Reasoning": 0.7528, + "Prior Sets (0.5 weight)": 0.4506 + } + }, + { + "model_id": "stabilityai/stablelm-2-12b-chat", + "name": "stablelm-2-12b-chat", + "developer": "stabilityai", + "scores": { + "Score": 0.7642, + "Chat": 0.9665, + "Chat Hard": 0.5548, + "Safety": 0.7811, + "Reasoning": 0.8945, + "Prior Sets (0.5 weight)": 0.4839 + } + }, + { + "model_id": "stabilityai/stablelm-2-zephyr-1_6b", + "name": "stablelm-2-zephyr-1_6b", + "developer": "stabilityai", + "scores": { + "Score": 0.6574, + "Chat": 0.9665, + "Chat Hard": 0.4671, + "Safety": 0.6027, + "Reasoning": 0.6784, + "Prior Sets (0.5 weight)": 0.4868 + } + }, + { + "model_id": "stabilityai/stablelm-zephyr-3b", + "name": "stablelm-zephyr-3b", + "developer": "stabilityai", + "scores": { + "Score": 0.7146, + "Chat": 0.8631, + "Chat Hard": 0.6009, + "Safety": 0.7405, + "Reasoning": 0.7573, + "Prior Sets (0.5 weight)": 0.5075 + } + }, + { + "model_id": "stanfordnlp/SteamSHP-flan-t5-large", + "name": "stanfordnlp/SteamSHP-flan-t5-large", + "developer": "stanfordnlp", + "scores": { + "Score": 0.4962, + "Chat": 0.8575, + "Chat Hard": 0.3311, + "Safety": 0.3743, + "Reasoning": 0.3563, + "Prior Sets (0.5 weight)": 0.6273 + } + }, + { + "model_id": "stanfordnlp/SteamSHP-flan-t5-xl", + "name": "stanfordnlp/SteamSHP-flan-t5-xl", + "developer": "stanfordnlp", + "scores": { + "Score": 0.5135, + "Chat": 0.8547, + "Chat Hard": 0.3684, + "Safety": 0.3784, + "Reasoning": 0.3841, + "Prior Sets (0.5 weight)": 0.6498 + } + }, + { + "model_id": "upstage/SOLAR-10.7B-Instruct-v1.0", + "name": "SOLAR-10.7B-Instruct-v1.0", + "developer": "upstage", + "scores": { + "Score": 0.7391, + "Chat": 0.8156, + "Chat Hard": 0.6864, + "Safety": 0.8514, + "Reasoning": 0.7252, + "Prior Sets (0.5 weight)": 0.4949 + } + }, + { + "model_id": "wenbopan/Faro-Yi-9B-DPO", + "name": "wenbopan/Faro-Yi-9B-DPO", + "developer": "wenbopan", + "scores": { + "Score": 0.6461, + "Chat": 0.9218, + "Chat Hard": 0.5307, + "Safety": 0.5514, + "Reasoning": 0.5839, + "Prior Sets (0.5 weight)": 0.6395 + } + }, + { + "model_id": "weqweasdas/RM-Gemma-2B", + "name": "weqweasdas/RM-Gemma-2B", + "developer": "weqweasdas", + "scores": { + "Score": 0.6549, + "Chat": 0.9441, + "Chat Hard": 0.4079, + "Safety": 0.4986, + "Reasoning": 0.7637, + "Prior Sets (0.5 weight)": 0.6652 + } + }, + { + "model_id": "weqweasdas/RM-Gemma-7B", + "name": "weqweasdas/RM-Gemma-7B", + "developer": "weqweasdas", + "scores": { + "Score": 0.4826, + "Factuality": 0.4926, + "Precise IF": 0.3937, + "Math": 0.6066, + "Safety": 0.4822, + "Focus": 0.497, + "Ties": 0.4232 + } + }, + { + "model_id": "weqweasdas/RM-Gemma-7B-4096", + "name": "weqweasdas/RM-Gemma-7B-4096", + "developer": "weqweasdas", + "scores": { + "Score": 0.6922, + "Chat": 0.9497, + "Chat Hard": 0.5022, + "Safety": 0.5608, + "Reasoning": 0.7511, + "Prior Sets (0.5 weight)": 0.7024 + } + }, + { + "model_id": "weqweasdas/RM-Mistral-7B", + "name": "weqweasdas/RM-Mistral-7B", + "developer": "weqweasdas", + "scores": { + "Score": 0.596, + "Factuality": 0.5937, + "Precise IF": 0.3438, + "Math": 0.5956, + "Safety": 0.6911, + "Focus": 0.7293, + "Ties": 0.6226 + } + }, + { + "model_id": "weqweasdas/hh_rlhf_rm_open_llama_3b", + "name": "weqweasdas/hh_rlhf_rm_open_llama_3b", + "developer": "weqweasdas", + "scores": { + "Score": 0.2498, + "Factuality": 0.3642, + "Precise IF": 0.275, + "Math": 0.3497, + "Safety": 0.24, + "Focus": 0.2384, + "Ties": 0.0315 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/swe-bench.json b/data/benchmarks/swe-bench.json new file mode 100644 index 0000000000000000000000000000000000000000..eb8b39879e565a2a2d717362ca5d2e5f29196eee --- /dev/null +++ b/data/benchmarks/swe-bench.json @@ -0,0 +1,28 @@ +{ + "models": [ + { + "model_id": "anthropic/claude-opus-4-5", + "name": "claude-opus-4-5", + "developer": "Anthropic", + "scores": { + "swe-bench": 0.6061 + } + }, + { + "model_id": "google/gemini-3-pro-preview", + "name": "gemini-3-pro-preview", + "developer": "Google", + "scores": { + "swe-bench": 0.7576 + } + }, + { + "model_id": "openai/gpt-5.2-2025-12-11", + "name": "gpt-5.2-2025-12-11", + "developer": "OpenAI", + "scores": { + "swe-bench": 0.57 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/tau-bench-2_airline.json b/data/benchmarks/tau-bench-2_airline.json new file mode 100644 index 0000000000000000000000000000000000000000..75a484db572c144d7448daa3de7461dad0d94d62 --- /dev/null +++ b/data/benchmarks/tau-bench-2_airline.json @@ -0,0 +1,28 @@ +{ + "models": [ + { + "model_id": "anthropic/claude-opus-4-5", + "name": "claude-opus-4-5", + "developer": "Anthropic", + "scores": { + "tau-bench-2/airline": 0.66 + } + }, + { + "model_id": "google/gemini-3-pro-preview", + "name": "gemini-3-pro-preview", + "developer": "Google", + "scores": { + "tau-bench-2/airline": 0.7 + } + }, + { + "model_id": "openai/gpt-5.2-2025-12-11", + "name": "gpt-5.2-2025-12-11", + "developer": "OpenAI", + "scores": { + "tau-bench-2/airline": 0.54 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/tau-bench-2_retail.json b/data/benchmarks/tau-bench-2_retail.json new file mode 100644 index 0000000000000000000000000000000000000000..2da1258ba59e3f413441ef0b6df6faabeb6eb792 --- /dev/null +++ b/data/benchmarks/tau-bench-2_retail.json @@ -0,0 +1,28 @@ +{ + "models": [ + { + "model_id": "anthropic/claude-opus-4-5", + "name": "claude-opus-4-5", + "developer": "Anthropic", + "scores": { + "tau-bench-2/retail": 0.83 + } + }, + { + "model_id": "google/gemini-3-pro-preview", + "name": "gemini-3-pro-preview", + "developer": "Google", + "scores": { + "tau-bench-2/retail": 0.7576 + } + }, + { + "model_id": "openai/gpt-5.2-2025-12-11", + "name": "gpt-5.2-2025-12-11", + "developer": "OpenAI", + "scores": { + "tau-bench-2/retail": 0.68 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/tau-bench-2_telecom.json b/data/benchmarks/tau-bench-2_telecom.json new file mode 100644 index 0000000000000000000000000000000000000000..5e2e97c5a63c814404bfd0e936bb7f41ce63593e --- /dev/null +++ b/data/benchmarks/tau-bench-2_telecom.json @@ -0,0 +1,28 @@ +{ + "models": [ + { + "model_id": "anthropic/claude-opus-4-5", + "name": "claude-opus-4-5", + "developer": "Anthropic", + "scores": { + "tau-bench-2/telecom": 0.76 + } + }, + { + "model_id": "google/gemini-3-pro-preview", + "name": "gemini-3-pro-preview", + "developer": "Google", + "scores": { + "tau-bench-2/telecom": 0.73 + } + }, + { + "model_id": "openai/gpt-5.2-2025-12-11", + "name": "gpt-5.2-2025-12-11", + "developer": "OpenAI", + "scores": { + "tau-bench-2/telecom": 0.5354 + } + } + ] +} \ No newline at end of file diff --git a/data/benchmarks/terminal-bench-2.0.json b/data/benchmarks/terminal-bench-2.0.json new file mode 100644 index 0000000000000000000000000000000000000000..1b7467dde0ee56ac12d5ecd84baf3cf468c0942e --- /dev/null +++ b/data/benchmarks/terminal-bench-2.0.json @@ -0,0 +1,300 @@ +{ + "models": [ + { + "model_id": "alibaba/qwen-3-coder-480b", + "name": "Qwen 3 Coder 480B", + "developer": "Alibaba", + "scores": { + "terminal-bench-2.0": 25.4 + } + }, + { + "model_id": "anthropic/claude-haiku-4.5", + "name": "Claude Haiku 4.5", + "developer": "Anthropic", + "scores": { + "terminal-bench-2.0": 29.8 + } + }, + { + "model_id": "anthropic/claude-opus-4.1", + "name": "Claude Opus 4.1", + "developer": "Anthropic", + "scores": { + "terminal-bench-2.0": 35.1 + } + }, + { + "model_id": "anthropic/claude-opus-4.5", + "name": "Claude Opus 4.5", + "developer": "Anthropic", + "scores": { + "terminal-bench-2.0": 59.1 + } + }, + { + "model_id": "anthropic/claude-opus-4.6", + "name": "Claude Opus 4.6", + "developer": "Anthropic", + "scores": { + "terminal-bench-2.0": 58.0 + } + }, + { + "model_id": "anthropic/claude-sonnet-4.5", + "name": "Claude Sonnet 4.5", + "developer": "Anthropic", + "scores": { + "terminal-bench-2.0": 46.5 + } + }, + { + "model_id": "deepseek/deepseek-v3.2", + "name": "DeepSeek-V3.2", + "developer": "DeepSeek", + "scores": { + "terminal-bench-2.0": 39.6 + } + }, + { + "model_id": "google/gemini-2.5-flash", + "name": "gemini-2.5-flash", + "developer": "google", + "scores": { + "terminal-bench-2.0": 17.1 + } + }, + { + "model_id": "google/gemini-2.5-pro", + "name": "gemini-2.5-pro", + "developer": "google", + "scores": { + "terminal-bench-2.0": 26.1 + } + }, + { + "model_id": "google/gemini-3-flash", + "name": "Gemini 3 Flash", + "developer": "Google", + "scores": { + "terminal-bench-2.0": 64.3 + } + }, + { + "model_id": "google/gemini-3-pro", + "name": "Gemini 3 Pro", + "developer": "Google", + "scores": { + "terminal-bench-2.0": 65.2 + } + }, + { + "model_id": "google/gemini-3.1-pro", + "name": "Gemini 3.1 Pro", + "developer": "Google", + "scores": { + "terminal-bench-2.0": 74.8 + } + }, + { + "model_id": "minimax/minimax-m2", + "name": "MiniMax M2", + "developer": "MiniMax", + "scores": { + "terminal-bench-2.0": 30.0 + } + }, + { + "model_id": "minimax/minimax-m2.1", + "name": "MiniMax M2.1", + "developer": "MiniMax", + "scores": { + "terminal-bench-2.0": 36.6 + } + }, + { + "model_id": "minimax/minimax-m2.5", + "name": "Minimax m2.5", + "developer": "Minimax", + "scores": { + "terminal-bench-2.0": 42.2 + } + }, + { + "model_id": "moonshot-ai/kimi-k2-instruct", + "name": "Kimi K2 Instruct", + "developer": "Moonshot AI", + "scores": { + "terminal-bench-2.0": 27.8 + } + }, + { + "model_id": "moonshot-ai/kimi-k2-thinking", + "name": "Kimi K2 Thinking", + "developer": "Moonshot AI", + "scores": { + "terminal-bench-2.0": 35.7 + } + }, + { + "model_id": "moonshot-ai/kimi-k2.5", + "name": "Kimi K2.5", + "developer": "Kimi", + "scores": { + "terminal-bench-2.0": 43.2 + } + }, + { + "model_id": "multiple/multiple", + "name": "Multiple", + "developer": "Multiple", + "scores": { + "terminal-bench-2.0": 59.1 + } + }, + { + "model_id": "openai/gpt-5", + "name": "GPT-5", + "developer": "OpenAI", + "scores": { + "terminal-bench-2.0": 33.9 + } + }, + { + "model_id": "openai/gpt-5-codex", + "name": "GPT-5-Codex", + "developer": "OpenAI", + "scores": { + "terminal-bench-2.0": 43.4 + } + }, + { + "model_id": "openai/gpt-5-mini", + "name": "GPT-5-Mini", + "developer": "OpenAI", + "scores": { + "terminal-bench-2.0": 29.2 + } + }, + { + "model_id": "openai/gpt-5-nano", + "name": "GPT-5-Nano", + "developer": "OpenAI", + "scores": { + "terminal-bench-2.0": 9.9 + } + }, + { + "model_id": "openai/gpt-5.1", + "name": "GPT-5.1", + "developer": "OpenAI", + "scores": { + "terminal-bench-2.0": 47.6 + } + }, + { + "model_id": "openai/gpt-5.1-codex", + "name": "GPT-5.1-Codex", + "developer": "OpenAI", + "scores": { + "terminal-bench-2.0": 57.8 + } + }, + { + "model_id": "openai/gpt-5.1-codex-max", + "name": "GPT-5.1-Codex-Max", + "developer": "OpenAI", + "scores": { + "terminal-bench-2.0": 60.4 + } + }, + { + "model_id": "openai/gpt-5.1-codex-mini", + "name": "GPT-5.1-Codex-Mini", + "developer": "OpenAI", + "scores": { + "terminal-bench-2.0": 43.1 + } + }, + { + "model_id": "openai/gpt-5.2", + "name": "GPT-5.2", + "developer": "OpenAI", + "scores": { + "terminal-bench-2.0": 54.0 + } + }, + { + "model_id": "openai/gpt-5.2-codex", + "name": "GPT-5.2-Codex", + "developer": "OpenAI", + "scores": { + "terminal-bench-2.0": 66.5 + } + }, + { + "model_id": "openai/gpt-5.3-codex", + "name": "GPT-5.3-Codex", + "developer": "OpenAI", + "scores": { + "terminal-bench-2.0": 70.3 + } + }, + { + "model_id": "openai/gpt-oss-120b", + "name": "gpt-oss-120b", + "developer": "openai", + "scores": { + "terminal-bench-2.0": 14.2 + } + }, + { + "model_id": "openai/gpt-oss-20b", + "name": "gpt-oss-20b", + "developer": "openai", + "scores": { + "terminal-bench-2.0": 3.1 + } + }, + { + "model_id": "xai/grok-4", + "name": "Grok 4", + "developer": "xAI", + "scores": { + "terminal-bench-2.0": 25.4 + } + }, + { + "model_id": "xai/grok-code-fast-1", + "name": "Grok Code Fast 1", + "developer": "xAI", + "scores": { + "terminal-bench-2.0": 25.8 + } + }, + { + "model_id": "zhipu-ai/glm-4.6", + "name": "GLM 4.6", + "developer": "Z.ai", + "scores": { + "terminal-bench-2.0": 24.5 + } + }, + { + "model_id": "zhipu-ai/glm-4.7", + "name": "GLM 4.7", + "developer": "Z-AI", + "scores": { + "terminal-bench-2.0": 33.4 + } + }, + { + "model_id": "zhipu-ai/glm-5", + "name": "GLM 5", + "developer": "Z-AI", + "scores": { + "terminal-bench-2.0": 52.4 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/0-hero.json b/data/developers/0-hero.json new file mode 100644 index 0000000000000000000000000000000000000000..4af1a706dfa90928a1fbd821d3f1249efc7baae4 --- /dev/null +++ b/data/developers/0-hero.json @@ -0,0 +1,47 @@ +{ + "developer": "0-hero", + "models": [ + { + "id": "0-hero/Matter-0.1-7B-DPO-preview", + "name": "0-hero/Matter-0.1-7B-DPO-preview", + "developer": "0-hero", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7247, + "reward-bench/Chat": 0.8939, + "reward-bench/Chat Hard": 0.5768, + "reward-bench/Safety": 0.6378, + "reward-bench/Reasoning": 0.8854, + "reward-bench/Prior Sets (0.5 weight)": 0.5348 + } + }, + { + "id": "0-hero/Matter-0.1-7B-boost-DPO-preview", + "name": "0-hero/Matter-0.1-7B-boost-DPO-preview", + "developer": "0-hero", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7448, + "reward-bench/Chat": 0.9106, + "reward-bench/Chat Hard": 0.6096, + "reward-bench/Safety": 0.7135, + "reward-bench/Reasoning": 0.8395, + "reward-bench/Prior Sets (0.5 weight)": 0.5566 + } + }, + { + "id": "0-hero/Matter-0.2-7B-DPO", + "name": "Matter-0.2-7B-DPO", + "developer": "0-hero", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3303, + "hfopenllm_v2/BBH": 0.3596, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3814, + "hfopenllm_v2/MMLU-PRO": 0.1164 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/01-ai.json b/data/developers/01-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..aa37394e8f8bac4db9210f0b867a030c555ebc72 --- /dev/null +++ b/data/developers/01-ai.json @@ -0,0 +1,433 @@ +{ + "developer": "01-ai", + "models": [ + { + "id": "01-ai/Yi-1.5-34B", + "name": "Yi-1.5-34B", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2841, + "hfopenllm_v2/BBH": 0.5976, + "hfopenllm_v2/MATH Level 5": 0.1533, + "hfopenllm_v2/GPQA": 0.3658, + "hfopenllm_v2/MUSR": 0.4236, + "hfopenllm_v2/MMLU-PRO": 0.4666 + } + }, + { + "id": "01-ai/Yi-1.5-34B-32K", + "name": "Yi-1.5-34B-32K", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3119, + "hfopenllm_v2/BBH": 0.6016, + "hfopenllm_v2/MATH Level 5": 0.1541, + "hfopenllm_v2/GPQA": 0.3633, + "hfopenllm_v2/MUSR": 0.4398, + "hfopenllm_v2/MMLU-PRO": 0.4709 + } + }, + { + "id": "01-ai/Yi-1.5-34B-Chat", + "name": "Yi-1.5-34B-Chat", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6067, + "hfopenllm_v2/BBH": 0.6084, + "hfopenllm_v2/MATH Level 5": 0.2772, + "hfopenllm_v2/GPQA": 0.3649, + "hfopenllm_v2/MUSR": 0.4282, + "hfopenllm_v2/MMLU-PRO": 0.452 + } + }, + { + "id": "01-ai/Yi-1.5-34B-Chat-16K", + "name": "Yi-1.5-34B-Chat-16K", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4564, + "hfopenllm_v2/BBH": 0.61, + "hfopenllm_v2/MATH Level 5": 0.2137, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4398, + "hfopenllm_v2/MMLU-PRO": 0.4545 + } + }, + { + "id": "01-ai/Yi-1.5-6B", + "name": "Yi-1.5-6B", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2617, + "hfopenllm_v2/BBH": 0.4493, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4374, + "hfopenllm_v2/MMLU-PRO": 0.3144 + } + }, + { + "id": "01-ai/Yi-1.5-6B-Chat", + "name": "Yi-1.5-6B-Chat", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5145, + "hfopenllm_v2/BBH": 0.4571, + "hfopenllm_v2/MATH Level 5": 0.1624, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4392, + "hfopenllm_v2/MMLU-PRO": 0.3193 + } + }, + { + "id": "01-ai/Yi-1.5-9B", + "name": "Yi-1.5-9B", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2936, + "hfopenllm_v2/BBH": 0.5143, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.4328, + "hfopenllm_v2/MMLU-PRO": 0.3916 + } + }, + { + "id": "01-ai/Yi-1.5-9B-32K", + "name": "Yi-1.5-9B-32K", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2303, + "hfopenllm_v2/BBH": 0.4963, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.3591, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.3765 + } + }, + { + "id": "01-ai/Yi-1.5-9B-Chat", + "name": "Yi-1.5-9B-Chat", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6046, + "hfopenllm_v2/BBH": 0.5559, + "hfopenllm_v2/MATH Level 5": 0.2258, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4259, + "hfopenllm_v2/MMLU-PRO": 0.3975 + } + }, + { + "id": "01-ai/Yi-1.5-9B-Chat-16K", + "name": "Yi-1.5-9B-Chat-16K", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4214, + "hfopenllm_v2/BBH": 0.5153, + "hfopenllm_v2/MATH Level 5": 0.1782, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4099, + "hfopenllm_v2/MMLU-PRO": 0.3994 + } + }, + { + "id": "01-ai/Yi-34B", + "name": "Yi-34B", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3046, + "hfopenllm_v2/BBH": 0.5457, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.3666, + "hfopenllm_v2/MUSR": 0.4119, + "hfopenllm_v2/MMLU-PRO": 0.4412 + } + }, + { + "id": "01-ai/Yi-34B-200K", + "name": "Yi-34B-200K", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1542, + "hfopenllm_v2/BBH": 0.5442, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.3565, + "hfopenllm_v2/MUSR": 0.3817, + "hfopenllm_v2/MMLU-PRO": 0.4535 + } + }, + { + "id": "01-ai/Yi-34B-Chat", + "name": "Yi-34B-Chat", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4699, + "hfopenllm_v2/BBH": 0.5561, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.3978, + "hfopenllm_v2/MMLU-PRO": 0.4093 + } + }, + { + "id": "01-ai/Yi-6B", + "name": "Yi-6B", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2893, + "hfopenllm_v2/BBH": 0.4309, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3937, + "hfopenllm_v2/MMLU-PRO": 0.2991 + } + }, + { + "id": "01-ai/Yi-6B-200K", + "name": "Yi-6B-200K", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0843, + "hfopenllm_v2/BBH": 0.4289, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4587, + "hfopenllm_v2/MMLU-PRO": 0.2844 + } + }, + { + "id": "01-ai/Yi-6B-Chat", + "name": "Yi-6B-Chat", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3395, + "hfopenllm_v2/BBH": 0.4133, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3688, + "hfopenllm_v2/MMLU-PRO": 0.3061 + } + }, + { + "id": "01-ai/Yi-9B", + "name": "Yi-9B", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2709, + "hfopenllm_v2/BBH": 0.494, + "hfopenllm_v2/MATH Level 5": 0.0559, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.4054, + "hfopenllm_v2/MMLU-PRO": 0.3574 + } + }, + { + "id": "01-ai/Yi-9B-200K", + "name": "Yi-9B-200K", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2327, + "hfopenllm_v2/BBH": 0.4793, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.4294, + "hfopenllm_v2/MMLU-PRO": 0.3622 + } + }, + { + "id": "01-ai/Yi-Coder-9B-Chat", + "name": "Yi-Coder-9B-Chat", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4817, + "hfopenllm_v2/BBH": 0.4814, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.2475, + "hfopenllm_v2/MUSR": 0.3992, + "hfopenllm_v2/MMLU-PRO": 0.2425 + } + }, + { + "id": "01-ai/yi-34b", + "name": "Yi 34B", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.57, + "helm_lite/NarrativeQA": 0.782, + "helm_lite/NaturalQuestions (closed-book)": 0.443, + "helm_lite/OpenbookQA": 0.92, + "helm_lite/MMLU": 0.65, + "helm_lite/MATH": 0.375, + "helm_lite/GSM8K": 0.648, + "helm_lite/LegalBench": 0.618, + "helm_lite/MedQA": 0.656, + "helm_lite/WMT 2014": 0.172, + "helm_mmlu/MMLU All Subjects": 0.762, + "helm_mmlu/Abstract Algebra": 0.4, + "helm_mmlu/Anatomy": 0.748, + "helm_mmlu/College Physics": 0.5, + "helm_mmlu/Computer Security": 0.83, + "helm_mmlu/Econometrics": 0.588, + "helm_mmlu/Global Facts": 0.53, + "helm_mmlu/Jurisprudence": 0.898, + "helm_mmlu/Philosophy": 0.82, + "helm_mmlu/Professional Psychology": 0.835, + "helm_mmlu/Us Foreign Policy": 0.91, + "helm_mmlu/Astronomy": 0.901, + "helm_mmlu/Business Ethics": 0.75, + "helm_mmlu/Clinical Knowledge": 0.8, + "helm_mmlu/Conceptual Physics": 0.77, + "helm_mmlu/Electrical Engineering": 0.779, + "helm_mmlu/Elementary Mathematics": 0.656, + "helm_mmlu/Formal Logic": 0.548, + "helm_mmlu/High School World History": 0.907, + "helm_mmlu/Human Sexuality": 0.87, + "helm_mmlu/International Law": 0.909, + "helm_mmlu/Logical Fallacies": 0.883, + "helm_mmlu/Machine Learning": 0.58, + "helm_mmlu/Management": 0.893, + "helm_mmlu/Marketing": 0.936, + "helm_mmlu/Medical Genetics": 0.87, + "helm_mmlu/Miscellaneous": 0.902, + "helm_mmlu/Moral Scenarios": 0.606, + "helm_mmlu/Nutrition": 0.869, + "helm_mmlu/Prehistory": 0.877, + "helm_mmlu/Public Relations": 0.745, + "helm_mmlu/Security Studies": 0.833, + "helm_mmlu/Sociology": 0.9, + "helm_mmlu/Virology": 0.572, + "helm_mmlu/World Religions": 0.877, + "helm_mmlu/Mean win rate": 0.315 + } + }, + { + "id": "01-ai/yi-6b", + "name": "Yi 6B", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.253, + "helm_lite/NarrativeQA": 0.702, + "helm_lite/NaturalQuestions (closed-book)": 0.31, + "helm_lite/OpenbookQA": 0.8, + "helm_lite/MMLU": 0.53, + "helm_lite/MATH": 0.126, + "helm_lite/GSM8K": 0.375, + "helm_lite/LegalBench": 0.519, + "helm_lite/MedQA": 0.497, + "helm_lite/WMT 2014": 0.117, + "helm_mmlu/MMLU All Subjects": 0.64, + "helm_mmlu/Abstract Algebra": 0.3, + "helm_mmlu/Anatomy": 0.6, + "helm_mmlu/College Physics": 0.422, + "helm_mmlu/Computer Security": 0.73, + "helm_mmlu/Econometrics": 0.351, + "helm_mmlu/Global Facts": 0.43, + "helm_mmlu/Jurisprudence": 0.796, + "helm_mmlu/Philosophy": 0.678, + "helm_mmlu/Professional Psychology": 0.668, + "helm_mmlu/Us Foreign Policy": 0.87, + "helm_mmlu/Astronomy": 0.684, + "helm_mmlu/Business Ethics": 0.67, + "helm_mmlu/Clinical Knowledge": 0.66, + "helm_mmlu/Conceptual Physics": 0.621, + "helm_mmlu/Electrical Engineering": 0.662, + "helm_mmlu/Elementary Mathematics": 0.452, + "helm_mmlu/Formal Logic": 0.452, + "helm_mmlu/High School World History": 0.785, + "helm_mmlu/Human Sexuality": 0.763, + "helm_mmlu/International Law": 0.769, + "helm_mmlu/Logical Fallacies": 0.779, + "helm_mmlu/Machine Learning": 0.411, + "helm_mmlu/Management": 0.806, + "helm_mmlu/Marketing": 0.893, + "helm_mmlu/Medical Genetics": 0.77, + "helm_mmlu/Miscellaneous": 0.796, + "helm_mmlu/Moral Scenarios": 0.335, + "helm_mmlu/Nutrition": 0.739, + "helm_mmlu/Prehistory": 0.713, + "helm_mmlu/Public Relations": 0.718, + "helm_mmlu/Security Studies": 0.735, + "helm_mmlu/Sociology": 0.831, + "helm_mmlu/Virology": 0.452, + "helm_mmlu/World Religions": 0.836, + "helm_mmlu/Mean win rate": 0.651 + } + }, + { + "id": "01-ai/yi-large-preview", + "name": "Yi Large Preview", + "developer": "01-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.471, + "helm_lite/NarrativeQA": 0.373, + "helm_lite/NaturalQuestions (closed-book)": 0.428, + "helm_lite/OpenbookQA": 0.946, + "helm_lite/MMLU": 0.712, + "helm_lite/MATH": 0.712, + "helm_lite/GSM8K": 0.69, + "helm_lite/LegalBench": 0.519, + "helm_lite/MedQA": 0.66, + "helm_lite/WMT 2014": 0.176, + "helm_mmlu/MMLU All Subjects": 0.793, + "helm_mmlu/Abstract Algebra": 0.6, + "helm_mmlu/Anatomy": 0.83, + "helm_mmlu/College Physics": 0.569, + "helm_mmlu/Computer Security": 0.86, + "helm_mmlu/Econometrics": 0.728, + "helm_mmlu/Global Facts": 0.52, + "helm_mmlu/Jurisprudence": 0.852, + "helm_mmlu/Philosophy": 0.842, + "helm_mmlu/Professional Psychology": 0.853, + "helm_mmlu/Us Foreign Policy": 0.85, + "helm_mmlu/Astronomy": 0.914, + "helm_mmlu/Business Ethics": 0.8, + "helm_mmlu/Clinical Knowledge": 0.857, + "helm_mmlu/Conceptual Physics": 0.864, + "helm_mmlu/Electrical Engineering": 0.779, + "helm_mmlu/Elementary Mathematics": 0.685, + "helm_mmlu/Formal Logic": 0.603, + "helm_mmlu/High School World History": 0.928, + "helm_mmlu/Human Sexuality": 0.901, + "helm_mmlu/International Law": 0.917, + "helm_mmlu/Logical Fallacies": 0.865, + "helm_mmlu/Machine Learning": 0.616, + "helm_mmlu/Management": 0.903, + "helm_mmlu/Marketing": 0.927, + "helm_mmlu/Medical Genetics": 0.83, + "helm_mmlu/Miscellaneous": 0.916, + "helm_mmlu/Moral Scenarios": 0.831, + "helm_mmlu/Nutrition": 0.846, + "helm_mmlu/Prehistory": 0.892, + "helm_mmlu/Public Relations": 0.827, + "helm_mmlu/Security Studies": 0.82, + "helm_mmlu/Sociology": 0.881, + "helm_mmlu/Virology": 0.59, + "helm_mmlu/World Religions": 0.871, + "helm_mmlu/Mean win rate": 0.258 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/1-800-LLMs.json b/data/developers/1-800-LLMs.json new file mode 100644 index 0000000000000000000000000000000000000000..885526469de5418eddf3a50c6f5f70664911b03b --- /dev/null +++ b/data/developers/1-800-LLMs.json @@ -0,0 +1,33 @@ +{ + "developer": "1-800-LLMs", + "models": [ + { + "id": "1-800-LLMs/Qwen-2.5-14B-Hindi", + "name": "Qwen-2.5-14B-Hindi", + "developer": "1-800-LLMs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5826, + "hfopenllm_v2/BBH": 0.6524, + "hfopenllm_v2/MATH Level 5": 0.3331, + "hfopenllm_v2/GPQA": 0.3624, + "hfopenllm_v2/MUSR": 0.4489, + "hfopenllm_v2/MMLU-PRO": 0.5263 + } + }, + { + "id": "1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct", + "name": "Qwen-2.5-14B-Hindi-Custom-Instruct", + "developer": "1-800-LLMs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3077, + "hfopenllm_v2/BBH": 0.6284, + "hfopenllm_v2/MATH Level 5": 0.3112, + "hfopenllm_v2/GPQA": 0.37, + "hfopenllm_v2/MUSR": 0.4491, + "hfopenllm_v2/MMLU-PRO": 0.5164 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/1024m.json b/data/developers/1024m.json new file mode 100644 index 0000000000000000000000000000000000000000..761a59c34e642cf9a31ba6bedb3ad4c747679080 --- /dev/null +++ b/data/developers/1024m.json @@ -0,0 +1,33 @@ +{ + "developer": "1024m", + "models": [ + { + "id": "1024m/PHI-4-Hindi", + "name": "PHI-4-Hindi", + "developer": "1024m", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0082, + "hfopenllm_v2/BBH": 0.671, + "hfopenllm_v2/MATH Level 5": 0.2334, + "hfopenllm_v2/GPQA": 0.3977, + "hfopenllm_v2/MUSR": 0.4914, + "hfopenllm_v2/MMLU-PRO": 0.5239 + } + }, + { + "id": "1024m/QWEN-14B-B100", + "name": "QWEN-14B-B100", + "developer": "1024m", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7762, + "hfopenllm_v2/BBH": 0.6533, + "hfopenllm_v2/MATH Level 5": 0.5438, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.41, + "hfopenllm_v2/MMLU-PRO": 0.5179 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/152334H.json b/data/developers/152334H.json new file mode 100644 index 0000000000000000000000000000000000000000..8f79212374b64adcfe1585d7cfb364de175eb6c3 --- /dev/null +++ b/data/developers/152334H.json @@ -0,0 +1,19 @@ +{ + "developer": "152334H", + "models": [ + { + "id": "152334H/miqu-1-70b-sf", + "name": "miqu-1-70b-sf", + "developer": "152334H", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5182, + "hfopenllm_v2/BBH": 0.6102, + "hfopenllm_v2/MATH Level 5": 0.1246, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4582, + "hfopenllm_v2/MMLU-PRO": 0.4228 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/1TuanPham.json b/data/developers/1TuanPham.json new file mode 100644 index 0000000000000000000000000000000000000000..f278fb7ce517d014ec8ccbf051550b592383476a --- /dev/null +++ b/data/developers/1TuanPham.json @@ -0,0 +1,33 @@ +{ + "developer": "1TuanPham", + "models": [ + { + "id": "1TuanPham/T-VisStar-7B-v0.1", + "name": "T-VisStar-7B-v0.1", + "developer": "1TuanPham", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3607, + "hfopenllm_v2/BBH": 0.5052, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4375, + "hfopenllm_v2/MMLU-PRO": 0.3211 + } + }, + { + "id": "1TuanPham/T-VisStar-v0.1", + "name": "T-VisStar-v0.1", + "developer": "1TuanPham", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3607, + "hfopenllm_v2/BBH": 0.5052, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4375, + "hfopenllm_v2/MMLU-PRO": 0.3211 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/3rd-Degree-Burn.json b/data/developers/3rd-Degree-Burn.json new file mode 100644 index 0000000000000000000000000000000000000000..a9c30dd95da988409fe72636331540de646e02b4 --- /dev/null +++ b/data/developers/3rd-Degree-Burn.json @@ -0,0 +1,61 @@ +{ + "developer": "3rd-Degree-Burn", + "models": [ + { + "id": "3rd-Degree-Burn/L-3.1-Science-Writer-8B", + "name": "L-3.1-Science-Writer-8B", + "developer": "3rd-Degree-Burn", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4263, + "hfopenllm_v2/BBH": 0.5041, + "hfopenllm_v2/MATH Level 5": 0.1035, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3959, + "hfopenllm_v2/MMLU-PRO": 0.3649 + } + }, + { + "id": "3rd-Degree-Burn/Llama-3.1-8B-Squareroot", + "name": "Llama-3.1-8B-Squareroot", + "developer": "3rd-Degree-Burn", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2213, + "hfopenllm_v2/BBH": 0.3461, + "hfopenllm_v2/MATH Level 5": 0.2659, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3089, + "hfopenllm_v2/MMLU-PRO": 0.175 + } + }, + { + "id": "3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1", + "name": "Llama-3.1-8B-Squareroot-v1", + "developer": "3rd-Degree-Burn", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2892, + "hfopenllm_v2/BBH": 0.3343, + "hfopenllm_v2/MATH Level 5": 0.0884, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3341, + "hfopenllm_v2/MMLU-PRO": 0.1127 + } + }, + { + "id": "3rd-Degree-Burn/Llama-Squared-8B", + "name": "Llama-Squared-8B", + "developer": "3rd-Degree-Burn", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2755, + "hfopenllm_v2/BBH": 0.4431, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3089, + "hfopenllm_v2/MMLU-PRO": 0.2366 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/4season.json b/data/developers/4season.json new file mode 100644 index 0000000000000000000000000000000000000000..f089a9c6a4b809bc200bc8fd4f2e397ac7b658d0 --- /dev/null +++ b/data/developers/4season.json @@ -0,0 +1,19 @@ +{ + "developer": "4season", + "models": [ + { + "id": "4season/final_model_test_v2", + "name": "final_model_test_v2", + "developer": "4season", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3191, + "hfopenllm_v2/BBH": 0.6342, + "hfopenllm_v2/MATH Level 5": 0.0838, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.4314, + "hfopenllm_v2/MMLU-PRO": 0.3528 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/AALF.json b/data/developers/AALF.json new file mode 100644 index 0000000000000000000000000000000000000000..83290565071a0a55a04ca2c414334445d97f9e10 --- /dev/null +++ b/data/developers/AALF.json @@ -0,0 +1,61 @@ +{ + "developer": "AALF", + "models": [ + { + "id": "AALF/FuseChat-Llama-3.1-8B-Instruct-preview", + "name": "FuseChat-Llama-3.1-8B-Instruct-preview", + "developer": "AALF", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.719, + "hfopenllm_v2/BBH": 0.512, + "hfopenllm_v2/MATH Level 5": 0.2477, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.382, + "hfopenllm_v2/MMLU-PRO": 0.3733 + } + }, + { + "id": "AALF/FuseChat-Llama-3.1-8B-SFT-preview", + "name": "FuseChat-Llama-3.1-8B-SFT-preview", + "developer": "AALF", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7281, + "hfopenllm_v2/BBH": 0.524, + "hfopenllm_v2/MATH Level 5": 0.2251, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.402, + "hfopenllm_v2/MMLU-PRO": 0.3743 + } + }, + { + "id": "AALF/gemma-2-27b-it-SimPO-37K", + "name": "gemma-2-27b-it-SimPO-37K", + "developer": "AALF", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2407, + "hfopenllm_v2/BBH": 0.3911, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3488, + "hfopenllm_v2/MMLU-PRO": 0.1971 + } + }, + { + "id": "AALF/gemma-2-27b-it-SimPO-37K-100steps", + "name": "gemma-2-27b-it-SimPO-37K-100steps", + "developer": "AALF", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2568, + "hfopenllm_v2/BBH": 0.3931, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3329, + "hfopenllm_v2/MMLU-PRO": 0.2125 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/AELLM.json b/data/developers/AELLM.json new file mode 100644 index 0000000000000000000000000000000000000000..9e995289b108488f3ccc1b3679d0a21d92b604db --- /dev/null +++ b/data/developers/AELLM.json @@ -0,0 +1,33 @@ +{ + "developer": "AELLM", + "models": [ + { + "id": "AELLM/gemma-2-aeria-infinity-9b", + "name": "gemma-2-aeria-infinity-9b", + "developer": "AELLM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7594, + "hfopenllm_v2/BBH": 0.5983, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.3339, + "hfopenllm_v2/MUSR": 0.402, + "hfopenllm_v2/MMLU-PRO": 0.3862 + } + }, + { + "id": "AELLM/gemma-2-lyco-infinity-9b", + "name": "gemma-2-lyco-infinity-9b", + "developer": "AELLM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7316, + "hfopenllm_v2/BBH": 0.584, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4006, + "hfopenllm_v2/MMLU-PRO": 0.3787 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/AGI-0.json b/data/developers/AGI-0.json new file mode 100644 index 0000000000000000000000000000000000000000..c95120768c3cd74c99ec36ec0c05db50bb8deaf5 --- /dev/null +++ b/data/developers/AGI-0.json @@ -0,0 +1,47 @@ +{ + "developer": "AGI-0", + "models": [ + { + "id": "AGI-0/Art-v0-3B", + "name": "Art-v0-3B", + "developer": "AGI-0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3192, + "hfopenllm_v2/BBH": 0.3401, + "hfopenllm_v2/MATH Level 5": 0.2462, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3768, + "hfopenllm_v2/MMLU-PRO": 0.1179 + } + }, + { + "id": "AGI-0/Artificium-llama3.1-8B-001", + "name": "Artificium-llama3.1-8B-001", + "developer": "AGI-0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5248, + "hfopenllm_v2/BBH": 0.4256, + "hfopenllm_v2/MATH Level 5": 0.136, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3795, + "hfopenllm_v2/MMLU-PRO": 0.3182 + } + }, + { + "id": "AGI-0/smartllama3.1-8B-001", + "name": "smartllama3.1-8B-001", + "developer": "AGI-0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3518, + "hfopenllm_v2/BBH": 0.467, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4386, + "hfopenllm_v2/MMLU-PRO": 0.3487 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/AI-MO.json b/data/developers/AI-MO.json new file mode 100644 index 0000000000000000000000000000000000000000..7ef790ea54a6088c6a20cda8c7a1caabf46cd638 --- /dev/null +++ b/data/developers/AI-MO.json @@ -0,0 +1,33 @@ +{ + "developer": "AI-MO", + "models": [ + { + "id": "AI-MO/NuminaMath-7B-CoT", + "name": "NuminaMath-7B-CoT", + "developer": "AI-MO", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2689, + "hfopenllm_v2/BBH": 0.4314, + "hfopenllm_v2/MATH Level 5": 0.2696, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3303, + "hfopenllm_v2/MMLU-PRO": 0.2868 + } + }, + { + "id": "AI-MO/NuminaMath-7B-TIR", + "name": "NuminaMath-7B-TIR", + "developer": "AI-MO", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2756, + "hfopenllm_v2/BBH": 0.4144, + "hfopenllm_v2/MATH Level 5": 0.1609, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3509, + "hfopenllm_v2/MMLU-PRO": 0.2733 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/AI-Sweden-Models.json b/data/developers/AI-Sweden-Models.json new file mode 100644 index 0000000000000000000000000000000000000000..10e07192410c4196687fbc1ca5a08749e9ee5ab8 --- /dev/null +++ b/data/developers/AI-Sweden-Models.json @@ -0,0 +1,33 @@ +{ + "developer": "AI-Sweden-Models", + "models": [ + { + "id": "AI-Sweden-Models/Llama-3-8B-instruct", + "name": "Llama-3-8B-instruct", + "developer": "AI-Sweden-Models", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2401, + "hfopenllm_v2/BBH": 0.4173, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.4771, + "hfopenllm_v2/MMLU-PRO": 0.2597 + } + }, + { + "id": "AI-Sweden-Models/gpt-sw3-40b", + "name": "gpt-sw3-40b", + "developer": "AI-Sweden-Models", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.147, + "hfopenllm_v2/BBH": 0.3268, + "hfopenllm_v2/MATH Level 5": 0.0174, + "hfopenllm_v2/GPQA": 0.2349, + "hfopenllm_v2/MUSR": 0.3632, + "hfopenllm_v2/MMLU-PRO": 0.1276 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/AI4free.json b/data/developers/AI4free.json new file mode 100644 index 0000000000000000000000000000000000000000..c31b5f213a67e63a45fc560b57d6f186f5284b23 --- /dev/null +++ b/data/developers/AI4free.json @@ -0,0 +1,33 @@ +{ + "developer": "AI4free", + "models": [ + { + "id": "AI4free/Dhanishtha", + "name": "Dhanishtha", + "developer": "AI4free", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2451, + "hfopenllm_v2/BBH": 0.3404, + "hfopenllm_v2/MATH Level 5": 0.256, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3569, + "hfopenllm_v2/MMLU-PRO": 0.1643 + } + }, + { + "id": "AI4free/t2", + "name": "t2", + "developer": "AI4free", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3867, + "hfopenllm_v2/BBH": 0.291, + "hfopenllm_v2/MATH Level 5": 0.1896, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3846, + "hfopenllm_v2/MMLU-PRO": 0.1144 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/AIDC-AI.json b/data/developers/AIDC-AI.json new file mode 100644 index 0000000000000000000000000000000000000000..7016f4a16bdd0d8604b746144073520c40d16c45 --- /dev/null +++ b/data/developers/AIDC-AI.json @@ -0,0 +1,19 @@ +{ + "developer": "AIDC-AI", + "models": [ + { + "id": "AIDC-AI/Marco-o1", + "name": "Marco-o1", + "developer": "AIDC-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4771, + "hfopenllm_v2/BBH": 0.5364, + "hfopenllm_v2/MATH Level 5": 0.3746, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.4138, + "hfopenllm_v2/MMLU-PRO": 0.4117 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Aashraf995.json b/data/developers/Aashraf995.json new file mode 100644 index 0000000000000000000000000000000000000000..9c3d5bcd4f4262d4967db225c4d78902d29a0c24 --- /dev/null +++ b/data/developers/Aashraf995.json @@ -0,0 +1,61 @@ +{ + "developer": "Aashraf995", + "models": [ + { + "id": "Aashraf995/Creative-7B-nerd", + "name": "Creative-7B-nerd", + "developer": "Aashraf995", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4722, + "hfopenllm_v2/BBH": 0.5607, + "hfopenllm_v2/MATH Level 5": 0.3165, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.4515, + "hfopenllm_v2/MMLU-PRO": 0.4492 + } + }, + { + "id": "Aashraf995/Gemma-Evo-10B", + "name": "Gemma-Evo-10B", + "developer": "Aashraf995", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7332, + "hfopenllm_v2/BBH": 0.6044, + "hfopenllm_v2/MATH Level 5": 0.2228, + "hfopenllm_v2/GPQA": 0.354, + "hfopenllm_v2/MUSR": 0.4595, + "hfopenllm_v2/MMLU-PRO": 0.4275 + } + }, + { + "id": "Aashraf995/Qwen-Evo-7B", + "name": "Qwen-Evo-7B", + "developer": "Aashraf995", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4757, + "hfopenllm_v2/BBH": 0.5709, + "hfopenllm_v2/MATH Level 5": 0.3142, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.4541, + "hfopenllm_v2/MMLU-PRO": 0.4462 + } + }, + { + "id": "Aashraf995/QwenStock-14B", + "name": "QwenStock-14B", + "developer": "Aashraf995", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5009, + "hfopenllm_v2/BBH": 0.655, + "hfopenllm_v2/MATH Level 5": 0.3573, + "hfopenllm_v2/GPQA": 0.3893, + "hfopenllm_v2/MUSR": 0.4793, + "hfopenllm_v2/MMLU-PRO": 0.5382 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/AbacusResearch.json b/data/developers/AbacusResearch.json new file mode 100644 index 0000000000000000000000000000000000000000..221269693ea740084ef1ec9cc41801e075802fce --- /dev/null +++ b/data/developers/AbacusResearch.json @@ -0,0 +1,19 @@ +{ + "developer": "AbacusResearch", + "models": [ + { + "id": "AbacusResearch/Jallabi-34B", + "name": "Jallabi-34B", + "developer": "AbacusResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3529, + "hfopenllm_v2/BBH": 0.6023, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.3389, + "hfopenllm_v2/MUSR": 0.4822, + "hfopenllm_v2/MMLU-PRO": 0.4682 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Ahdoot.json b/data/developers/Ahdoot.json new file mode 100644 index 0000000000000000000000000000000000000000..a449cc05cb76624902adfae4bc27a73846de58bf --- /dev/null +++ b/data/developers/Ahdoot.json @@ -0,0 +1,33 @@ +{ + "developer": "Ahdoot", + "models": [ + { + "id": "Ahdoot/StructuredThinker-v0.3-MoreStructure", + "name": "StructuredThinker-v0.3-MoreStructure", + "developer": "Ahdoot", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4193, + "hfopenllm_v2/BBH": 0.4838, + "hfopenllm_v2/MATH Level 5": 0.2908, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4158, + "hfopenllm_v2/MMLU-PRO": 0.361 + } + }, + { + "id": "Ahdoot/Test_StealthThinker", + "name": "Test_StealthThinker", + "developer": "Ahdoot", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.422, + "hfopenllm_v2/BBH": 0.4647, + "hfopenllm_v2/MATH Level 5": 0.179, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.428, + "hfopenllm_v2/MMLU-PRO": 0.3597 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Ahjeong.json b/data/developers/Ahjeong.json new file mode 100644 index 0000000000000000000000000000000000000000..8bad04354c12c186962719731a334cb95ca3b61a --- /dev/null +++ b/data/developers/Ahjeong.json @@ -0,0 +1,33 @@ +{ + "developer": "Ahjeong", + "models": [ + { + "id": "Ahjeong/MMPO_Gemma_7b", + "name": "Ahjeong/MMPO_Gemma_7b", + "developer": "Ahjeong", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7587, + "reward-bench/Chat": 0.9693, + "reward-bench/Chat Hard": 0.614, + "reward-bench/Safety": 0.7135, + "reward-bench/Reasoning": 0.7756, + "reward-bench/Prior Sets (0.5 weight)": 0.6831 + } + }, + { + "id": "Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3", + "name": "Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3", + "developer": "Ahjeong", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7652, + "reward-bench/Chat": 0.9721, + "reward-bench/Chat Hard": 0.6338, + "reward-bench/Safety": 0.7635, + "reward-bench/Reasoning": 0.7284, + "reward-bench/Prior Sets (0.5 weight)": 0.6913 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/AicoresSecurity.json b/data/developers/AicoresSecurity.json new file mode 100644 index 0000000000000000000000000000000000000000..812ec81d59db5c76d60bd5a09b361da8eea79f99 --- /dev/null +++ b/data/developers/AicoresSecurity.json @@ -0,0 +1,61 @@ +{ + "developer": "AicoresSecurity", + "models": [ + { + "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0", + "name": "Cybernet-Sec-3B-R1-V0", + "developer": "AicoresSecurity", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6358, + "hfopenllm_v2/BBH": 0.4497, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3314, + "hfopenllm_v2/MMLU-PRO": 0.301 + } + }, + { + "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder", + "name": "Cybernet-Sec-3B-R1-V0-Coder", + "developer": "AicoresSecurity", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7098, + "hfopenllm_v2/BBH": 0.4478, + "hfopenllm_v2/MATH Level 5": 0.1488, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3408, + "hfopenllm_v2/MMLU-PRO": 0.3178 + } + }, + { + "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1", + "name": "Cybernet-Sec-3B-R1-V1", + "developer": "AicoresSecurity", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6146, + "hfopenllm_v2/BBH": 0.4282, + "hfopenllm_v2/MATH Level 5": 0.1518, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3287, + "hfopenllm_v2/MMLU-PRO": 0.2876 + } + }, + { + "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1.1", + "name": "Cybernet-Sec-3B-R1-V1.1", + "developer": "AicoresSecurity", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.673, + "hfopenllm_v2/BBH": 0.4392, + "hfopenllm_v2/MATH Level 5": 0.176, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3541, + "hfopenllm_v2/MMLU-PRO": 0.3088 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Alepach.json b/data/developers/Alepach.json new file mode 100644 index 0000000000000000000000000000000000000000..781d99e668f919b1a6e78c381f192426b1911d1e --- /dev/null +++ b/data/developers/Alepach.json @@ -0,0 +1,47 @@ +{ + "developer": "Alepach", + "models": [ + { + "id": "Alepach/notHumpback-M0", + "name": "notHumpback-M0", + "developer": "Alepach", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.235, + "hfopenllm_v2/BBH": 0.2785, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.3552, + "hfopenllm_v2/MMLU-PRO": 0.1119 + } + }, + { + "id": "Alepach/notHumpback-M1", + "name": "notHumpback-M1", + "developer": "Alepach", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2207, + "hfopenllm_v2/BBH": 0.2882, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2374, + "hfopenllm_v2/MUSR": 0.342, + "hfopenllm_v2/MMLU-PRO": 0.1091 + } + }, + { + "id": "Alepach/notHumpback-M1-v2", + "name": "notHumpback-M1-v2", + "developer": "Alepach", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2277, + "hfopenllm_v2/BBH": 0.2776, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3473, + "hfopenllm_v2/MMLU-PRO": 0.1119 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/AlephAlpha.json b/data/developers/AlephAlpha.json new file mode 100644 index 0000000000000000000000000000000000000000..6443dc032cb73772b33472d727d995f58333d00b --- /dev/null +++ b/data/developers/AlephAlpha.json @@ -0,0 +1,59 @@ +{ + "developer": "AlephAlpha", + "models": [ + { + "id": "AlephAlpha/luminous-base", + "name": "Luminous Base 13B", + "developer": "AlephAlpha", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.041, + "helm_lite/NarrativeQA": 0.633, + "helm_lite/NaturalQuestions (closed-book)": 0.197, + "helm_lite/OpenbookQA": 0.286, + "helm_lite/MMLU": 0.243, + "helm_lite/MATH": 0.026, + "helm_lite/GSM8K": 0.028, + "helm_lite/LegalBench": 0.332, + "helm_lite/MedQA": 0.26, + "helm_lite/WMT 2014": 0.066 + } + }, + { + "id": "AlephAlpha/luminous-extended", + "name": "Luminous Extended 30B", + "developer": "AlephAlpha", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.078, + "helm_lite/NarrativeQA": 0.684, + "helm_lite/NaturalQuestions (closed-book)": 0.253, + "helm_lite/OpenbookQA": 0.272, + "helm_lite/MMLU": 0.248, + "helm_lite/MATH": 0.04, + "helm_lite/GSM8K": 0.075, + "helm_lite/LegalBench": 0.421, + "helm_lite/MedQA": 0.276, + "helm_lite/WMT 2014": 0.083 + } + }, + { + "id": "AlephAlpha/luminous-supreme", + "name": "Luminous Supreme 70B", + "developer": "AlephAlpha", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.145, + "helm_lite/NarrativeQA": 0.743, + "helm_lite/NaturalQuestions (closed-book)": 0.299, + "helm_lite/OpenbookQA": 0.284, + "helm_lite/MMLU": 0.316, + "helm_lite/MATH": 0.078, + "helm_lite/GSM8K": 0.137, + "helm_lite/LegalBench": 0.452, + "helm_lite/MedQA": 0.276, + "helm_lite/WMT 2014": 0.102 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Alibaba-NLP.json b/data/developers/Alibaba-NLP.json new file mode 100644 index 0000000000000000000000000000000000000000..5f6d0b2ffd32c20f9bfaf3ede70baae22cb89e6d --- /dev/null +++ b/data/developers/Alibaba-NLP.json @@ -0,0 +1,19 @@ +{ + "developer": "Alibaba-NLP", + "models": [ + { + "id": "Alibaba-NLP/gte-Qwen2-7B-instruct", + "name": "gte-Qwen2-7B-instruct", + "developer": "Alibaba-NLP", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2255, + "hfopenllm_v2/BBH": 0.4495, + "hfopenllm_v2/MATH Level 5": 0.0642, + "hfopenllm_v2/GPQA": 0.245, + "hfopenllm_v2/MUSR": 0.3559, + "hfopenllm_v2/MMLU-PRO": 0.3321 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Alibaba.json b/data/developers/Alibaba.json new file mode 100644 index 0000000000000000000000000000000000000000..8a87c20f0276d3b198b12dc620a9e1f6f6cd3f5f --- /dev/null +++ b/data/developers/Alibaba.json @@ -0,0 +1,58 @@ +{ + "developer": "Alibaba", + "models": [ + { + "id": "alibaba/qwen-3-coder-480b", + "name": "Qwen 3 Coder 480B", + "developer": "Alibaba", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 23.9 + } + }, + { + "id": "alibaba/qwen3-235b-a22b-thinking-2507", + "name": "qwen3-235b-a22b-thinking-2507", + "developer": "Alibaba", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.1267605633802817, + "livecodebenchpro/Easy Problems": 0.7605633802816901 + } + }, + { + "id": "alibaba/qwen3-30b-a3b", + "name": "qwen3-30b-a3b", + "developer": "Alibaba", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.028169014084507043, + "livecodebenchpro/Easy Problems": 0.5774647887323944 + } + }, + { + "id": "alibaba/qwen3-max", + "name": "alibaba/qwen3-max", + "developer": "Alibaba", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.04225352112676056, + "livecodebenchpro/Easy Problems": 0.36619718309859156 + } + }, + { + "id": "alibaba/qwen3-next-80b-a3b-thinking", + "name": "qwen3-next-80b-a3b-thinking", + "developer": "Alibaba", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.14084507042253522, + "livecodebenchpro/Easy Problems": 0.7464788732394366 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Alsebay.json b/data/developers/Alsebay.json new file mode 100644 index 0000000000000000000000000000000000000000..1fe37a631a47cb31729718905ced5476eddcc6e9 --- /dev/null +++ b/data/developers/Alsebay.json @@ -0,0 +1,19 @@ +{ + "developer": "Alsebay", + "models": [ + { + "id": "Alsebay/Qwen2.5-7B-test-novelist", + "name": "Qwen2.5-7B-test-novelist", + "developer": "Alsebay", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5352, + "hfopenllm_v2/BBH": 0.5151, + "hfopenllm_v2/MATH Level 5": 0.2349, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4749, + "hfopenllm_v2/MMLU-PRO": 0.3866 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Amaorynho.json b/data/developers/Amaorynho.json new file mode 100644 index 0000000000000000000000000000000000000000..efa0884c28c767971a41196952a6b7d32bd94adf --- /dev/null +++ b/data/developers/Amaorynho.json @@ -0,0 +1,61 @@ +{ + "developer": "Amaorynho", + "models": [ + { + "id": "Amaorynho/BBAI2006", + "name": "BBAI2006", + "developer": "Amaorynho", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1467, + "hfopenllm_v2/BBH": 0.2704, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3605, + "hfopenllm_v2/MMLU-PRO": 0.1123 + } + }, + { + "id": "Amaorynho/BBAI270V4", + "name": "BBAI270V4", + "developer": "Amaorynho", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.199, + "hfopenllm_v2/BBH": 0.3071, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.3314, + "hfopenllm_v2/MMLU-PRO": 0.1114 + } + }, + { + "id": "Amaorynho/BBAIIFEV1", + "name": "BBAIIFEV1", + "developer": "Amaorynho", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8047, + "hfopenllm_v2/BBH": 0.5292, + "hfopenllm_v2/MATH Level 5": 0.1934, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4185, + "hfopenllm_v2/MMLU-PRO": 0.3857 + } + }, + { + "id": "Amaorynho/BBAI_375", + "name": "BBAI_375", + "developer": "Amaorynho", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1467, + "hfopenllm_v2/BBH": 0.2704, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3605, + "hfopenllm_v2/MMLU-PRO": 0.1123 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Amu.json b/data/developers/Amu.json new file mode 100644 index 0000000000000000000000000000000000000000..cdcb9b2bf1d0155e992dd2cb1e040b235ebb7c33 --- /dev/null +++ b/data/developers/Amu.json @@ -0,0 +1,33 @@ +{ + "developer": "Amu", + "models": [ + { + "id": "Amu/t1-1.5B", + "name": "t1-1.5B", + "developer": "Amu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3394, + "hfopenllm_v2/BBH": 0.4008, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.2433, + "hfopenllm_v2/MUSR": 0.3517, + "hfopenllm_v2/MMLU-PRO": 0.2566 + } + }, + { + "id": "Amu/t1-3B", + "name": "t1-3B", + "developer": "Amu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3328, + "hfopenllm_v2/BBH": 0.3999, + "hfopenllm_v2/MATH Level 5": 0.1375, + "hfopenllm_v2/GPQA": 0.2408, + "hfopenllm_v2/MUSR": 0.3435, + "hfopenllm_v2/MMLU-PRO": 0.1284 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Anthropic.json b/data/developers/Anthropic.json new file mode 100644 index 0000000000000000000000000000000000000000..21f20c31a6454ac03058235d56cee7152af73904 --- /dev/null +++ b/data/developers/Anthropic.json @@ -0,0 +1,129 @@ +{ + "developer": "Anthropic", + "models": [ + { + "id": "Anthropic/claude-3-5-sonnet-20240620", + "name": "Anthropic/claude-3-5-sonnet-20240620", + "developer": "Anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8417, + "reward-bench/Chat": 0.9637, + "reward-bench/Chat Hard": 0.7401, + "reward-bench/Safety": 0.8162, + "reward-bench/Reasoning": 0.8469 + } + }, + { + "id": "Anthropic/claude-3-haiku-20240307", + "name": "Anthropic/claude-3-haiku-20240307", + "developer": "Anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7289, + "reward-bench/Chat": 0.9274, + "reward-bench/Chat Hard": 0.5197, + "reward-bench/Safety": 0.7953, + "reward-bench/Reasoning": 0.706, + "reward-bench/Prior Sets (0.5 weight)": 0.6635 + } + }, + { + "id": "Anthropic/claude-3-opus-20240229", + "name": "Anthropic/claude-3-opus-20240229", + "developer": "Anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8008, + "reward-bench/Chat": 0.9469, + "reward-bench/Chat Hard": 0.6031, + "reward-bench/Safety": 0.8662, + "reward-bench/Reasoning": 0.7868 + } + }, + { + "id": "Anthropic/claude-3-sonnet-20240229", + "name": "Anthropic/claude-3-sonnet-20240229", + "developer": "Anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7458, + "reward-bench/Chat": 0.9344, + "reward-bench/Chat Hard": 0.5658, + "reward-bench/Safety": 0.8169, + "reward-bench/Reasoning": 0.6907, + "reward-bench/Prior Sets (0.5 weight)": 0.6963 + } + }, + { + "id": "anthropic/claude-3.7-sonnet", + "name": "anthropic/claude-3.7-sonnet", + "developer": "Anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.014084507042253521, + "livecodebenchpro/Easy Problems": 0.15492957746478872 + } + }, + { + "id": "anthropic/claude-haiku-4.5", + "name": "Claude Haiku 4.5", + "developer": "Anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 35.5 + } + }, + { + "id": "anthropic/claude-opus-4-5", + "name": "claude-opus-4-5", + "developer": "Anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "appworld_test_normal/appworld/test_normal": 0.66, + "browsecompplus/browsecompplus": 0.49, + "swe-bench/swe-bench": 0.65, + "tau-bench-2_airline/tau-bench-2/airline": 0.66, + "tau-bench-2_retail/tau-bench-2/retail": 0.85, + "tau-bench-2_telecom/tau-bench-2/telecom": 0.58 + } + }, + { + "id": "anthropic/claude-opus-4.1", + "name": "Claude Opus 4.1", + "developer": "Anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 38.0 + } + }, + { + "id": "anthropic/claude-opus-4.5", + "name": "Claude Opus 4.5", + "developer": "Anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 54.3 + } + }, + { + "id": "anthropic/claude-opus-4.6", + "name": "Claude Opus 4.6", + "developer": "Anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 69.9 + } + }, + { + "id": "anthropic/claude-sonnet-4.5", + "name": "Claude Sonnet 4.5", + "developer": "Anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 42.6 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ArliAI.json b/data/developers/ArliAI.json new file mode 100644 index 0000000000000000000000000000000000000000..e1d87d6549acf88234ad4a9341b69cff2439d761 --- /dev/null +++ b/data/developers/ArliAI.json @@ -0,0 +1,33 @@ +{ + "developer": "ArliAI", + "models": [ + { + "id": "ArliAI/ArliAI-RPMax-12B-v1.1", + "name": "ArliAI-RPMax-12B-v1.1", + "developer": "ArliAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5349, + "hfopenllm_v2/BBH": 0.4752, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.3618, + "hfopenllm_v2/MMLU-PRO": 0.3384 + } + }, + { + "id": "ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1", + "name": "Llama-3.1-8B-ArliAI-RPMax-v1.1", + "developer": "ArliAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6359, + "hfopenllm_v2/BBH": 0.5016, + "hfopenllm_v2/MATH Level 5": 0.1314, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3577, + "hfopenllm_v2/MMLU-PRO": 0.3551 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Arthur-LAGACHERIE.json b/data/developers/Arthur-LAGACHERIE.json new file mode 100644 index 0000000000000000000000000000000000000000..194d8f751fe5b0c657ff9d2dbffc72d638e8c613 --- /dev/null +++ b/data/developers/Arthur-LAGACHERIE.json @@ -0,0 +1,19 @@ +{ + "developer": "Arthur-LAGACHERIE", + "models": [ + { + "id": "Arthur-LAGACHERIE/Precis-1B-Instruct", + "name": "Precis-1B-Instruct", + "developer": "Arthur-LAGACHERIE", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3671, + "hfopenllm_v2/BBH": 0.3224, + "hfopenllm_v2/MATH Level 5": 0.0038, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3436, + "hfopenllm_v2/MMLU-PRO": 0.1426 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Artples.json b/data/developers/Artples.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7386fde874dc3f1d6529d96da80a827af37dff --- /dev/null +++ b/data/developers/Artples.json @@ -0,0 +1,33 @@ +{ + "developer": "Artples", + "models": [ + { + "id": "Artples/L-MChat-7b", + "name": "L-MChat-7b", + "developer": "Artples", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5297, + "hfopenllm_v2/BBH": 0.46, + "hfopenllm_v2/MATH Level 5": 0.0921, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4029, + "hfopenllm_v2/MMLU-PRO": 0.3299 + } + }, + { + "id": "Artples/L-MChat-Small", + "name": "L-MChat-Small", + "developer": "Artples", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3287, + "hfopenllm_v2/BBH": 0.4823, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3696, + "hfopenllm_v2/MMLU-PRO": 0.2464 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Aryanne.json b/data/developers/Aryanne.json new file mode 100644 index 0000000000000000000000000000000000000000..17738c5975a89f8da20b14683880dff2f2671b71 --- /dev/null +++ b/data/developers/Aryanne.json @@ -0,0 +1,47 @@ +{ + "developer": "Aryanne", + "models": [ + { + "id": "Aryanne/QwentileSwap", + "name": "QwentileSwap", + "developer": "Aryanne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7378, + "hfopenllm_v2/BBH": 0.7008, + "hfopenllm_v2/MATH Level 5": 0.4222, + "hfopenllm_v2/GPQA": 0.3674, + "hfopenllm_v2/MUSR": 0.464, + "hfopenllm_v2/MMLU-PRO": 0.5946 + } + }, + { + "id": "Aryanne/SHBA", + "name": "SHBA", + "developer": "Aryanne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7817, + "hfopenllm_v2/BBH": 0.5233, + "hfopenllm_v2/MATH Level 5": 0.1798, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4161, + "hfopenllm_v2/MMLU-PRO": 0.3892 + } + }, + { + "id": "Aryanne/SuperHeart", + "name": "SuperHeart", + "developer": "Aryanne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5192, + "hfopenllm_v2/BBH": 0.5215, + "hfopenllm_v2/MATH Level 5": 0.1563, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4436, + "hfopenllm_v2/MMLU-PRO": 0.3912 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/AtAndDev.json b/data/developers/AtAndDev.json new file mode 100644 index 0000000000000000000000000000000000000000..d269c1fbba37e9ea4df30635a880656764997806 --- /dev/null +++ b/data/developers/AtAndDev.json @@ -0,0 +1,19 @@ +{ + "developer": "AtAndDev", + "models": [ + { + "id": "AtAndDev/Qwen2.5-1.5B-continuous-learnt", + "name": "Qwen2.5-1.5B-continuous-learnt", + "developer": "AtAndDev", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4511, + "hfopenllm_v2/BBH": 0.4275, + "hfopenllm_v2/MATH Level 5": 0.1473, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3623, + "hfopenllm_v2/MMLU-PRO": 0.2806 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Ateron.json b/data/developers/Ateron.json new file mode 100644 index 0000000000000000000000000000000000000000..14b39e12249b44d8c29773d550919aadc86d0a24 --- /dev/null +++ b/data/developers/Ateron.json @@ -0,0 +1,47 @@ +{ + "developer": "Ateron", + "models": [ + { + "id": "Ateron/Glowing-Forest-12B", + "name": "Glowing-Forest-12B", + "developer": "Ateron", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3592, + "hfopenllm_v2/BBH": 0.5492, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.3331, + "hfopenllm_v2/MUSR": 0.4449, + "hfopenllm_v2/MMLU-PRO": 0.3718 + } + }, + { + "id": "Ateron/Lotus-Magpic", + "name": "Lotus-Magpic", + "developer": "Ateron", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6286, + "hfopenllm_v2/BBH": 0.5254, + "hfopenllm_v2/MATH Level 5": 0.0997, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4332, + "hfopenllm_v2/MMLU-PRO": 0.3491 + } + }, + { + "id": "Ateron/Way_of_MagPicaro", + "name": "Way_of_MagPicaro", + "developer": "Ateron", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2637, + "hfopenllm_v2/BBH": 0.5427, + "hfopenllm_v2/MATH Level 5": 0.0589, + "hfopenllm_v2/GPQA": 0.3339, + "hfopenllm_v2/MUSR": 0.4649, + "hfopenllm_v2/MMLU-PRO": 0.3536 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/AtlaAI.json b/data/developers/AtlaAI.json new file mode 100644 index 0000000000000000000000000000000000000000..338a8d19ff5e81d140afc67a6e00cabdf7cb10b8 --- /dev/null +++ b/data/developers/AtlaAI.json @@ -0,0 +1,31 @@ +{ + "developer": "AtlaAI", + "models": [ + { + "id": "AtlaAI/Selene-1", + "name": "AtlaAI/Selene-1", + "developer": "AtlaAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9241, + "reward-bench/Chat": 0.9777, + "reward-bench/Chat Hard": 0.8399, + "reward-bench/Safety": 0.9216, + "reward-bench/Reasoning": 0.9572 + } + }, + { + "id": "AtlaAI/Selene-1-Mini-Llama-3.1-8B", + "name": "AtlaAI/Selene-1-Mini-Llama-3.1-8B", + "developer": "AtlaAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8913, + "reward-bench/Chat": 0.9358, + "reward-bench/Chat Hard": 0.7939, + "reward-bench/Safety": 0.8926, + "reward-bench/Reasoning": 0.9429 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/AuraIndustries.json b/data/developers/AuraIndustries.json new file mode 100644 index 0000000000000000000000000000000000000000..8e7fe10b080a62af2b3a06d2506e02c3caac5d9d --- /dev/null +++ b/data/developers/AuraIndustries.json @@ -0,0 +1,61 @@ +{ + "developer": "AuraIndustries", + "models": [ + { + "id": "AuraIndustries/Aura-4B", + "name": "Aura-4B", + "developer": "AuraIndustries", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3816, + "hfopenllm_v2/BBH": 0.449, + "hfopenllm_v2/MATH Level 5": 0.0423, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3938, + "hfopenllm_v2/MMLU-PRO": 0.2706 + } + }, + { + "id": "AuraIndustries/Aura-8B", + "name": "Aura-8B", + "developer": "AuraIndustries", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7205, + "hfopenllm_v2/BBH": 0.5131, + "hfopenllm_v2/MATH Level 5": 0.1518, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4004, + "hfopenllm_v2/MMLU-PRO": 0.3874 + } + }, + { + "id": "AuraIndustries/Aura-MoE-2x4B", + "name": "Aura-MoE-2x4B", + "developer": "AuraIndustries", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4601, + "hfopenllm_v2/BBH": 0.4339, + "hfopenllm_v2/MATH Level 5": 0.031, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.4085, + "hfopenllm_v2/MMLU-PRO": 0.265 + } + }, + { + "id": "AuraIndustries/Aura-MoE-2x4B-v2", + "name": "Aura-MoE-2x4B-v2", + "developer": "AuraIndustries", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4778, + "hfopenllm_v2/BBH": 0.4315, + "hfopenllm_v2/MATH Level 5": 0.0317, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.4101, + "hfopenllm_v2/MMLU-PRO": 0.261 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Aurel9.json b/data/developers/Aurel9.json new file mode 100644 index 0000000000000000000000000000000000000000..5e0fcfe0146d9a08ebc2129738d2c199dbac5334 --- /dev/null +++ b/data/developers/Aurel9.json @@ -0,0 +1,19 @@ +{ + "developer": "Aurel9", + "models": [ + { + "id": "Aurel9/testmerge-7b", + "name": "testmerge-7b", + "developer": "Aurel9", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.398, + "hfopenllm_v2/BBH": 0.519, + "hfopenllm_v2/MATH Level 5": 0.0657, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4659, + "hfopenllm_v2/MMLU-PRO": 0.3053 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Ayush-Singh.json b/data/developers/Ayush-Singh.json new file mode 100644 index 0000000000000000000000000000000000000000..19ff6c763c83495b47b9f84520d63cbf1a709765 --- /dev/null +++ b/data/developers/Ayush-Singh.json @@ -0,0 +1,19 @@ +{ + "developer": "Ayush-Singh", + "models": [ + { + "id": "Ayush-Singh/Llama1B-sft-2", + "name": "Llama1B-sft-2", + "developer": "Ayush-Singh", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1374, + "hfopenllm_v2/BBH": 0.2834, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.3552, + "hfopenllm_v2/MMLU-PRO": 0.1117 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Azure99.json b/data/developers/Azure99.json new file mode 100644 index 0000000000000000000000000000000000000000..b8ee160b5f24357b7631b3b03e65a1ac317b0efa --- /dev/null +++ b/data/developers/Azure99.json @@ -0,0 +1,89 @@ +{ + "developer": "Azure99", + "models": [ + { + "id": "Azure99/Blossom-V6-14B", + "name": "Blossom-V6-14B", + "developer": "Azure99", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6395, + "hfopenllm_v2/BBH": 0.5069, + "hfopenllm_v2/MATH Level 5": 0.5257, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.4035, + "hfopenllm_v2/MMLU-PRO": 0.4544 + } + }, + { + "id": "Azure99/Blossom-V6-7B", + "name": "Blossom-V6-7B", + "developer": "Azure99", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5538, + "hfopenllm_v2/BBH": 0.4974, + "hfopenllm_v2/MATH Level 5": 0.4585, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4301, + "hfopenllm_v2/MMLU-PRO": 0.4144 + } + }, + { + "id": "Azure99/blossom-v5-32b", + "name": "blossom-v5-32b", + "developer": "Azure99", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5235, + "hfopenllm_v2/BBH": 0.5955, + "hfopenllm_v2/MATH Level 5": 0.1866, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.402, + "hfopenllm_v2/MMLU-PRO": 0.4235 + } + }, + { + "id": "Azure99/blossom-v5-llama3-8b", + "name": "blossom-v5-llama3-8b", + "developer": "Azure99", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4343, + "hfopenllm_v2/BBH": 0.4185, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.367, + "hfopenllm_v2/MMLU-PRO": 0.2206 + } + }, + { + "id": "Azure99/blossom-v5.1-34b", + "name": "blossom-v5.1-34b", + "developer": "Azure99", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5697, + "hfopenllm_v2/BBH": 0.6109, + "hfopenllm_v2/MATH Level 5": 0.2591, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.3928, + "hfopenllm_v2/MMLU-PRO": 0.4558 + } + }, + { + "id": "Azure99/blossom-v5.1-9b", + "name": "blossom-v5.1-9b", + "developer": "Azure99", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5086, + "hfopenllm_v2/BBH": 0.5343, + "hfopenllm_v2/MATH Level 5": 0.2122, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.3994, + "hfopenllm_v2/MMLU-PRO": 0.3979 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/BAAI.json b/data/developers/BAAI.json new file mode 100644 index 0000000000000000000000000000000000000000..a1ddaa9d361ef4811d7a7b68f818f5fc67e4ede0 --- /dev/null +++ b/data/developers/BAAI.json @@ -0,0 +1,201 @@ +{ + "developer": "BAAI", + "models": [ + { + "id": "BAAI/Gemma2-9B-IT-Simpo-Infinity-Preference", + "name": "Gemma2-9B-IT-Simpo-Infinity-Preference", + "developer": "BAAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3176, + "hfopenllm_v2/BBH": 0.5979, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.3398, + "hfopenllm_v2/MUSR": 0.3966, + "hfopenllm_v2/MMLU-PRO": 0.3869 + } + }, + { + "id": "BAAI/Infinity-Instruct-3M-0613-Llama3-70B", + "name": "Infinity-Instruct-3M-0613-Llama3-70B", + "developer": "BAAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6821, + "hfopenllm_v2/BBH": 0.6642, + "hfopenllm_v2/MATH Level 5": 0.2153, + "hfopenllm_v2/GPQA": 0.3582, + "hfopenllm_v2/MUSR": 0.4523, + "hfopenllm_v2/MMLU-PRO": 0.473 + } + }, + { + "id": "BAAI/Infinity-Instruct-3M-0613-Mistral-7B", + "name": "Infinity-Instruct-3M-0613-Mistral-7B", + "developer": "BAAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.532, + "hfopenllm_v2/BBH": 0.4958, + "hfopenllm_v2/MATH Level 5": 0.0816, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4351, + "hfopenllm_v2/MMLU-PRO": 0.3161 + } + }, + { + "id": "BAAI/Infinity-Instruct-3M-0625-Llama3-70B", + "name": "Infinity-Instruct-3M-0625-Llama3-70B", + "developer": "BAAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7442, + "hfopenllm_v2/BBH": 0.667, + "hfopenllm_v2/MATH Level 5": 0.2251, + "hfopenllm_v2/GPQA": 0.3574, + "hfopenllm_v2/MUSR": 0.4617, + "hfopenllm_v2/MMLU-PRO": 0.4586 + } + }, + { + "id": "BAAI/Infinity-Instruct-3M-0625-Llama3-8B", + "name": "Infinity-Instruct-3M-0625-Llama3-8B", + "developer": "BAAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.605, + "hfopenllm_v2/BBH": 0.4955, + "hfopenllm_v2/MATH Level 5": 0.0884, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3712, + "hfopenllm_v2/MMLU-PRO": 0.3252 + } + }, + { + "id": "BAAI/Infinity-Instruct-3M-0625-Mistral-7B", + "name": "Infinity-Instruct-3M-0625-Mistral-7B", + "developer": "BAAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5867, + "hfopenllm_v2/BBH": 0.494, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4272, + "hfopenllm_v2/MMLU-PRO": 0.323 + } + }, + { + "id": "BAAI/Infinity-Instruct-3M-0625-Qwen2-7B", + "name": "Infinity-Instruct-3M-0625-Qwen2-7B", + "developer": "BAAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5554, + "hfopenllm_v2/BBH": 0.5346, + "hfopenllm_v2/MATH Level 5": 0.1926, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.3888, + "hfopenllm_v2/MMLU-PRO": 0.396 + } + }, + { + "id": "BAAI/Infinity-Instruct-3M-0625-Yi-1.5-9B", + "name": "Infinity-Instruct-3M-0625-Yi-1.5-9B", + "developer": "BAAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5186, + "hfopenllm_v2/BBH": 0.5509, + "hfopenllm_v2/MATH Level 5": 0.1639, + "hfopenllm_v2/GPQA": 0.354, + "hfopenllm_v2/MUSR": 0.4575, + "hfopenllm_v2/MMLU-PRO": 0.4118 + } + }, + { + "id": "BAAI/Infinity-Instruct-7M-0729-Llama3_1-8B", + "name": "Infinity-Instruct-7M-0729-Llama3_1-8B", + "developer": "BAAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6132, + "hfopenllm_v2/BBH": 0.5077, + "hfopenllm_v2/MATH Level 5": 0.1276, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3578, + "hfopenllm_v2/MMLU-PRO": 0.3224 + } + }, + { + "id": "BAAI/Infinity-Instruct-7M-0729-mistral-7B", + "name": "Infinity-Instruct-7M-0729-mistral-7B", + "developer": "BAAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6162, + "hfopenllm_v2/BBH": 0.4964, + "hfopenllm_v2/MATH Level 5": 0.0831, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4062, + "hfopenllm_v2/MMLU-PRO": 0.3274 + } + }, + { + "id": "BAAI/Infinity-Instruct-7M-Gen-Llama3_1-70B", + "name": "Infinity-Instruct-7M-Gen-Llama3_1-70B", + "developer": "BAAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7335, + "hfopenllm_v2/BBH": 0.6695, + "hfopenllm_v2/MATH Level 5": 0.2523, + "hfopenllm_v2/GPQA": 0.3758, + "hfopenllm_v2/MUSR": 0.4539, + "hfopenllm_v2/MMLU-PRO": 0.4607 + } + }, + { + "id": "BAAI/Infinity-Instruct-7M-Gen-Llama3_1-8B", + "name": "Infinity-Instruct-7M-Gen-Llama3_1-8B", + "developer": "BAAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6132, + "hfopenllm_v2/BBH": 0.5077, + "hfopenllm_v2/MATH Level 5": 0.1276, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3578, + "hfopenllm_v2/MMLU-PRO": 0.3224 + } + }, + { + "id": "BAAI/Infinity-Instruct-7M-Gen-mistral-7B", + "name": "Infinity-Instruct-7M-Gen-mistral-7B", + "developer": "BAAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6147, + "hfopenllm_v2/BBH": 0.4964, + "hfopenllm_v2/MATH Level 5": 0.0831, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4062, + "hfopenllm_v2/MMLU-PRO": 0.3274 + } + }, + { + "id": "BAAI/OPI-Llama-3.1-8B-Instruct", + "name": "OPI-Llama-3.1-8B-Instruct", + "developer": "BAAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2075, + "hfopenllm_v2/BBH": 0.3551, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3233, + "hfopenllm_v2/MMLU-PRO": 0.2124 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/BEE-spoke-data.json b/data/developers/BEE-spoke-data.json new file mode 100644 index 0000000000000000000000000000000000000000..fc03c3478ee30f4c9b29cc59141c814c84a9837c --- /dev/null +++ b/data/developers/BEE-spoke-data.json @@ -0,0 +1,131 @@ +{ + "developer": "BEE-spoke-data", + "models": [ + { + "id": "BEE-spoke-data/Meta-Llama-3-8Bee", + "name": "Meta-Llama-3-8Bee", + "developer": "BEE-spoke-data", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1951, + "hfopenllm_v2/BBH": 0.4626, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.3654, + "hfopenllm_v2/MMLU-PRO": 0.322 + } + }, + { + "id": "BEE-spoke-data/smol_llama-101M-GQA", + "name": "smol_llama-101M-GQA", + "developer": "BEE-spoke-data", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1384, + "hfopenllm_v2/BBH": 0.3018, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3713, + "hfopenllm_v2/MMLU-PRO": 0.1107 + } + }, + { + "id": "BEE-spoke-data/smol_llama-220M-GQA", + "name": "smol_llama-220M-GQA", + "developer": "BEE-spoke-data", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2386, + "hfopenllm_v2/BBH": 0.3032, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.4059, + "hfopenllm_v2/MMLU-PRO": 0.1149 + } + }, + { + "id": "BEE-spoke-data/smol_llama-220M-GQA-fineweb_edu", + "name": "smol_llama-220M-GQA-fineweb_edu", + "developer": "BEE-spoke-data", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1988, + "hfopenllm_v2/BBH": 0.2929, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.4368, + "hfopenllm_v2/MMLU-PRO": 0.1127 + } + }, + { + "id": "BEE-spoke-data/smol_llama-220M-openhermes", + "name": "smol_llama-220M-openhermes", + "developer": "BEE-spoke-data", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1555, + "hfopenllm_v2/BBH": 0.3028, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3847, + "hfopenllm_v2/MMLU-PRO": 0.112 + } + }, + { + "id": "BEE-spoke-data/tFINE-900m-e16-d32-flan", + "name": "tFINE-900m-e16-d32-flan", + "developer": "BEE-spoke-data", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1506, + "hfopenllm_v2/BBH": 0.3028, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2332, + "hfopenllm_v2/MUSR": 0.3724, + "hfopenllm_v2/MMLU-PRO": 0.1307 + } + }, + { + "id": "BEE-spoke-data/tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024", + "name": "tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024", + "developer": "BEE-spoke-data", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1321, + "hfopenllm_v2/BBH": 0.3138, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.4393, + "hfopenllm_v2/MMLU-PRO": 0.1237 + } + }, + { + "id": "BEE-spoke-data/tFINE-900m-e16-d32-instruct_2e", + "name": "tFINE-900m-e16-d32-instruct_2e", + "developer": "BEE-spoke-data", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1403, + "hfopenllm_v2/BBH": 0.3135, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.4207, + "hfopenllm_v2/MMLU-PRO": 0.1237 + } + }, + { + "id": "BEE-spoke-data/tFINE-900m-instruct-orpo", + "name": "tFINE-900m-instruct-orpo", + "developer": "BEE-spoke-data", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.133, + "hfopenllm_v2/BBH": 0.3022, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3409, + "hfopenllm_v2/MMLU-PRO": 0.1152 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/BSC-LT.json b/data/developers/BSC-LT.json new file mode 100644 index 0000000000000000000000000000000000000000..a8b8e66d18cd4161749d5209a0996487090ec0c7 --- /dev/null +++ b/data/developers/BSC-LT.json @@ -0,0 +1,33 @@ +{ + "developer": "BSC-LT", + "models": [ + { + "id": "BSC-LT/salamandra-7b", + "name": "salamandra-7b", + "developer": "BSC-LT", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1367, + "hfopenllm_v2/BBH": 0.3517, + "hfopenllm_v2/MATH Level 5": 0.0038, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3501, + "hfopenllm_v2/MMLU-PRO": 0.1493 + } + }, + { + "id": "BSC-LT/salamandra-7b-instruct", + "name": "salamandra-7b-instruct", + "developer": "BSC-LT", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2451, + "hfopenllm_v2/BBH": 0.3851, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.4134, + "hfopenllm_v2/MMLU-PRO": 0.1805 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Ba2han.json b/data/developers/Ba2han.json new file mode 100644 index 0000000000000000000000000000000000000000..09d3c8ab2b17ba42025a7a080c5b86be0e1095b4 --- /dev/null +++ b/data/developers/Ba2han.json @@ -0,0 +1,19 @@ +{ + "developer": "Ba2han", + "models": [ + { + "id": "Ba2han/Llama-Phi-3_DoRA", + "name": "Llama-Phi-3_DoRA", + "developer": "Ba2han", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5131, + "hfopenllm_v2/BBH": 0.5515, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.4069, + "hfopenllm_v2/MMLU-PRO": 0.3915 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Baptiste-HUVELLE-10.json b/data/developers/Baptiste-HUVELLE-10.json new file mode 100644 index 0000000000000000000000000000000000000000..525db3466746c4a31d8f8e15019e7928407e064a --- /dev/null +++ b/data/developers/Baptiste-HUVELLE-10.json @@ -0,0 +1,19 @@ +{ + "developer": "Baptiste-HUVELLE-10", + "models": [ + { + "id": "Baptiste-HUVELLE-10/LeTriomphant2.2_ECE_iLAB", + "name": "LeTriomphant2.2_ECE_iLAB", + "developer": "Baptiste-HUVELLE-10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5076, + "hfopenllm_v2/BBH": 0.7256, + "hfopenllm_v2/MATH Level 5": 0.4449, + "hfopenllm_v2/GPQA": 0.3993, + "hfopenllm_v2/MUSR": 0.4626, + "hfopenllm_v2/MMLU-PRO": 0.5851 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/BenevolenceMessiah.json b/data/developers/BenevolenceMessiah.json new file mode 100644 index 0000000000000000000000000000000000000000..1cb1df580e8e3ee414b19dafb8df7ce1e3771378 --- /dev/null +++ b/data/developers/BenevolenceMessiah.json @@ -0,0 +1,33 @@ +{ + "developer": "BenevolenceMessiah", + "models": [ + { + "id": "BenevolenceMessiah/Qwen2.5-72B-2x-Instruct-TIES-v1.0", + "name": "Qwen2.5-72B-2x-Instruct-TIES-v1.0", + "developer": "BenevolenceMessiah", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5473, + "hfopenllm_v2/BBH": 0.7273, + "hfopenllm_v2/MATH Level 5": 0.5785, + "hfopenllm_v2/GPQA": 0.3674, + "hfopenllm_v2/MUSR": 0.4207, + "hfopenllm_v2/MMLU-PRO": 0.5628 + } + }, + { + "id": "BenevolenceMessiah/Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0", + "name": "Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0", + "developer": "BenevolenceMessiah", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3012, + "hfopenllm_v2/BBH": 0.4909, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.408, + "hfopenllm_v2/MMLU-PRO": 0.268 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/BlackBeenie.json b/data/developers/BlackBeenie.json new file mode 100644 index 0000000000000000000000000000000000000000..fb4e197ff1321394286412726ff7a7ec5753f9fd --- /dev/null +++ b/data/developers/BlackBeenie.json @@ -0,0 +1,131 @@ +{ + "developer": "BlackBeenie", + "models": [ + { + "id": "BlackBeenie/Bloslain-8B-v0.2", + "name": "Bloslain-8B-v0.2", + "developer": "BlackBeenie", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5023, + "hfopenllm_v2/BBH": 0.5111, + "hfopenllm_v2/MATH Level 5": 0.145, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4076, + "hfopenllm_v2/MMLU-PRO": 0.3654 + } + }, + { + "id": "BlackBeenie/Llama-3.1-8B-OpenO1-SFT-v0.1", + "name": "Llama-3.1-8B-OpenO1-SFT-v0.1", + "developer": "BlackBeenie", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5124, + "hfopenllm_v2/BBH": 0.4787, + "hfopenllm_v2/MATH Level 5": 0.1526, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3618, + "hfopenllm_v2/MMLU-PRO": 0.3492 + } + }, + { + "id": "BlackBeenie/Llama-3.1-8B-pythonic-passthrough-merge", + "name": "Llama-3.1-8B-pythonic-passthrough-merge", + "developer": "BlackBeenie", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2316, + "hfopenllm_v2/BBH": 0.3454, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3778, + "hfopenllm_v2/MMLU-PRO": 0.1332 + } + }, + { + "id": "BlackBeenie/Neos-Gemma-2-9b", + "name": "Neos-Gemma-2-9b", + "developer": "BlackBeenie", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5876, + "hfopenllm_v2/BBH": 0.5503, + "hfopenllm_v2/MATH Level 5": 0.0982, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.3618, + "hfopenllm_v2/MMLU-PRO": 0.3981 + } + }, + { + "id": "BlackBeenie/Neos-Llama-3.1-8B", + "name": "Neos-Llama-3.1-8B", + "developer": "BlackBeenie", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4944, + "hfopenllm_v2/BBH": 0.4425, + "hfopenllm_v2/MATH Level 5": 0.1322, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.375, + "hfopenllm_v2/MMLU-PRO": 0.3262 + } + }, + { + "id": "BlackBeenie/Neos-Llama-3.1-base", + "name": "Neos-Llama-3.1-base", + "developer": "BlackBeenie", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1751, + "hfopenllm_v2/BBH": 0.293, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2374, + "hfopenllm_v2/MUSR": 0.3499, + "hfopenllm_v2/MMLU-PRO": 0.1112 + } + }, + { + "id": "BlackBeenie/Neos-Phi-3-14B-v0.1", + "name": "Neos-Phi-3-14B-v0.1", + "developer": "BlackBeenie", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4022, + "hfopenllm_v2/BBH": 0.6212, + "hfopenllm_v2/MATH Level 5": 0.1782, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4125, + "hfopenllm_v2/MMLU-PRO": 0.4564 + } + }, + { + "id": "BlackBeenie/llama-3-luminous-merged", + "name": "llama-3-luminous-merged", + "developer": "BlackBeenie", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4323, + "hfopenllm_v2/BBH": 0.5154, + "hfopenllm_v2/MATH Level 5": 0.0869, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4149, + "hfopenllm_v2/MMLU-PRO": 0.3773 + } + }, + { + "id": "BlackBeenie/llama-3.1-8B-Galore-openassistant-guanaco", + "name": "llama-3.1-8B-Galore-openassistant-guanaco", + "developer": "BlackBeenie", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2635, + "hfopenllm_v2/BBH": 0.5213, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4406, + "hfopenllm_v2/MMLU-PRO": 0.3206 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Bllossom.json b/data/developers/Bllossom.json new file mode 100644 index 0000000000000000000000000000000000000000..118b7d9252f14b9680c259a3ee44ddf57c525ca4 --- /dev/null +++ b/data/developers/Bllossom.json @@ -0,0 +1,19 @@ +{ + "developer": "Bllossom", + "models": [ + { + "id": "Bllossom/llama-3.2-Korean-Bllossom-AICA-5B", + "name": "llama-3.2-Korean-Bllossom-AICA-5B", + "developer": "Bllossom", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5172, + "hfopenllm_v2/BBH": 0.4293, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.3834, + "hfopenllm_v2/MMLU-PRO": 0.271 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/BoltMonkey.json b/data/developers/BoltMonkey.json new file mode 100644 index 0000000000000000000000000000000000000000..93a1093c0cbb01b9a84ae22cf6063fc50d8793cb --- /dev/null +++ b/data/developers/BoltMonkey.json @@ -0,0 +1,47 @@ +{ + "developer": "BoltMonkey", + "models": [ + { + "id": "BoltMonkey/DreadMix", + "name": "DreadMix", + "developer": "BoltMonkey", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7095, + "hfopenllm_v2/BBH": 0.5435, + "hfopenllm_v2/MATH Level 5": 0.1556, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4212, + "hfopenllm_v2/MMLU-PRO": 0.379 + } + }, + { + "id": "BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated", + "name": "NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated", + "developer": "BoltMonkey", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7999, + "hfopenllm_v2/BBH": 0.5152, + "hfopenllm_v2/MATH Level 5": 0.1193, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4019, + "hfopenllm_v2/MMLU-PRO": 0.3733 + } + }, + { + "id": "BoltMonkey/SuperNeuralDreadDevil-8b", + "name": "SuperNeuralDreadDevil-8b", + "developer": "BoltMonkey", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.771, + "hfopenllm_v2/BBH": 0.5286, + "hfopenllm_v2/MATH Level 5": 0.0929, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3977, + "hfopenllm_v2/MMLU-PRO": 0.3679 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/BrainWave-ML.json b/data/developers/BrainWave-ML.json new file mode 100644 index 0000000000000000000000000000000000000000..6b4fca8d6496e23c5bb9f54ca7bf1ad8b86b317a --- /dev/null +++ b/data/developers/BrainWave-ML.json @@ -0,0 +1,19 @@ +{ + "developer": "BrainWave-ML", + "models": [ + { + "id": "BrainWave-ML/llama3.2-3B-maths-orpo", + "name": "llama3.2-3B-maths-orpo", + "developer": "BrainWave-ML", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2049, + "hfopenllm_v2/BBH": 0.2912, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3575, + "hfopenllm_v2/MMLU-PRO": 0.1168 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/BramVanroy.json b/data/developers/BramVanroy.json new file mode 100644 index 0000000000000000000000000000000000000000..9317edbb900da73c5037c982a65c55e37a79ed3f --- /dev/null +++ b/data/developers/BramVanroy.json @@ -0,0 +1,61 @@ +{ + "developer": "BramVanroy", + "models": [ + { + "id": "BramVanroy/GEITje-7B-ultra", + "name": "GEITje-7B-ultra", + "developer": "BramVanroy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3723, + "hfopenllm_v2/BBH": 0.3776, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.329, + "hfopenllm_v2/MMLU-PRO": 0.2011 + } + }, + { + "id": "BramVanroy/fietje-2", + "name": "fietje-2", + "developer": "BramVanroy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2098, + "hfopenllm_v2/BBH": 0.4036, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3696, + "hfopenllm_v2/MMLU-PRO": 0.1986 + } + }, + { + "id": "BramVanroy/fietje-2-chat", + "name": "fietje-2-chat", + "developer": "BramVanroy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2917, + "hfopenllm_v2/BBH": 0.415, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2399, + "hfopenllm_v2/MUSR": 0.3528, + "hfopenllm_v2/MMLU-PRO": 0.2055 + } + }, + { + "id": "BramVanroy/fietje-2-instruct", + "name": "fietje-2-instruct", + "developer": "BramVanroy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.279, + "hfopenllm_v2/BBH": 0.4136, + "hfopenllm_v2/MATH Level 5": 0.0227, + "hfopenllm_v2/GPQA": 0.2332, + "hfopenllm_v2/MUSR": 0.3369, + "hfopenllm_v2/MMLU-PRO": 0.2104 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ByteDance.json b/data/developers/ByteDance.json new file mode 100644 index 0000000000000000000000000000000000000000..9907018f50ee485ee6fbe6cf68ed3b72be8f6578 --- /dev/null +++ b/data/developers/ByteDance.json @@ -0,0 +1,16 @@ +{ + "developer": "ByteDance", + "models": [ + { + "id": "bytedance/doubao-seed-1-6-thinking-250615", + "name": "doubao-seed-1-6-thinking-250615", + "developer": "ByteDance", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.07042253521126761, + "livecodebenchpro/Easy Problems": 0.5774647887323944 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/CIR-AMS.json b/data/developers/CIR-AMS.json new file mode 100644 index 0000000000000000000000000000000000000000..df9dcecb6f8fae5d901c2496b58813341797427e --- /dev/null +++ b/data/developers/CIR-AMS.json @@ -0,0 +1,24 @@ +{ + "developer": "CIR-AMS", + "models": [ + { + "id": "CIR-AMS/BTRM_Qwen2_7b_0613", + "name": "CIR-AMS/BTRM_Qwen2_7b_0613", + "developer": "CIR-AMS", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8172, + "reward-bench/Factuality": 0.5347, + "reward-bench/Precise IF": 0.3563, + "reward-bench/Math": 0.6066, + "reward-bench/Safety": 0.9014, + "reward-bench/Focus": 0.5737, + "reward-bench/Ties": 0.6527, + "reward-bench/Chat": 0.9749, + "reward-bench/Chat Hard": 0.5724, + "reward-bench/Reasoning": 0.8775, + "reward-bench/Prior Sets (0.5 weight)": 0.7029 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/CYFRAGOVPL.json b/data/developers/CYFRAGOVPL.json new file mode 100644 index 0000000000000000000000000000000000000000..275204dc2ef859e3fbec74104c82df244ab3fe62 --- /dev/null +++ b/data/developers/CYFRAGOVPL.json @@ -0,0 +1,89 @@ +{ + "developer": "CYFRAGOVPL", + "models": [ + { + "id": "CYFRAGOVPL/Llama-PLLuM-8B-base", + "name": "Llama-PLLuM-8B-base", + "developer": "CYFRAGOVPL", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2899, + "hfopenllm_v2/BBH": 0.432, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.397, + "hfopenllm_v2/MMLU-PRO": 0.2757 + } + }, + { + "id": "CYFRAGOVPL/Llama-PLLuM-8B-chat", + "name": "Llama-PLLuM-8B-chat", + "developer": "CYFRAGOVPL", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3515, + "hfopenllm_v2/BBH": 0.4077, + "hfopenllm_v2/MATH Level 5": 0.034, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.4199, + "hfopenllm_v2/MMLU-PRO": 0.2719 + } + }, + { + "id": "CYFRAGOVPL/PLLuM-12B-base", + "name": "PLLuM-12B-base", + "developer": "CYFRAGOVPL", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2821, + "hfopenllm_v2/BBH": 0.4391, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4142, + "hfopenllm_v2/MMLU-PRO": 0.274 + } + }, + { + "id": "CYFRAGOVPL/PLLuM-12B-chat", + "name": "PLLuM-12B-chat", + "developer": "CYFRAGOVPL", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3214, + "hfopenllm_v2/BBH": 0.4446, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.4115, + "hfopenllm_v2/MMLU-PRO": 0.2872 + } + }, + { + "id": "CYFRAGOVPL/PLLuM-12B-nc-base", + "name": "PLLuM-12B-nc-base", + "developer": "CYFRAGOVPL", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2405, + "hfopenllm_v2/BBH": 0.4277, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3645, + "hfopenllm_v2/MMLU-PRO": 0.2559 + } + }, + { + "id": "CYFRAGOVPL/PLLuM-12B-nc-chat", + "name": "PLLuM-12B-nc-chat", + "developer": "CYFRAGOVPL", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2834, + "hfopenllm_v2/BBH": 0.4576, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4354, + "hfopenllm_v2/MMLU-PRO": 0.2597 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/CarrotAI.json b/data/developers/CarrotAI.json new file mode 100644 index 0000000000000000000000000000000000000000..560ba820d0e2b653e59b98dd60b1707b9630a4fb --- /dev/null +++ b/data/developers/CarrotAI.json @@ -0,0 +1,33 @@ +{ + "developer": "CarrotAI", + "models": [ + { + "id": "CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct", + "name": "Llama-3.2-Rabbit-Ko-3B-Instruct", + "developer": "CarrotAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7199, + "hfopenllm_v2/BBH": 0.4427, + "hfopenllm_v2/MATH Level 5": 0.2054, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3649, + "hfopenllm_v2/MMLU-PRO": 0.2822 + } + }, + { + "id": "CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct-2412", + "name": "Llama-3.2-Rabbit-Ko-3B-Instruct-2412", + "developer": "CarrotAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4782, + "hfopenllm_v2/BBH": 0.4358, + "hfopenllm_v2/MATH Level 5": 0.176, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3872, + "hfopenllm_v2/MMLU-PRO": 0.3134 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Casual-Autopsy.json b/data/developers/Casual-Autopsy.json new file mode 100644 index 0000000000000000000000000000000000000000..f44a0f3b5844f249f20428333dd1fa8ffda3387f --- /dev/null +++ b/data/developers/Casual-Autopsy.json @@ -0,0 +1,19 @@ +{ + "developer": "Casual-Autopsy", + "models": [ + { + "id": "Casual-Autopsy/L3-Umbral-Mind-RP-v2.0-8B", + "name": "L3-Umbral-Mind-RP-v2.0-8B", + "developer": "Casual-Autopsy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7123, + "hfopenllm_v2/BBH": 0.5262, + "hfopenllm_v2/MATH Level 5": 0.1095, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.3687, + "hfopenllm_v2/MMLU-PRO": 0.3723 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/CausalLM.json b/data/developers/CausalLM.json new file mode 100644 index 0000000000000000000000000000000000000000..102164bdeda9074354691687d74c13970d79628b --- /dev/null +++ b/data/developers/CausalLM.json @@ -0,0 +1,47 @@ +{ + "developer": "CausalLM", + "models": [ + { + "id": "CausalLM/14B", + "name": "14B", + "developer": "CausalLM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2788, + "hfopenllm_v2/BBH": 0.47, + "hfopenllm_v2/MATH Level 5": 0.0755, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4155, + "hfopenllm_v2/MMLU-PRO": 0.3221 + } + }, + { + "id": "CausalLM/34b-beta", + "name": "34b-beta", + "developer": "CausalLM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3043, + "hfopenllm_v2/BBH": 0.5591, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.3749, + "hfopenllm_v2/MMLU-PRO": 0.5325 + } + }, + { + "id": "CausalLM/preview-1-hf", + "name": "preview-1-hf", + "developer": "CausalLM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5559, + "hfopenllm_v2/BBH": 0.3615, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3422, + "hfopenllm_v2/MMLU-PRO": 0.3597 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Changgil.json b/data/developers/Changgil.json new file mode 100644 index 0000000000000000000000000000000000000000..c46c5aecbefbdde79b96e9e4f80e05ee09ea8fc9 --- /dev/null +++ b/data/developers/Changgil.json @@ -0,0 +1,33 @@ +{ + "developer": "Changgil", + "models": [ + { + "id": "Changgil/K2S3-14b-v0.2", + "name": "K2S3-14b-v0.2", + "developer": "Changgil", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3243, + "hfopenllm_v2/BBH": 0.4613, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3923, + "hfopenllm_v2/MMLU-PRO": 0.2644 + } + }, + { + "id": "Changgil/K2S3-v0.1", + "name": "K2S3-v0.1", + "developer": "Changgil", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3277, + "hfopenllm_v2/BBH": 0.4655, + "hfopenllm_v2/MATH Level 5": 0.0461, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.4014, + "hfopenllm_v2/MMLU-PRO": 0.2562 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ClaudioItaly.json b/data/developers/ClaudioItaly.json new file mode 100644 index 0000000000000000000000000000000000000000..93d950b8e7f8145269ad3b3395032c17afda5c5d --- /dev/null +++ b/data/developers/ClaudioItaly.json @@ -0,0 +1,61 @@ +{ + "developer": "ClaudioItaly", + "models": [ + { + "id": "ClaudioItaly/Albacus", + "name": "Albacus", + "developer": "ClaudioItaly", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4667, + "hfopenllm_v2/BBH": 0.5113, + "hfopenllm_v2/MATH Level 5": 0.071, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.4135, + "hfopenllm_v2/MMLU-PRO": 0.3165 + } + }, + { + "id": "ClaudioItaly/Book-Gut12B", + "name": "Book-Gut12B", + "developer": "ClaudioItaly", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3998, + "hfopenllm_v2/BBH": 0.5417, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4635, + "hfopenllm_v2/MMLU-PRO": 0.367 + } + }, + { + "id": "ClaudioItaly/Evolutionstory-7B-v2.2", + "name": "Evolutionstory-7B-v2.2", + "developer": "ClaudioItaly", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4814, + "hfopenllm_v2/BBH": 0.5108, + "hfopenllm_v2/MATH Level 5": 0.071, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.4135, + "hfopenllm_v2/MMLU-PRO": 0.3159 + } + }, + { + "id": "ClaudioItaly/intelligence-cod-rag-7b-v3", + "name": "intelligence-cod-rag-7b-v3", + "developer": "ClaudioItaly", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6898, + "hfopenllm_v2/BBH": 0.5366, + "hfopenllm_v2/MATH Level 5": 0.3807, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.4153, + "hfopenllm_v2/MMLU-PRO": 0.4195 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/CohereForAI.json b/data/developers/CohereForAI.json new file mode 100644 index 0000000000000000000000000000000000000000..2b0f53d64c4d2507c6d6a18b645672e22916bc2a --- /dev/null +++ b/data/developers/CohereForAI.json @@ -0,0 +1,123 @@ +{ + "developer": "CohereForAI", + "models": [ + { + "id": "CohereForAI/aya-23-35B", + "name": "aya-23-35B", + "developer": "CohereForAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6462, + "hfopenllm_v2/BBH": 0.54, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.431, + "hfopenllm_v2/MMLU-PRO": 0.3356 + } + }, + { + "id": "CohereForAI/aya-23-8B", + "name": "aya-23-8B", + "developer": "CohereForAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4699, + "hfopenllm_v2/BBH": 0.4296, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.3941, + "hfopenllm_v2/MMLU-PRO": 0.2278 + } + }, + { + "id": "CohereForAI/aya-expanse-32b", + "name": "aya-expanse-32b", + "developer": "CohereForAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7302, + "hfopenllm_v2/BBH": 0.5649, + "hfopenllm_v2/MATH Level 5": 0.1533, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.3873, + "hfopenllm_v2/MMLU-PRO": 0.413 + } + }, + { + "id": "CohereForAI/aya-expanse-8b", + "name": "aya-expanse-8b", + "developer": "CohereForAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6359, + "hfopenllm_v2/BBH": 0.4977, + "hfopenllm_v2/MATH Level 5": 0.0861, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.3729, + "hfopenllm_v2/MMLU-PRO": 0.3004 + } + }, + { + "id": "CohereForAI/c4ai-command-r-plus", + "name": "c4ai-command-r-plus", + "developer": "CohereForAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7664, + "hfopenllm_v2/BBH": 0.5815, + "hfopenllm_v2/MATH Level 5": 0.0801, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4807, + "hfopenllm_v2/MMLU-PRO": 0.3992, + "reward-bench/Score": 0.7057, + "reward-bench/Chat": 0.9511, + "reward-bench/Chat Hard": 0.5757, + "reward-bench/Safety": 0.5986, + "reward-bench/Reasoning": 0.704, + "reward-bench/Prior Sets (0.5 weight)": 0.6924 + } + }, + { + "id": "CohereForAI/c4ai-command-r-plus-08-2024", + "name": "c4ai-command-r-plus-08-2024", + "developer": "CohereForAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.754, + "hfopenllm_v2/BBH": 0.5996, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4829, + "hfopenllm_v2/MMLU-PRO": 0.4421 + } + }, + { + "id": "CohereForAI/c4ai-command-r-v01", + "name": "c4ai-command-r-v01", + "developer": "CohereForAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6748, + "hfopenllm_v2/BBH": 0.5406, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4517, + "hfopenllm_v2/MMLU-PRO": 0.3369 + } + }, + { + "id": "CohereForAI/c4ai-command-r7b-12-2024", + "name": "c4ai-command-r7b-12-2024", + "developer": "CohereForAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7713, + "hfopenllm_v2/BBH": 0.5503, + "hfopenllm_v2/MATH Level 5": 0.2991, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4125, + "hfopenllm_v2/MMLU-PRO": 0.3572 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Columbia-NLP.json b/data/developers/Columbia-NLP.json new file mode 100644 index 0000000000000000000000000000000000000000..11f5fed45eb39522787aef4e964f3d3e28d320c0 --- /dev/null +++ b/data/developers/Columbia-NLP.json @@ -0,0 +1,89 @@ +{ + "developer": "Columbia-NLP", + "models": [ + { + "id": "Columbia-NLP/LION-Gemma-2b-dpo-v1.0", + "name": "LION-Gemma-2b-dpo-v1.0", + "developer": "Columbia-NLP", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3102, + "hfopenllm_v2/BBH": 0.3881, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.4081, + "hfopenllm_v2/MMLU-PRO": 0.1665 + } + }, + { + "id": "Columbia-NLP/LION-Gemma-2b-odpo-v1.0", + "name": "LION-Gemma-2b-odpo-v1.0", + "developer": "Columbia-NLP", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3066, + "hfopenllm_v2/BBH": 0.3896, + "hfopenllm_v2/MATH Level 5": 0.0695, + "hfopenllm_v2/GPQA": 0.2424, + "hfopenllm_v2/MUSR": 0.4279, + "hfopenllm_v2/MMLU-PRO": 0.1692 + } + }, + { + "id": "Columbia-NLP/LION-Gemma-2b-sft-v1.0", + "name": "LION-Gemma-2b-sft-v1.0", + "developer": "Columbia-NLP", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3692, + "hfopenllm_v2/BBH": 0.3879, + "hfopenllm_v2/MATH Level 5": 0.068, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.4027, + "hfopenllm_v2/MMLU-PRO": 0.1782 + } + }, + { + "id": "Columbia-NLP/LION-LLaMA-3-8b-dpo-v1.0", + "name": "LION-LLaMA-3-8b-dpo-v1.0", + "developer": "Columbia-NLP", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4957, + "hfopenllm_v2/BBH": 0.5028, + "hfopenllm_v2/MATH Level 5": 0.1171, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4097, + "hfopenllm_v2/MMLU-PRO": 0.3219 + } + }, + { + "id": "Columbia-NLP/LION-LLaMA-3-8b-odpo-v1.0", + "name": "LION-LLaMA-3-8b-odpo-v1.0", + "developer": "Columbia-NLP", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3968, + "hfopenllm_v2/BBH": 0.5024, + "hfopenllm_v2/MATH Level 5": 0.1065, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4057, + "hfopenllm_v2/MMLU-PRO": 0.3152 + } + }, + { + "id": "Columbia-NLP/LION-LLaMA-3-8b-sft-v1.0", + "name": "LION-LLaMA-3-8b-sft-v1.0", + "developer": "Columbia-NLP", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3817, + "hfopenllm_v2/BBH": 0.5088, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4503, + "hfopenllm_v2/MMLU-PRO": 0.3237 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/CombinHorizon.json b/data/developers/CombinHorizon.json new file mode 100644 index 0000000000000000000000000000000000000000..eb6a06b9fd92a5e569fb90e3790d9c4c915c8bb0 --- /dev/null +++ b/data/developers/CombinHorizon.json @@ -0,0 +1,89 @@ +{ + "developer": "CombinHorizon", + "models": [ + { + "id": "CombinHorizon/Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES", + "name": "Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES", + "developer": "CombinHorizon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.824, + "hfopenllm_v2/BBH": 0.637, + "hfopenllm_v2/MATH Level 5": 0.5317, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.426, + "hfopenllm_v2/MMLU-PRO": 0.4979 + } + }, + { + "id": "CombinHorizon/Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES", + "name": "Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES", + "developer": "CombinHorizon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7564, + "hfopenllm_v2/BBH": 0.5402, + "hfopenllm_v2/MATH Level 5": 0.4932, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4033, + "hfopenllm_v2/MMLU-PRO": 0.4342 + } + }, + { + "id": "CombinHorizon/YiSM-blossom5.1-34B-SLERP", + "name": "YiSM-blossom5.1-34B-SLERP", + "developer": "CombinHorizon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5033, + "hfopenllm_v2/BBH": 0.6208, + "hfopenllm_v2/MATH Level 5": 0.2153, + "hfopenllm_v2/GPQA": 0.3557, + "hfopenllm_v2/MUSR": 0.4413, + "hfopenllm_v2/MMLU-PRO": 0.4741 + } + }, + { + "id": "CombinHorizon/huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES", + "name": "huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES", + "developer": "CombinHorizon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8206, + "hfopenllm_v2/BBH": 0.6929, + "hfopenllm_v2/MATH Level 5": 0.5944, + "hfopenllm_v2/GPQA": 0.3389, + "hfopenllm_v2/MUSR": 0.4207, + "hfopenllm_v2/MMLU-PRO": 0.5721 + } + }, + { + "id": "CombinHorizon/huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES", + "name": "huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES", + "developer": "CombinHorizon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8176, + "hfopenllm_v2/BBH": 0.6336, + "hfopenllm_v2/MATH Level 5": 0.5476, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.426, + "hfopenllm_v2/MMLU-PRO": 0.491 + } + }, + { + "id": "CombinHorizon/zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES", + "name": "zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES", + "developer": "CombinHorizon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8328, + "hfopenllm_v2/BBH": 0.6955, + "hfopenllm_v2/MATH Level 5": 0.5853, + "hfopenllm_v2/GPQA": 0.3674, + "hfopenllm_v2/MUSR": 0.4314, + "hfopenllm_v2/MMLU-PRO": 0.5685 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ContactDoctor.json b/data/developers/ContactDoctor.json new file mode 100644 index 0000000000000000000000000000000000000000..dcb1d2e97c42931745bbb22aed62c3d49a409491 --- /dev/null +++ b/data/developers/ContactDoctor.json @@ -0,0 +1,33 @@ +{ + "developer": "ContactDoctor", + "models": [ + { + "id": "ContactDoctor/Bio-Medical-3B-CoT-012025", + "name": "Bio-Medical-3B-CoT-012025", + "developer": "ContactDoctor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3604, + "hfopenllm_v2/BBH": 0.4383, + "hfopenllm_v2/MATH Level 5": 0.2213, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.3368, + "hfopenllm_v2/MMLU-PRO": 0.2934 + } + }, + { + "id": "ContactDoctor/Bio-Medical-Llama-3-8B", + "name": "Bio-Medical-Llama-3-8B", + "developer": "ContactDoctor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4422, + "hfopenllm_v2/BBH": 0.4863, + "hfopenllm_v2/MATH Level 5": 0.0672, + "hfopenllm_v2/GPQA": 0.3339, + "hfopenllm_v2/MUSR": 0.3514, + "hfopenllm_v2/MMLU-PRO": 0.3648 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ContextualAI.json b/data/developers/ContextualAI.json new file mode 100644 index 0000000000000000000000000000000000000000..3d65c52efee8d8e5caefb34e762d29caa494ab56 --- /dev/null +++ b/data/developers/ContextualAI.json @@ -0,0 +1,231 @@ +{ + "developer": "ContextualAI", + "models": [ + { + "id": "ContextualAI/LMUnit-llama3.1-70b", + "name": "ContextualAI/LMUnit-llama3.1-70b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8054, + "reward-bench/Factuality": 0.8463, + "reward-bench/Precise IF": 0.4875, + "reward-bench/Math": 0.7158, + "reward-bench/Safety": 0.9067, + "reward-bench/Focus": 0.9697, + "reward-bench/Ties": 0.9063 + } + }, + { + "id": "ContextualAI/LMUnit-qwen2.5-72b", + "name": "ContextualAI/LMUnit-qwen2.5-72b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8208, + "reward-bench/Factuality": 0.8716, + "reward-bench/Precise IF": 0.5437, + "reward-bench/Math": 0.7268, + "reward-bench/Safety": 0.9133, + "reward-bench/Focus": 0.9677, + "reward-bench/Ties": 0.9014 + } + }, + { + "id": "ContextualAI/archangel_sft-dpo_llama13b", + "name": "ContextualAI/archangel_sft-dpo_llama13b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.54, + "reward-bench/Chat": 0.7123, + "reward-bench/Chat Hard": 0.4298, + "reward-bench/Safety": 0.5649, + "reward-bench/Reasoning": 0.4401, + "reward-bench/Prior Sets (0.5 weight)": 0.5656 + } + }, + { + "id": "ContextualAI/archangel_sft-dpo_llama30b", + "name": "ContextualAI/archangel_sft-dpo_llama30b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5618, + "reward-bench/Chat": 0.6927, + "reward-bench/Chat Hard": 0.4474, + "reward-bench/Safety": 0.6284, + "reward-bench/Reasoning": 0.4745, + "reward-bench/Prior Sets (0.5 weight)": 0.5705 + } + }, + { + "id": "ContextualAI/archangel_sft-dpo_llama7b", + "name": "ContextualAI/archangel_sft-dpo_llama7b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5304, + "reward-bench/Chat": 0.5782, + "reward-bench/Chat Hard": 0.4452, + "reward-bench/Safety": 0.5203, + "reward-bench/Reasoning": 0.5658, + "reward-bench/Prior Sets (0.5 weight)": 0.5544 + } + }, + { + "id": "ContextualAI/archangel_sft-dpo_pythia1-4b", + "name": "ContextualAI/archangel_sft-dpo_pythia1-4b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5233, + "reward-bench/Chat": 0.6397, + "reward-bench/Chat Hard": 0.3728, + "reward-bench/Safety": 0.5041, + "reward-bench/Reasoning": 0.5672, + "reward-bench/Prior Sets (0.5 weight)": 0.5427 + } + }, + { + "id": "ContextualAI/archangel_sft-dpo_pythia12-0b", + "name": "ContextualAI/archangel_sft-dpo_pythia12-0b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5009, + "reward-bench/Chat": 0.6676, + "reward-bench/Chat Hard": 0.364, + "reward-bench/Safety": 0.5432, + "reward-bench/Reasoning": 0.4139, + "reward-bench/Prior Sets (0.5 weight)": 0.5303 + } + }, + { + "id": "ContextualAI/archangel_sft-dpo_pythia2-8b", + "name": "ContextualAI/archangel_sft-dpo_pythia2-8b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5286, + "reward-bench/Chat": 0.8073, + "reward-bench/Chat Hard": 0.3355, + "reward-bench/Safety": 0.4473, + "reward-bench/Reasoning": 0.5135, + "reward-bench/Prior Sets (0.5 weight)": 0.5501 + } + }, + { + "id": "ContextualAI/archangel_sft-dpo_pythia6-9b", + "name": "ContextualAI/archangel_sft-dpo_pythia6-9b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5263, + "reward-bench/Chat": 0.7486, + "reward-bench/Chat Hard": 0.3421, + "reward-bench/Safety": 0.5176, + "reward-bench/Reasoning": 0.4847, + "reward-bench/Prior Sets (0.5 weight)": 0.551 + } + }, + { + "id": "ContextualAI/archangel_sft-kto_llama13b", + "name": "ContextualAI/archangel_sft-kto_llama13b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5952, + "reward-bench/Chat": 0.8408, + "reward-bench/Chat Hard": 0.3772, + "reward-bench/Safety": 0.4649, + "reward-bench/Reasoning": 0.7077, + "reward-bench/Prior Sets (0.5 weight)": 0.576 + } + }, + { + "id": "ContextualAI/archangel_sft-kto_llama30b", + "name": "ContextualAI/archangel_sft-kto_llama30b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5901, + "reward-bench/Chat": 0.8436, + "reward-bench/Chat Hard": 0.4057, + "reward-bench/Safety": 0.6054, + "reward-bench/Reasoning": 0.5075, + "reward-bench/Prior Sets (0.5 weight)": 0.5862 + } + }, + { + "id": "ContextualAI/archangel_sft-kto_llama7b", + "name": "ContextualAI/archangel_sft-kto_llama7b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5388, + "reward-bench/Chat": 0.5587, + "reward-bench/Chat Hard": 0.4364, + "reward-bench/Safety": 0.4568, + "reward-bench/Reasoning": 0.6941, + "reward-bench/Prior Sets (0.5 weight)": 0.5575 + } + }, + { + "id": "ContextualAI/archangel_sft-kto_pythia1-4b", + "name": "ContextualAI/archangel_sft-kto_pythia1-4b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5581, + "reward-bench/Chat": 0.6844, + "reward-bench/Chat Hard": 0.3794, + "reward-bench/Safety": 0.5257, + "reward-bench/Reasoning": 0.6447, + "reward-bench/Prior Sets (0.5 weight)": 0.5546 + } + }, + { + "id": "ContextualAI/archangel_sft-kto_pythia12-0b", + "name": "ContextualAI/archangel_sft-kto_pythia12-0b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5053, + "reward-bench/Chat": 0.7486, + "reward-bench/Chat Hard": 0.3618, + "reward-bench/Safety": 0.4757, + "reward-bench/Reasoning": 0.4127, + "reward-bench/Prior Sets (0.5 weight)": 0.55 + } + }, + { + "id": "ContextualAI/archangel_sft-kto_pythia2-8b", + "name": "ContextualAI/archangel_sft-kto_pythia2-8b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5497, + "reward-bench/Chat": 0.757, + "reward-bench/Chat Hard": 0.3421, + "reward-bench/Safety": 0.4743, + "reward-bench/Reasoning": 0.6216, + "reward-bench/Prior Sets (0.5 weight)": 0.557 + } + }, + { + "id": "ContextualAI/archangel_sft-kto_pythia6-9b", + "name": "ContextualAI/archangel_sft-kto_pythia6-9b", + "developer": "ContextualAI", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5561, + "reward-bench/Chat": 0.7765, + "reward-bench/Chat Hard": 0.3618, + "reward-bench/Safety": 0.5365, + "reward-bench/Reasoning": 0.5415, + "reward-bench/Prior Sets (0.5 weight)": 0.5723 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/CoolSpring.json b/data/developers/CoolSpring.json new file mode 100644 index 0000000000000000000000000000000000000000..54c42a45d8ef264f4abde70878b766561279410a --- /dev/null +++ b/data/developers/CoolSpring.json @@ -0,0 +1,47 @@ +{ + "developer": "CoolSpring", + "models": [ + { + "id": "CoolSpring/Qwen2-0.5B-Abyme", + "name": "Qwen2-0.5B-Abyme", + "developer": "CoolSpring", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1915, + "hfopenllm_v2/BBH": 0.2862, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3542, + "hfopenllm_v2/MMLU-PRO": 0.1333 + } + }, + { + "id": "CoolSpring/Qwen2-0.5B-Abyme-merge2", + "name": "Qwen2-0.5B-Abyme-merge2", + "developer": "CoolSpring", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2022, + "hfopenllm_v2/BBH": 0.2994, + "hfopenllm_v2/MATH Level 5": 0.0332, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3687, + "hfopenllm_v2/MMLU-PRO": 0.1489 + } + }, + { + "id": "CoolSpring/Qwen2-0.5B-Abyme-merge3", + "name": "Qwen2-0.5B-Abyme-merge3", + "developer": "CoolSpring", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2386, + "hfopenllm_v2/BBH": 0.3003, + "hfopenllm_v2/MATH Level 5": 0.0317, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3501, + "hfopenllm_v2/MMLU-PRO": 0.15 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Corianas.json b/data/developers/Corianas.json new file mode 100644 index 0000000000000000000000000000000000000000..4e3bbab9987994ea0c181117c7ee6b96db89c89b --- /dev/null +++ b/data/developers/Corianas.json @@ -0,0 +1,47 @@ +{ + "developer": "Corianas", + "models": [ + { + "id": "Corianas/Neural-Mistral-7B", + "name": "Neural-Mistral-7B", + "developer": "Corianas", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5489, + "hfopenllm_v2/BBH": 0.4428, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3873, + "hfopenllm_v2/MMLU-PRO": 0.2738 + } + }, + { + "id": "Corianas/Quokka_2.7b", + "name": "Quokka_2.7b", + "developer": "Corianas", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1749, + "hfopenllm_v2/BBH": 0.3055, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3908, + "hfopenllm_v2/MMLU-PRO": 0.1145 + } + }, + { + "id": "Corianas/llama-3-reactor", + "name": "llama-3-reactor", + "developer": "Corianas", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.23, + "hfopenllm_v2/BBH": 0.4457, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.3977, + "hfopenllm_v2/MMLU-PRO": 0.2801 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/CortexLM.json b/data/developers/CortexLM.json new file mode 100644 index 0000000000000000000000000000000000000000..71f8ef9440c41c73700314b9148c2374292ded16 --- /dev/null +++ b/data/developers/CortexLM.json @@ -0,0 +1,19 @@ +{ + "developer": "CortexLM", + "models": [ + { + "id": "CortexLM/btlm-7b-base-v0.2", + "name": "btlm-7b-base-v0.2", + "developer": "CortexLM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1483, + "hfopenllm_v2/BBH": 0.4006, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3846, + "hfopenllm_v2/MMLU-PRO": 0.235 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Cran-May.json b/data/developers/Cran-May.json new file mode 100644 index 0000000000000000000000000000000000000000..ea0510236fd227ea9e9fb0b75d4b55c578af1db6 --- /dev/null +++ b/data/developers/Cran-May.json @@ -0,0 +1,103 @@ +{ + "developer": "Cran-May", + "models": [ + { + "id": "Cran-May/SCE-2-24B", + "name": "SCE-2-24B", + "developer": "Cran-May", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5866, + "hfopenllm_v2/BBH": 0.6265, + "hfopenllm_v2/MATH Level 5": 0.1896, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.4528, + "hfopenllm_v2/MMLU-PRO": 0.4612 + } + }, + { + "id": "Cran-May/SCE-3-24B", + "name": "SCE-3-24B", + "developer": "Cran-May", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5465, + "hfopenllm_v2/BBH": 0.5973, + "hfopenllm_v2/MATH Level 5": 0.1881, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.4435, + "hfopenllm_v2/MMLU-PRO": 0.4647 + } + }, + { + "id": "Cran-May/T.E-8.1", + "name": "T.E-8.1", + "developer": "Cran-May", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7077, + "hfopenllm_v2/BBH": 0.5582, + "hfopenllm_v2/MATH Level 5": 0.4456, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4505, + "hfopenllm_v2/MMLU-PRO": 0.4432 + } + }, + { + "id": "Cran-May/merge_model_20250308_2", + "name": "merge_model_20250308_2", + "developer": "Cran-May", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5932, + "hfopenllm_v2/BBH": 0.6585, + "hfopenllm_v2/MATH Level 5": 0.4381, + "hfopenllm_v2/GPQA": 0.3909, + "hfopenllm_v2/MUSR": 0.4794, + "hfopenllm_v2/MMLU-PRO": 0.542 + } + }, + { + "id": "Cran-May/merge_model_20250308_3", + "name": "merge_model_20250308_3", + "developer": "Cran-May", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6018, + "hfopenllm_v2/BBH": 0.6271, + "hfopenllm_v2/MATH Level 5": 0.2545, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.432, + "hfopenllm_v2/MMLU-PRO": 0.4962 + } + }, + { + "id": "Cran-May/merge_model_20250308_4", + "name": "merge_model_20250308_4", + "developer": "Cran-May", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.454, + "hfopenllm_v2/BBH": 0.6664, + "hfopenllm_v2/MATH Level 5": 0.4199, + "hfopenllm_v2/GPQA": 0.3977, + "hfopenllm_v2/MUSR": 0.4688, + "hfopenllm_v2/MMLU-PRO": 0.5367 + } + }, + { + "id": "Cran-May/tempmotacilla-cinerea-0308", + "name": "tempmotacilla-cinerea-0308", + "developer": "Cran-May", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8085, + "hfopenllm_v2/BBH": 0.6551, + "hfopenllm_v2/MATH Level 5": 0.5551, + "hfopenllm_v2/GPQA": 0.3624, + "hfopenllm_v2/MUSR": 0.4208, + "hfopenllm_v2/MMLU-PRO": 0.525 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/CreitinGameplays.json b/data/developers/CreitinGameplays.json new file mode 100644 index 0000000000000000000000000000000000000000..4c545e3e9a2cc1a1e528e82b73d82085837f071f --- /dev/null +++ b/data/developers/CreitinGameplays.json @@ -0,0 +1,19 @@ +{ + "developer": "CreitinGameplays", + "models": [ + { + "id": "CreitinGameplays/Llama-3.1-8B-R1-v0.1", + "name": "Llama-3.1-8B-R1-v0.1", + "developer": "CreitinGameplays", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3235, + "hfopenllm_v2/BBH": 0.3057, + "hfopenllm_v2/MATH Level 5": 0.1813, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3622, + "hfopenllm_v2/MMLU-PRO": 0.1252 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/CultriX.json b/data/developers/CultriX.json new file mode 100644 index 0000000000000000000000000000000000000000..b0d6e4094097188aa13dc09b918bc606e8409d8a --- /dev/null +++ b/data/developers/CultriX.json @@ -0,0 +1,453 @@ +{ + "developer": "CultriX", + "models": [ + { + "id": "CultriX/Qwen2.5-14B-Broca", + "name": "Qwen2.5-14B-Broca", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5604, + "hfopenllm_v2/BBH": 0.6527, + "hfopenllm_v2/MATH Level 5": 0.358, + "hfopenllm_v2/GPQA": 0.3867, + "hfopenllm_v2/MUSR": 0.4767, + "hfopenllm_v2/MMLU-PRO": 0.5364 + } + }, + { + "id": "CultriX/Qwen2.5-14B-BrocaV9", + "name": "Qwen2.5-14B-BrocaV9", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6763, + "hfopenllm_v2/BBH": 0.6391, + "hfopenllm_v2/MATH Level 5": 0.3814, + "hfopenllm_v2/GPQA": 0.3641, + "hfopenllm_v2/MUSR": 0.469, + "hfopenllm_v2/MMLU-PRO": 0.5331 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Brocav3", + "name": "Qwen2.5-14B-Brocav3", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6952, + "hfopenllm_v2/BBH": 0.6452, + "hfopenllm_v2/MATH Level 5": 0.3875, + "hfopenllm_v2/GPQA": 0.3591, + "hfopenllm_v2/MUSR": 0.4756, + "hfopenllm_v2/MMLU-PRO": 0.5317 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Brocav6", + "name": "Qwen2.5-14B-Brocav6", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6995, + "hfopenllm_v2/BBH": 0.6389, + "hfopenllm_v2/MATH Level 5": 0.3875, + "hfopenllm_v2/GPQA": 0.3674, + "hfopenllm_v2/MUSR": 0.4742, + "hfopenllm_v2/MMLU-PRO": 0.5319 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Brocav7", + "name": "Qwen2.5-14B-Brocav7", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6724, + "hfopenllm_v2/BBH": 0.6444, + "hfopenllm_v2/MATH Level 5": 0.3844, + "hfopenllm_v2/GPQA": 0.3674, + "hfopenllm_v2/MUSR": 0.4796, + "hfopenllm_v2/MMLU-PRO": 0.5258 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Emerged", + "name": "Qwen2.5-14B-Emerged", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7, + "hfopenllm_v2/BBH": 0.626, + "hfopenllm_v2/MATH Level 5": 0.3248, + "hfopenllm_v2/GPQA": 0.3574, + "hfopenllm_v2/MUSR": 0.4691, + "hfopenllm_v2/MMLU-PRO": 0.5186 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Emergedv3", + "name": "Qwen2.5-14B-Emergedv3", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6388, + "hfopenllm_v2/BBH": 0.6191, + "hfopenllm_v2/MATH Level 5": 0.4358, + "hfopenllm_v2/GPQA": 0.3607, + "hfopenllm_v2/MUSR": 0.4728, + "hfopenllm_v2/MMLU-PRO": 0.5174 + } + }, + { + "id": "CultriX/Qwen2.5-14B-FinalMerge", + "name": "Qwen2.5-14B-FinalMerge", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4891, + "hfopenllm_v2/BBH": 0.5715, + "hfopenllm_v2/MATH Level 5": 0.3814, + "hfopenllm_v2/GPQA": 0.3549, + "hfopenllm_v2/MUSR": 0.4379, + "hfopenllm_v2/MMLU-PRO": 0.4574 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Hyper", + "name": "Qwen2.5-14B-Hyper", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5391, + "hfopenllm_v2/BBH": 0.6507, + "hfopenllm_v2/MATH Level 5": 0.3437, + "hfopenllm_v2/GPQA": 0.3918, + "hfopenllm_v2/MUSR": 0.4898, + "hfopenllm_v2/MMLU-PRO": 0.5374 + } + }, + { + "id": "CultriX/Qwen2.5-14B-HyperMarck-dl", + "name": "Qwen2.5-14B-HyperMarck-dl", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.665, + "hfopenllm_v2/BBH": 0.6096, + "hfopenllm_v2/MATH Level 5": 0.5287, + "hfopenllm_v2/GPQA": 0.3674, + "hfopenllm_v2/MUSR": 0.4416, + "hfopenllm_v2/MMLU-PRO": 0.5091 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Hyperionv3", + "name": "Qwen2.5-14B-Hyperionv3", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6836, + "hfopenllm_v2/BBH": 0.6522, + "hfopenllm_v2/MATH Level 5": 0.3701, + "hfopenllm_v2/GPQA": 0.3708, + "hfopenllm_v2/MUSR": 0.473, + "hfopenllm_v2/MMLU-PRO": 0.534 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Hyperionv4", + "name": "Qwen2.5-14B-Hyperionv4", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5416, + "hfopenllm_v2/BBH": 0.6472, + "hfopenllm_v2/MATH Level 5": 0.3474, + "hfopenllm_v2/GPQA": 0.3977, + "hfopenllm_v2/MUSR": 0.4832, + "hfopenllm_v2/MMLU-PRO": 0.5364 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Hyperionv5", + "name": "Qwen2.5-14B-Hyperionv5", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6729, + "hfopenllm_v2/BBH": 0.6443, + "hfopenllm_v2/MATH Level 5": 0.3822, + "hfopenllm_v2/GPQA": 0.3716, + "hfopenllm_v2/MUSR": 0.4795, + "hfopenllm_v2/MMLU-PRO": 0.5302 + } + }, + { + "id": "CultriX/Qwen2.5-14B-MegaMerge-pt2", + "name": "Qwen2.5-14B-MegaMerge-pt2", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5683, + "hfopenllm_v2/BBH": 0.6578, + "hfopenllm_v2/MATH Level 5": 0.3995, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.4729, + "hfopenllm_v2/MMLU-PRO": 0.5421 + } + }, + { + "id": "CultriX/Qwen2.5-14B-MergeStock", + "name": "Qwen2.5-14B-MergeStock", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5685, + "hfopenllm_v2/BBH": 0.6579, + "hfopenllm_v2/MATH Level 5": 0.4147, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.4676, + "hfopenllm_v2/MMLU-PRO": 0.5396 + } + }, + { + "id": "CultriX/Qwen2.5-14B-ReasoningMerge", + "name": "Qwen2.5-14B-ReasoningMerge", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4605, + "hfopenllm_v2/BBH": 0.6578, + "hfopenllm_v2/MATH Level 5": 0.5204, + "hfopenllm_v2/GPQA": 0.4077, + "hfopenllm_v2/MUSR": 0.5166, + "hfopenllm_v2/MMLU-PRO": 0.5345 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Ultimav2", + "name": "Qwen2.5-14B-Ultimav2", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.55, + "hfopenllm_v2/BBH": 0.6555, + "hfopenllm_v2/MATH Level 5": 0.3844, + "hfopenllm_v2/GPQA": 0.3851, + "hfopenllm_v2/MUSR": 0.4966, + "hfopenllm_v2/MMLU-PRO": 0.5417 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Unity", + "name": "Qwen2.5-14B-Unity", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6739, + "hfopenllm_v2/BBH": 0.602, + "hfopenllm_v2/MATH Level 5": 0.4313, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4679, + "hfopenllm_v2/MMLU-PRO": 0.5076 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Wernicke", + "name": "Qwen2.5-14B-Wernicke", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5235, + "hfopenllm_v2/BBH": 0.6568, + "hfopenllm_v2/MATH Level 5": 0.3814, + "hfopenllm_v2/GPQA": 0.3935, + "hfopenllm_v2/MUSR": 0.4689, + "hfopenllm_v2/MMLU-PRO": 0.5424 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Wernicke-SFT", + "name": "Qwen2.5-14B-Wernicke-SFT", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4937, + "hfopenllm_v2/BBH": 0.6461, + "hfopenllm_v2/MATH Level 5": 0.3595, + "hfopenllm_v2/GPQA": 0.354, + "hfopenllm_v2/MUSR": 0.39, + "hfopenllm_v2/MMLU-PRO": 0.507 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Wernicke-SLERP", + "name": "Qwen2.5-14B-Wernicke-SLERP", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5589, + "hfopenllm_v2/BBH": 0.6441, + "hfopenllm_v2/MATH Level 5": 0.4486, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.414, + "hfopenllm_v2/MMLU-PRO": 0.5094 + } + }, + { + "id": "CultriX/Qwen2.5-14B-Wernickev3", + "name": "Qwen2.5-14B-Wernickev3", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7048, + "hfopenllm_v2/BBH": 0.6184, + "hfopenllm_v2/MATH Level 5": 0.3542, + "hfopenllm_v2/GPQA": 0.3624, + "hfopenllm_v2/MUSR": 0.4717, + "hfopenllm_v2/MMLU-PRO": 0.5151 + } + }, + { + "id": "CultriX/Qwen2.5-14B-partialmergept1", + "name": "Qwen2.5-14B-partialmergept1", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6337, + "hfopenllm_v2/BBH": 0.6151, + "hfopenllm_v2/MATH Level 5": 0.4539, + "hfopenllm_v2/GPQA": 0.3616, + "hfopenllm_v2/MUSR": 0.4757, + "hfopenllm_v2/MMLU-PRO": 0.5208 + } + }, + { + "id": "CultriX/Qwenfinity-2.5-14B", + "name": "Qwenfinity-2.5-14B", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4814, + "hfopenllm_v2/BBH": 0.5655, + "hfopenllm_v2/MATH Level 5": 0.4101, + "hfopenllm_v2/GPQA": 0.349, + "hfopenllm_v2/MUSR": 0.4506, + "hfopenllm_v2/MMLU-PRO": 0.4498 + } + }, + { + "id": "CultriX/Qwestion-14B", + "name": "Qwestion-14B", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6318, + "hfopenllm_v2/BBH": 0.645, + "hfopenllm_v2/MATH Level 5": 0.3724, + "hfopenllm_v2/GPQA": 0.3683, + "hfopenllm_v2/MUSR": 0.4636, + "hfopenllm_v2/MMLU-PRO": 0.5422 + } + }, + { + "id": "CultriX/SeQwence-14B", + "name": "SeQwence-14B", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5352, + "hfopenllm_v2/BBH": 0.6506, + "hfopenllm_v2/MATH Level 5": 0.3535, + "hfopenllm_v2/GPQA": 0.3607, + "hfopenllm_v2/MUSR": 0.4666, + "hfopenllm_v2/MMLU-PRO": 0.5419 + } + }, + { + "id": "CultriX/SeQwence-14B-EvolMerge", + "name": "SeQwence-14B-EvolMerge", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5382, + "hfopenllm_v2/BBH": 0.6572, + "hfopenllm_v2/MATH Level 5": 0.3671, + "hfopenllm_v2/GPQA": 0.3809, + "hfopenllm_v2/MUSR": 0.4821, + "hfopenllm_v2/MMLU-PRO": 0.5419 + } + }, + { + "id": "CultriX/SeQwence-14B-EvolMergev1", + "name": "SeQwence-14B-EvolMergev1", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5555, + "hfopenllm_v2/BBH": 0.6546, + "hfopenllm_v2/MATH Level 5": 0.4215, + "hfopenllm_v2/GPQA": 0.3767, + "hfopenllm_v2/MUSR": 0.4623, + "hfopenllm_v2/MMLU-PRO": 0.5393 + } + }, + { + "id": "CultriX/SeQwence-14B-v5", + "name": "SeQwence-14B-v5", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.592, + "hfopenllm_v2/BBH": 0.6517, + "hfopenllm_v2/MATH Level 5": 0.3308, + "hfopenllm_v2/GPQA": 0.37, + "hfopenllm_v2/MUSR": 0.4714, + "hfopenllm_v2/MMLU-PRO": 0.5415 + } + }, + { + "id": "CultriX/SeQwence-14Bv1", + "name": "SeQwence-14Bv1", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6678, + "hfopenllm_v2/BBH": 0.6345, + "hfopenllm_v2/MATH Level 5": 0.361, + "hfopenllm_v2/GPQA": 0.3616, + "hfopenllm_v2/MUSR": 0.4704, + "hfopenllm_v2/MMLU-PRO": 0.532 + } + }, + { + "id": "CultriX/SeQwence-14Bv2", + "name": "SeQwence-14Bv2", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5786, + "hfopenllm_v2/BBH": 0.6305, + "hfopenllm_v2/MATH Level 5": 0.4758, + "hfopenllm_v2/GPQA": 0.3607, + "hfopenllm_v2/MUSR": 0.4601, + "hfopenllm_v2/MMLU-PRO": 0.5334 + } + }, + { + "id": "CultriX/SeQwence-14Bv3", + "name": "SeQwence-14Bv3", + "developer": "CultriX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5719, + "hfopenllm_v2/BBH": 0.6302, + "hfopenllm_v2/MATH Level 5": 0.4766, + "hfopenllm_v2/GPQA": 0.3649, + "hfopenllm_v2/MUSR": 0.4624, + "hfopenllm_v2/MMLU-PRO": 0.5335 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/DRXD1000.json b/data/developers/DRXD1000.json new file mode 100644 index 0000000000000000000000000000000000000000..21836df503c7975786feb29506d0fd2ff71c7fa0 --- /dev/null +++ b/data/developers/DRXD1000.json @@ -0,0 +1,33 @@ +{ + "developer": "DRXD1000", + "models": [ + { + "id": "DRXD1000/Atlas-7B", + "name": "Atlas-7B", + "developer": "DRXD1000", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3704, + "hfopenllm_v2/BBH": 0.3302, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3342, + "hfopenllm_v2/MMLU-PRO": 0.1401 + } + }, + { + "id": "DRXD1000/Phoenix-7B", + "name": "Phoenix-7B", + "developer": "DRXD1000", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.321, + "hfopenllm_v2/BBH": 0.3932, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.3849, + "hfopenllm_v2/MMLU-PRO": 0.2343 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/DUAL-GPO.json b/data/developers/DUAL-GPO.json new file mode 100644 index 0000000000000000000000000000000000000000..8cb18864877c43a42607c65cff7a0e6df8aeac00 --- /dev/null +++ b/data/developers/DUAL-GPO.json @@ -0,0 +1,19 @@ +{ + "developer": "DUAL-GPO", + "models": [ + { + "id": "DUAL-GPO/zephyr-7b-ipo-0k-15k-i1", + "name": "zephyr-7b-ipo-0k-15k-i1", + "developer": "DUAL-GPO", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2756, + "hfopenllm_v2/BBH": 0.4473, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4173, + "hfopenllm_v2/MMLU-PRO": 0.313 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/DZgas.json b/data/developers/DZgas.json new file mode 100644 index 0000000000000000000000000000000000000000..533207e2f2eb79f315d16abfbdebb6768930c84e --- /dev/null +++ b/data/developers/DZgas.json @@ -0,0 +1,19 @@ +{ + "developer": "DZgas", + "models": [ + { + "id": "DZgas/GIGABATEMAN-7B", + "name": "GIGABATEMAN-7B", + "developer": "DZgas", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4607, + "hfopenllm_v2/BBH": 0.5032, + "hfopenllm_v2/MATH Level 5": 0.0551, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4328, + "hfopenllm_v2/MMLU-PRO": 0.3177 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Daemontatox.json b/data/developers/Daemontatox.json new file mode 100644 index 0000000000000000000000000000000000000000..3c61d79ec519abf4d73b74e813ac31f059bd869c --- /dev/null +++ b/data/developers/Daemontatox.json @@ -0,0 +1,453 @@ +{ + "developer": "Daemontatox", + "models": [ + { + "id": "Daemontatox/AetherDrake-SFT", + "name": "AetherDrake-SFT", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4813, + "hfopenllm_v2/BBH": 0.4872, + "hfopenllm_v2/MATH Level 5": 0.1511, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4088, + "hfopenllm_v2/MMLU-PRO": 0.3499 + } + }, + { + "id": "Daemontatox/AetherSett", + "name": "AetherSett", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.537, + "hfopenllm_v2/BBH": 0.5452, + "hfopenllm_v2/MATH Level 5": 0.3973, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4603, + "hfopenllm_v2/MMLU-PRO": 0.4279 + } + }, + { + "id": "Daemontatox/AetherTOT", + "name": "AetherTOT", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4383, + "hfopenllm_v2/BBH": 0.5034, + "hfopenllm_v2/MATH Level 5": 0.1443, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4052, + "hfopenllm_v2/MMLU-PRO": 0.3778 + } + }, + { + "id": "Daemontatox/AetherUncensored", + "name": "AetherUncensored", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4042, + "hfopenllm_v2/BBH": 0.4463, + "hfopenllm_v2/MATH Level 5": 0.145, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3747, + "hfopenllm_v2/MMLU-PRO": 0.271 + } + }, + { + "id": "Daemontatox/Cogito-MIS", + "name": "Cogito-MIS", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1815, + "hfopenllm_v2/BBH": 0.506, + "hfopenllm_v2/MATH Level 5": 0.0861, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3768, + "hfopenllm_v2/MMLU-PRO": 0.1435 + } + }, + { + "id": "Daemontatox/CogitoDistil", + "name": "CogitoDistil", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2776, + "hfopenllm_v2/BBH": 0.3677, + "hfopenllm_v2/MATH Level 5": 0.3927, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3755, + "hfopenllm_v2/MMLU-PRO": 0.2625 + } + }, + { + "id": "Daemontatox/CogitoZ", + "name": "CogitoZ", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3967, + "hfopenllm_v2/BBH": 0.6734, + "hfopenllm_v2/MATH Level 5": 0.5242, + "hfopenllm_v2/GPQA": 0.3951, + "hfopenllm_v2/MUSR": 0.4793, + "hfopenllm_v2/MMLU-PRO": 0.5593 + } + }, + { + "id": "Daemontatox/CogitoZ14", + "name": "CogitoZ14", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6637, + "hfopenllm_v2/BBH": 0.6298, + "hfopenllm_v2/MATH Level 5": 0.4222, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4059, + "hfopenllm_v2/MMLU-PRO": 0.3999 + } + }, + { + "id": "Daemontatox/DocumentCogito", + "name": "DocumentCogito", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.777, + "hfopenllm_v2/BBH": 0.5187, + "hfopenllm_v2/MATH Level 5": 0.2198, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.3911, + "hfopenllm_v2/MMLU-PRO": 0.3738 + } + }, + { + "id": "Daemontatox/Llama3.3-70B-CogniLink", + "name": "Llama3.3-70B-CogniLink", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6931, + "hfopenllm_v2/BBH": 0.6668, + "hfopenllm_v2/MATH Level 5": 0.4139, + "hfopenllm_v2/GPQA": 0.4455, + "hfopenllm_v2/MUSR": 0.4877, + "hfopenllm_v2/MMLU-PRO": 0.5173 + } + }, + { + "id": "Daemontatox/Llama_cot", + "name": "Llama_cot", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7549, + "hfopenllm_v2/BBH": 0.4838, + "hfopenllm_v2/MATH Level 5": 0.2024, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.3872, + "hfopenllm_v2/MMLU-PRO": 0.3518 + } + }, + { + "id": "Daemontatox/MawaredT1", + "name": "MawaredT1", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4199, + "hfopenllm_v2/BBH": 0.5215, + "hfopenllm_v2/MATH Level 5": 0.3021, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4702, + "hfopenllm_v2/MMLU-PRO": 0.4718 + } + }, + { + "id": "Daemontatox/Mini_QwQ", + "name": "Mini_QwQ", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4497, + "hfopenllm_v2/BBH": 0.5549, + "hfopenllm_v2/MATH Level 5": 0.4192, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4682, + "hfopenllm_v2/MMLU-PRO": 0.4373 + } + }, + { + "id": "Daemontatox/NemoR", + "name": "NemoR", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2287, + "hfopenllm_v2/BBH": 0.5194, + "hfopenllm_v2/MATH Level 5": 0.0831, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.3908, + "hfopenllm_v2/MMLU-PRO": 0.329 + } + }, + { + "id": "Daemontatox/PathFinderAI2.0", + "name": "PathFinderAI2.0", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4541, + "hfopenllm_v2/BBH": 0.6658, + "hfopenllm_v2/MATH Level 5": 0.5076, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4216, + "hfopenllm_v2/MMLU-PRO": 0.5547 + } + }, + { + "id": "Daemontatox/PathFinderAi3.0", + "name": "PathFinderAi3.0", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4271, + "hfopenllm_v2/BBH": 0.6884, + "hfopenllm_v2/MATH Level 5": 0.5045, + "hfopenllm_v2/GPQA": 0.4086, + "hfopenllm_v2/MUSR": 0.4807, + "hfopenllm_v2/MMLU-PRO": 0.5757 + } + }, + { + "id": "Daemontatox/PathfinderAI", + "name": "PathfinderAI", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3745, + "hfopenllm_v2/BBH": 0.6668, + "hfopenllm_v2/MATH Level 5": 0.4758, + "hfopenllm_v2/GPQA": 0.3943, + "hfopenllm_v2/MUSR": 0.4858, + "hfopenllm_v2/MMLU-PRO": 0.5593 + } + }, + { + "id": "Daemontatox/Phi-4-COT", + "name": "Phi-4-COT", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1793, + "hfopenllm_v2/BBH": 0.6173, + "hfopenllm_v2/MATH Level 5": 0.2243, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.453, + "hfopenllm_v2/MMLU-PRO": 0.5005 + } + }, + { + "id": "Daemontatox/PixelParse_AI", + "name": "PixelParse_AI", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4383, + "hfopenllm_v2/BBH": 0.5034, + "hfopenllm_v2/MATH Level 5": 0.1473, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4052, + "hfopenllm_v2/MMLU-PRO": 0.3778 + } + }, + { + "id": "Daemontatox/RA2.0", + "name": "RA2.0", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3784, + "hfopenllm_v2/BBH": 0.4889, + "hfopenllm_v2/MATH Level 5": 0.3837, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4091, + "hfopenllm_v2/MMLU-PRO": 0.2616 + } + }, + { + "id": "Daemontatox/RA_Reasoner", + "name": "RA_Reasoner", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5592, + "hfopenllm_v2/BBH": 0.6054, + "hfopenllm_v2/MATH Level 5": 0.2122, + "hfopenllm_v2/GPQA": 0.3314, + "hfopenllm_v2/MUSR": 0.3964, + "hfopenllm_v2/MMLU-PRO": 0.43 + } + }, + { + "id": "Daemontatox/RA_Reasoner2.0", + "name": "RA_Reasoner2.0", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5366, + "hfopenllm_v2/BBH": 0.6062, + "hfopenllm_v2/MATH Level 5": 0.2311, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.3884, + "hfopenllm_v2/MMLU-PRO": 0.4353 + } + }, + { + "id": "Daemontatox/ReasonTest", + "name": "ReasonTest", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.408, + "hfopenllm_v2/BBH": 0.5435, + "hfopenllm_v2/MATH Level 5": 0.2137, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4315, + "hfopenllm_v2/MMLU-PRO": 0.4272 + } + }, + { + "id": "Daemontatox/Research_PathfinderAI", + "name": "Research_PathfinderAI", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3457, + "hfopenllm_v2/BBH": 0.2872, + "hfopenllm_v2/MATH Level 5": 0.1699, + "hfopenllm_v2/GPQA": 0.2408, + "hfopenllm_v2/MUSR": 0.3394, + "hfopenllm_v2/MMLU-PRO": 0.113 + } + }, + { + "id": "Daemontatox/SphinX", + "name": "SphinX", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5725, + "hfopenllm_v2/BBH": 0.5441, + "hfopenllm_v2/MATH Level 5": 0.3082, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4405, + "hfopenllm_v2/MMLU-PRO": 0.4366 + } + }, + { + "id": "Daemontatox/Sphinx2.0", + "name": "Sphinx2.0", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7123, + "hfopenllm_v2/BBH": 0.6473, + "hfopenllm_v2/MATH Level 5": 0.4018, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.426, + "hfopenllm_v2/MMLU-PRO": 0.5184 + } + }, + { + "id": "Daemontatox/TinySphinx", + "name": "TinySphinx", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2567, + "hfopenllm_v2/BBH": 0.331, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3328, + "hfopenllm_v2/MMLU-PRO": 0.1698 + } + }, + { + "id": "Daemontatox/TinySphinx2.0", + "name": "TinySphinx2.0", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2535, + "hfopenllm_v2/BBH": 0.3168, + "hfopenllm_v2/MATH Level 5": 0.0325, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3382, + "hfopenllm_v2/MMLU-PRO": 0.1731 + } + }, + { + "id": "Daemontatox/Zirel-7B-Math", + "name": "Zirel-7B-Math", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6639, + "hfopenllm_v2/BBH": 0.5448, + "hfopenllm_v2/MATH Level 5": 0.1979, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.4789, + "hfopenllm_v2/MMLU-PRO": 0.4237 + } + }, + { + "id": "Daemontatox/Zirel_1.5", + "name": "Zirel_1.5", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4168, + "hfopenllm_v2/BBH": 0.3985, + "hfopenllm_v2/MATH Level 5": 0.1133, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3658, + "hfopenllm_v2/MMLU-PRO": 0.2143 + } + }, + { + "id": "Daemontatox/mini-Cogito-R1", + "name": "mini-Cogito-R1", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2298, + "hfopenllm_v2/BBH": 0.328, + "hfopenllm_v2/MATH Level 5": 0.2749, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.3447, + "hfopenllm_v2/MMLU-PRO": 0.1482 + } + }, + { + "id": "Daemontatox/mini_Pathfinder", + "name": "mini_Pathfinder", + "developer": "Daemontatox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2962, + "hfopenllm_v2/BBH": 0.3956, + "hfopenllm_v2/MATH Level 5": 0.4751, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3781, + "hfopenllm_v2/MMLU-PRO": 0.2809 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Dampfinchen.json b/data/developers/Dampfinchen.json new file mode 100644 index 0000000000000000000000000000000000000000..2067a01e4f962631afa82d4370062d20be21ad66 --- /dev/null +++ b/data/developers/Dampfinchen.json @@ -0,0 +1,19 @@ +{ + "developer": "Dampfinchen", + "models": [ + { + "id": "Dampfinchen/Llama-3.1-8B-Ultra-Instruct", + "name": "Llama-3.1-8B-Ultra-Instruct", + "developer": "Dampfinchen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8081, + "hfopenllm_v2/BBH": 0.5258, + "hfopenllm_v2/MATH Level 5": 0.2205, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4003, + "hfopenllm_v2/MMLU-PRO": 0.3826 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Danielbrdz.json b/data/developers/Danielbrdz.json new file mode 100644 index 0000000000000000000000000000000000000000..4125accfc0491df06739e6b344e2093e59c20ead --- /dev/null +++ b/data/developers/Danielbrdz.json @@ -0,0 +1,103 @@ +{ + "developer": "Danielbrdz", + "models": [ + { + "id": "Danielbrdz/Barcenas-10b", + "name": "Barcenas-10b", + "developer": "Danielbrdz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6608, + "hfopenllm_v2/BBH": 0.6121, + "hfopenllm_v2/MATH Level 5": 0.2153, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.4135, + "hfopenllm_v2/MMLU-PRO": 0.4361 + } + }, + { + "id": "Danielbrdz/Barcenas-14b-Phi-3-medium-ORPO", + "name": "Barcenas-14b-Phi-3-medium-ORPO", + "developer": "Danielbrdz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4799, + "hfopenllm_v2/BBH": 0.6536, + "hfopenllm_v2/MATH Level 5": 0.2024, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.4808, + "hfopenllm_v2/MMLU-PRO": 0.4723 + } + }, + { + "id": "Danielbrdz/Barcenas-14b-phi-4", + "name": "Barcenas-14b-phi-4", + "developer": "Danielbrdz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0498, + "hfopenllm_v2/BBH": 0.6769, + "hfopenllm_v2/MATH Level 5": 0.2583, + "hfopenllm_v2/GPQA": 0.3834, + "hfopenllm_v2/MUSR": 0.5097, + "hfopenllm_v2/MMLU-PRO": 0.5175 + } + }, + { + "id": "Danielbrdz/Barcenas-14b-phi-4-v2", + "name": "Barcenas-14b-phi-4-v2", + "developer": "Danielbrdz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2775, + "hfopenllm_v2/BBH": 0.6573, + "hfopenllm_v2/MATH Level 5": 0.3218, + "hfopenllm_v2/GPQA": 0.3784, + "hfopenllm_v2/MUSR": 0.4399, + "hfopenllm_v2/MMLU-PRO": 0.5244 + } + }, + { + "id": "Danielbrdz/Barcenas-3b-GRPO", + "name": "Barcenas-3b-GRPO", + "developer": "Danielbrdz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5444, + "hfopenllm_v2/BBH": 0.4414, + "hfopenllm_v2/MATH Level 5": 0.1375, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3576, + "hfopenllm_v2/MMLU-PRO": 0.3037 + } + }, + { + "id": "Danielbrdz/Barcenas-Llama3-8b-ORPO", + "name": "Barcenas-Llama3-8b-ORPO", + "developer": "Danielbrdz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7372, + "hfopenllm_v2/BBH": 0.4987, + "hfopenllm_v2/MATH Level 5": 0.0657, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.419, + "hfopenllm_v2/MMLU-PRO": 0.383 + } + }, + { + "id": "Danielbrdz/Barcenas-R1-Qwen-1.5b", + "name": "Barcenas-R1-Qwen-1.5b", + "developer": "Danielbrdz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2428, + "hfopenllm_v2/BBH": 0.3587, + "hfopenllm_v2/MATH Level 5": 0.3497, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.3541, + "hfopenllm_v2/MMLU-PRO": 0.1909 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Dans-DiscountModels.json b/data/developers/Dans-DiscountModels.json new file mode 100644 index 0000000000000000000000000000000000000000..df0c6b64b19d5722abdd64c9a914cf90ad96133d --- /dev/null +++ b/data/developers/Dans-DiscountModels.json @@ -0,0 +1,131 @@ +{ + "developer": "Dans-DiscountModels", + "models": [ + { + "id": "Dans-DiscountModels/12b-mn-dans-reasoning-test-2", + "name": "12b-mn-dans-reasoning-test-2", + "developer": "Dans-DiscountModels", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3711, + "hfopenllm_v2/BBH": 0.4807, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3702, + "hfopenllm_v2/MMLU-PRO": 0.2507 + } + }, + { + "id": "Dans-DiscountModels/12b-mn-dans-reasoning-test-3", + "name": "12b-mn-dans-reasoning-test-3", + "developer": "Dans-DiscountModels", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5053, + "hfopenllm_v2/BBH": 0.4839, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.4168, + "hfopenllm_v2/MMLU-PRO": 0.2516 + } + }, + { + "id": "Dans-DiscountModels/Dans-Instruct-CoreCurriculum-12b-ChatML", + "name": "Dans-Instruct-CoreCurriculum-12b-ChatML", + "developer": "Dans-DiscountModels", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2111, + "hfopenllm_v2/BBH": 0.4792, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3606, + "hfopenllm_v2/MMLU-PRO": 0.2805 + } + }, + { + "id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML", + "name": "Dans-Instruct-Mix-8b-ChatML", + "developer": "Dans-DiscountModels", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0825, + "hfopenllm_v2/BBH": 0.4738, + "hfopenllm_v2/MATH Level 5": 0.0551, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3918, + "hfopenllm_v2/MMLU-PRO": 0.3288 + } + }, + { + "id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.0", + "name": "Dans-Instruct-Mix-8b-ChatML-V0.1.0", + "developer": "Dans-DiscountModels", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0668, + "hfopenllm_v2/BBH": 0.4775, + "hfopenllm_v2/MATH Level 5": 0.0672, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.3786, + "hfopenllm_v2/MMLU-PRO": 0.3284 + } + }, + { + "id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.1", + "name": "Dans-Instruct-Mix-8b-ChatML-V0.1.1", + "developer": "Dans-DiscountModels", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0911, + "hfopenllm_v2/BBH": 0.4749, + "hfopenllm_v2/MATH Level 5": 0.0597, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.3825, + "hfopenllm_v2/MMLU-PRO": 0.3279 + } + }, + { + "id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.2.0", + "name": "Dans-Instruct-Mix-8b-ChatML-V0.2.0", + "developer": "Dans-DiscountModels", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5064, + "hfopenllm_v2/BBH": 0.4624, + "hfopenllm_v2/MATH Level 5": 0.0733, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.3644, + "hfopenllm_v2/MMLU-PRO": 0.3 + } + }, + { + "id": "Dans-DiscountModels/Mistral-7b-v0.3-Test-E0.7", + "name": "Mistral-7b-v0.3-Test-E0.7", + "developer": "Dans-DiscountModels", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5124, + "hfopenllm_v2/BBH": 0.475, + "hfopenllm_v2/MATH Level 5": 0.034, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4005, + "hfopenllm_v2/MMLU-PRO": 0.2744 + } + }, + { + "id": "Dans-DiscountModels/mistral-7b-test-merged", + "name": "mistral-7b-test-merged", + "developer": "Dans-DiscountModels", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6678, + "hfopenllm_v2/BBH": 0.4898, + "hfopenllm_v2/MATH Level 5": 0.0446, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3754, + "hfopenllm_v2/MMLU-PRO": 0.2978 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Darkknight535.json b/data/developers/Darkknight535.json new file mode 100644 index 0000000000000000000000000000000000000000..27e2b821467b28cb196806825201f21e08fe0a34 --- /dev/null +++ b/data/developers/Darkknight535.json @@ -0,0 +1,19 @@ +{ + "developer": "Darkknight535", + "models": [ + { + "id": "Darkknight535/OpenCrystal-12B-L3", + "name": "OpenCrystal-12B-L3", + "developer": "Darkknight535", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4071, + "hfopenllm_v2/BBH": 0.5223, + "hfopenllm_v2/MATH Level 5": 0.0899, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.3657, + "hfopenllm_v2/MMLU-PRO": 0.364 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Databricks-Mosaic-Research.json b/data/developers/Databricks-Mosaic-Research.json new file mode 100644 index 0000000000000000000000000000000000000000..d54480ce8af664b382e1a483ecab0c45fb6bd496 --- /dev/null +++ b/data/developers/Databricks-Mosaic-Research.json @@ -0,0 +1,20 @@ +{ + "developer": "Databricks-Mosaic-Research", + "models": [ + { + "id": "Databricks-Mosaic-Research/PGRM", + "name": "Databricks-Mosaic-Research/PGRM", + "developer": "Databricks-Mosaic-Research", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8002, + "reward-bench/Factuality": 0.7937, + "reward-bench/Precise IF": 0.5062, + "reward-bench/Math": 0.7404, + "reward-bench/Safety": 0.9289, + "reward-bench/Focus": 0.9424, + "reward-bench/Ties": 0.8893 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/DavidAU.json b/data/developers/DavidAU.json new file mode 100644 index 0000000000000000000000000000000000000000..e585ccc4dc2449be23a95a6199dd6251b9919896 --- /dev/null +++ b/data/developers/DavidAU.json @@ -0,0 +1,355 @@ +{ + "developer": "DavidAU", + "models": [ + { + "id": "DavidAU/DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm", + "name": "DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3136, + "hfopenllm_v2/BBH": 0.4762, + "hfopenllm_v2/MATH Level 5": 0.1057, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.3928, + "hfopenllm_v2/MMLU-PRO": 0.3209 + } + }, + { + "id": "DavidAU/DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B", + "name": "DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3685, + "hfopenllm_v2/BBH": 0.4887, + "hfopenllm_v2/MATH Level 5": 0.0657, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.432, + "hfopenllm_v2/MMLU-PRO": 0.2976 + } + }, + { + "id": "DavidAU/DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B", + "name": "DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2507, + "hfopenllm_v2/BBH": 0.4488, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4164, + "hfopenllm_v2/MMLU-PRO": 0.2709 + } + }, + { + "id": "DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B", + "name": "DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3883, + "hfopenllm_v2/BBH": 0.4886, + "hfopenllm_v2/MATH Level 5": 0.0816, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4375, + "hfopenllm_v2/MMLU-PRO": 0.3024 + } + }, + { + "id": "DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B", + "name": "DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3436, + "hfopenllm_v2/BBH": 0.4769, + "hfopenllm_v2/MATH Level 5": 0.0755, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.4231, + "hfopenllm_v2/MMLU-PRO": 0.297 + } + }, + { + "id": "DavidAU/DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm", + "name": "DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3416, + "hfopenllm_v2/BBH": 0.5807, + "hfopenllm_v2/MATH Level 5": 0.5536, + "hfopenllm_v2/GPQA": 0.3859, + "hfopenllm_v2/MUSR": 0.5155, + "hfopenllm_v2/MMLU-PRO": 0.4624 + } + }, + { + "id": "DavidAU/DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B", + "name": "DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2853, + "hfopenllm_v2/BBH": 0.4462, + "hfopenllm_v2/MATH Level 5": 0.0174, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4179, + "hfopenllm_v2/MMLU-PRO": 0.2778 + } + }, + { + "id": "DavidAU/DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B", + "name": "DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3793, + "hfopenllm_v2/BBH": 0.4232, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.356, + "hfopenllm_v2/MMLU-PRO": 0.272 + } + }, + { + "id": "DavidAU/Gemma-The-Writer-9B", + "name": "Gemma-The-Writer-9B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.174, + "hfopenllm_v2/BBH": 0.5905, + "hfopenllm_v2/MATH Level 5": 0.0876, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4099, + "hfopenllm_v2/MMLU-PRO": 0.3979 + } + }, + { + "id": "DavidAU/Gemma-The-Writer-DEADLINE-10B", + "name": "Gemma-The-Writer-DEADLINE-10B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2332, + "hfopenllm_v2/BBH": 0.5896, + "hfopenllm_v2/MATH Level 5": 0.0989, + "hfopenllm_v2/GPQA": 0.3423, + "hfopenllm_v2/MUSR": 0.4189, + "hfopenllm_v2/MMLU-PRO": 0.3946 + } + }, + { + "id": "DavidAU/Gemma-The-Writer-J.GutenBerg-10B", + "name": "Gemma-The-Writer-J.GutenBerg-10B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2858, + "hfopenllm_v2/BBH": 0.5909, + "hfopenllm_v2/MATH Level 5": 0.0921, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4176, + "hfopenllm_v2/MMLU-PRO": 0.3947 + } + }, + { + "id": "DavidAU/Gemma-The-Writer-Mighty-Sword-9B", + "name": "Gemma-The-Writer-Mighty-Sword-9B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7528, + "hfopenllm_v2/BBH": 0.5912, + "hfopenllm_v2/MATH Level 5": 0.1911, + "hfopenllm_v2/GPQA": 0.3482, + "hfopenllm_v2/MUSR": 0.4112, + "hfopenllm_v2/MMLU-PRO": 0.3968 + } + }, + { + "id": "DavidAU/Gemma-The-Writer-N-Restless-Quill-10B-Uncensored", + "name": "Gemma-The-Writer-N-Restless-Quill-10B-Uncensored", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7071, + "hfopenllm_v2/BBH": 0.5922, + "hfopenllm_v2/MATH Level 5": 0.2296, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.4163, + "hfopenllm_v2/MMLU-PRO": 0.3966 + } + }, + { + "id": "DavidAU/L3-DARKEST-PLANET-16.5B", + "name": "L3-DARKEST-PLANET-16.5B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6231, + "hfopenllm_v2/BBH": 0.523, + "hfopenllm_v2/MATH Level 5": 0.0899, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.3754, + "hfopenllm_v2/MMLU-PRO": 0.363 + } + }, + { + "id": "DavidAU/L3-Dark-Planet-8B", + "name": "L3-Dark-Planet-8B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4134, + "hfopenllm_v2/BBH": 0.5084, + "hfopenllm_v2/MATH Level 5": 0.0823, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.3616, + "hfopenllm_v2/MMLU-PRO": 0.3737 + } + }, + { + "id": "DavidAU/L3-Jamet-12.2B-MK.V-Blackroot-Instruct", + "name": "L3-Jamet-12.2B-MK.V-Blackroot-Instruct", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3962, + "hfopenllm_v2/BBH": 0.4766, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.402, + "hfopenllm_v2/MMLU-PRO": 0.3291 + } + }, + { + "id": "DavidAU/L3-Lumimaid-12.2B-v0.1-OAS-Instruct", + "name": "L3-Lumimaid-12.2B-v0.1-OAS-Instruct", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3924, + "hfopenllm_v2/BBH": 0.4693, + "hfopenllm_v2/MATH Level 5": 0.0461, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.4194, + "hfopenllm_v2/MMLU-PRO": 0.3142 + } + }, + { + "id": "DavidAU/L3-SMB-Instruct-12.2B-F32", + "name": "L3-SMB-Instruct-12.2B-F32", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4303, + "hfopenllm_v2/BBH": 0.4786, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4087, + "hfopenllm_v2/MMLU-PRO": 0.3312 + } + }, + { + "id": "DavidAU/L3-Stheno-Maid-Blackroot-Grand-HORROR-16B", + "name": "L3-Stheno-Maid-Blackroot-Grand-HORROR-16B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3439, + "hfopenllm_v2/BBH": 0.4736, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.4031, + "hfopenllm_v2/MMLU-PRO": 0.357 + } + }, + { + "id": "DavidAU/L3-Stheno-v3.2-12.2B-Instruct", + "name": "L3-Stheno-v3.2-12.2B-Instruct", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4028, + "hfopenllm_v2/BBH": 0.4846, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.4103, + "hfopenllm_v2/MMLU-PRO": 0.3345 + } + }, + { + "id": "DavidAU/L3.1-Dark-Planet-SpinFire-Uncensored-8B", + "name": "L3.1-Dark-Planet-SpinFire-Uncensored-8B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7043, + "hfopenllm_v2/BBH": 0.5261, + "hfopenllm_v2/MATH Level 5": 0.0929, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3541, + "hfopenllm_v2/MMLU-PRO": 0.367 + } + }, + { + "id": "DavidAU/L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B", + "name": "L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3345, + "hfopenllm_v2/BBH": 0.4421, + "hfopenllm_v2/MATH Level 5": 0.2606, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.3749, + "hfopenllm_v2/MMLU-PRO": 0.2892 + } + }, + { + "id": "DavidAU/Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B", + "name": "Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1783, + "hfopenllm_v2/BBH": 0.3033, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3715, + "hfopenllm_v2/MMLU-PRO": 0.1142 + } + }, + { + "id": "DavidAU/Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B", + "name": "Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2835, + "hfopenllm_v2/BBH": 0.3592, + "hfopenllm_v2/MATH Level 5": 0.2417, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3847, + "hfopenllm_v2/MMLU-PRO": 0.1636 + } + }, + { + "id": "DavidAU/Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32", + "name": "Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32", + "developer": "DavidAU", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2107, + "hfopenllm_v2/BBH": 0.3286, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2475, + "hfopenllm_v2/MUSR": 0.3404, + "hfopenllm_v2/MMLU-PRO": 0.1122 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Davidsv.json b/data/developers/Davidsv.json new file mode 100644 index 0000000000000000000000000000000000000000..681040458292ee5182c8bc14f1b8cdb7d2059796 --- /dev/null +++ b/data/developers/Davidsv.json @@ -0,0 +1,19 @@ +{ + "developer": "Davidsv", + "models": [ + { + "id": "Davidsv/SUONG-1", + "name": "SUONG-1", + "developer": "Davidsv", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2497, + "hfopenllm_v2/BBH": 0.2817, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2441, + "hfopenllm_v2/MUSR": 0.3578, + "hfopenllm_v2/MMLU-PRO": 0.1085 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/DavieLion.json b/data/developers/DavieLion.json new file mode 100644 index 0000000000000000000000000000000000000000..ffdc7de10295de8981ccb0c2da137caa37979e5b --- /dev/null +++ b/data/developers/DavieLion.json @@ -0,0 +1,75 @@ +{ + "developer": "DavieLion", + "models": [ + { + "id": "DavieLion/Llama-3.2-1B-SPIN-iter0", + "name": "Llama-3.2-1B-SPIN-iter0", + "developer": "DavieLion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1507, + "hfopenllm_v2/BBH": 0.293, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3565, + "hfopenllm_v2/MMLU-PRO": 0.1125 + } + }, + { + "id": "DavieLion/Llama-3.2-1B-SPIN-iter1", + "name": "Llama-3.2-1B-SPIN-iter1", + "developer": "DavieLion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1575, + "hfopenllm_v2/BBH": 0.294, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3646, + "hfopenllm_v2/MMLU-PRO": 0.1118 + } + }, + { + "id": "DavieLion/Llama-3.2-1B-SPIN-iter2", + "name": "Llama-3.2-1B-SPIN-iter2", + "developer": "DavieLion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1376, + "hfopenllm_v2/BBH": 0.298, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3553, + "hfopenllm_v2/MMLU-PRO": 0.1129 + } + }, + { + "id": "DavieLion/Llama-3.2-1B-SPIN-iter3", + "name": "Llama-3.2-1B-SPIN-iter3", + "developer": "DavieLion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1336, + "hfopenllm_v2/BBH": 0.2975, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.35, + "hfopenllm_v2/MMLU-PRO": 0.1128 + } + }, + { + "id": "DavieLion/Lllma-3.2-1B", + "name": "Lllma-3.2-1B", + "developer": "DavieLion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1601, + "hfopenllm_v2/BBH": 0.2965, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2441, + "hfopenllm_v2/MUSR": 0.3578, + "hfopenllm_v2/MMLU-PRO": 0.1126 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/DebateLabKIT.json b/data/developers/DebateLabKIT.json new file mode 100644 index 0000000000000000000000000000000000000000..43228abf3989aca152b661c107d0571e3bdf50d9 --- /dev/null +++ b/data/developers/DebateLabKIT.json @@ -0,0 +1,19 @@ +{ + "developer": "DebateLabKIT", + "models": [ + { + "id": "DebateLabKIT/Llama-3.1-Argunaut-1-8B-SFT", + "name": "Llama-3.1-Argunaut-1-8B-SFT", + "developer": "DebateLabKIT", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5519, + "hfopenllm_v2/BBH": 0.4824, + "hfopenllm_v2/MATH Level 5": 0.145, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4503, + "hfopenllm_v2/MMLU-PRO": 0.3472 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Deci.json b/data/developers/Deci.json new file mode 100644 index 0000000000000000000000000000000000000000..83da7bdeea3c354a4ff914e45297da9b4b117edb --- /dev/null +++ b/data/developers/Deci.json @@ -0,0 +1,33 @@ +{ + "developer": "Deci", + "models": [ + { + "id": "Deci/DeciLM-7B", + "name": "DeciLM-7B", + "developer": "Deci", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2813, + "hfopenllm_v2/BBH": 0.4423, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4359, + "hfopenllm_v2/MMLU-PRO": 0.2692 + } + }, + { + "id": "Deci/DeciLM-7B-instruct", + "name": "DeciLM-7B-instruct", + "developer": "Deci", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.488, + "hfopenllm_v2/BBH": 0.459, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3884, + "hfopenllm_v2/MMLU-PRO": 0.2608 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/DeepAutoAI.json b/data/developers/DeepAutoAI.json new file mode 100644 index 0000000000000000000000000000000000000000..50d4250c62e870b082c804e9b26b28096d4c4fe3 --- /dev/null +++ b/data/developers/DeepAutoAI.json @@ -0,0 +1,173 @@ +{ + "developer": "DeepAutoAI", + "models": [ + { + "id": "DeepAutoAI/Explore_Llama-3.1-8B-Inst", + "name": "Explore_Llama-3.1-8B-Inst", + "developer": "DeepAutoAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7795, + "hfopenllm_v2/BBH": 0.5117, + "hfopenllm_v2/MATH Level 5": 0.2009, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.391, + "hfopenllm_v2/MMLU-PRO": 0.3792 + } + }, + { + "id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst", + "name": "Explore_Llama-3.2-1B-Inst", + "developer": "DeepAutoAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5649, + "hfopenllm_v2/BBH": 0.3505, + "hfopenllm_v2/MATH Level 5": 0.0748, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3183, + "hfopenllm_v2/MMLU-PRO": 0.1809 + } + }, + { + "id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst_v0", + "name": "Explore_Llama-3.2-1B-Inst_v0", + "developer": "DeepAutoAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5597, + "hfopenllm_v2/BBH": 0.3365, + "hfopenllm_v2/MATH Level 5": 0.0597, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3103, + "hfopenllm_v2/MMLU-PRO": 0.1804 + } + }, + { + "id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1", + "name": "Explore_Llama-3.2-1B-Inst_v1", + "developer": "DeepAutoAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4999, + "hfopenllm_v2/BBH": 0.3141, + "hfopenllm_v2/MATH Level 5": 0.031, + "hfopenllm_v2/GPQA": 0.245, + "hfopenllm_v2/MUSR": 0.3781, + "hfopenllm_v2/MMLU-PRO": 0.1269 + } + }, + { + "id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1.1", + "name": "Explore_Llama-3.2-1B-Inst_v1.1", + "developer": "DeepAutoAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5844, + "hfopenllm_v2/BBH": 0.3513, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3117, + "hfopenllm_v2/MMLU-PRO": 0.1818 + } + }, + { + "id": "DeepAutoAI/causal_gpt2", + "name": "causal_gpt2", + "developer": "DeepAutoAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1813, + "hfopenllm_v2/BBH": 0.3026, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.427, + "hfopenllm_v2/MMLU-PRO": 0.1131 + } + }, + { + "id": "DeepAutoAI/d2nwg_Llama-3.1-8B-Instruct-v0.0", + "name": "d2nwg_Llama-3.1-8B-Instruct-v0.0", + "developer": "DeepAutoAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7893, + "hfopenllm_v2/BBH": 0.508, + "hfopenllm_v2/MATH Level 5": 0.1805, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4135, + "hfopenllm_v2/MMLU-PRO": 0.3877 + } + }, + { + "id": "DeepAutoAI/d2nwg_causal_gpt2", + "name": "d2nwg_causal_gpt2", + "developer": "DeepAutoAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1916, + "hfopenllm_v2/BBH": 0.3027, + "hfopenllm_v2/MATH Level 5": 0.0045, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.4297, + "hfopenllm_v2/MMLU-PRO": 0.1151 + } + }, + { + "id": "DeepAutoAI/d2nwg_causal_gpt2_v1", + "name": "d2nwg_causal_gpt2_v1", + "developer": "DeepAutoAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1989, + "hfopenllm_v2/BBH": 0.2992, + "hfopenllm_v2/MATH Level 5": 0.0038, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.4337, + "hfopenllm_v2/MMLU-PRO": 0.1135 + } + }, + { + "id": "DeepAutoAI/ldm_soup_Llama-3.1-8B-Inst", + "name": "ldm_soup_Llama-3.1-8B-Inst", + "developer": "DeepAutoAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8033, + "hfopenllm_v2/BBH": 0.5121, + "hfopenllm_v2/MATH Level 5": 0.1888, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4161, + "hfopenllm_v2/MMLU-PRO": 0.3886 + } + }, + { + "id": "DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.0", + "name": "ldm_soup_Llama-3.1-8B-Instruct-v0.0", + "developer": "DeepAutoAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7889, + "hfopenllm_v2/BBH": 0.5125, + "hfopenllm_v2/MATH Level 5": 0.1918, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4121, + "hfopenllm_v2/MMLU-PRO": 0.3895 + } + }, + { + "id": "DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.1", + "name": "ldm_soup_Llama-3.1-8B-Instruct-v0.1", + "developer": "DeepAutoAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7889, + "hfopenllm_v2/BBH": 0.5125, + "hfopenllm_v2/MATH Level 5": 0.1918, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4121, + "hfopenllm_v2/MMLU-PRO": 0.3895 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/DeepMount00.json b/data/developers/DeepMount00.json new file mode 100644 index 0000000000000000000000000000000000000000..fd4496dea5fdde5c57b6a979dcde3b1ed2338a2e --- /dev/null +++ b/data/developers/DeepMount00.json @@ -0,0 +1,201 @@ +{ + "developer": "DeepMount00", + "models": [ + { + "id": "DeepMount00/Lexora-Lite-3B", + "name": "Lexora-Lite-3B", + "developer": "DeepMount00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5776, + "hfopenllm_v2/BBH": 0.4873, + "hfopenllm_v2/MATH Level 5": 0.2304, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3966, + "hfopenllm_v2/MMLU-PRO": 0.3602 + } + }, + { + "id": "DeepMount00/Lexora-Lite-3B_v2", + "name": "Lexora-Lite-3B_v2", + "developer": "DeepMount00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4943, + "hfopenllm_v2/BBH": 0.4812, + "hfopenllm_v2/MATH Level 5": 0.2281, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3822, + "hfopenllm_v2/MMLU-PRO": 0.3544 + } + }, + { + "id": "DeepMount00/Lexora-Medium-7B", + "name": "Lexora-Medium-7B", + "developer": "DeepMount00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4103, + "hfopenllm_v2/BBH": 0.5145, + "hfopenllm_v2/MATH Level 5": 0.2221, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4439, + "hfopenllm_v2/MMLU-PRO": 0.4325 + } + }, + { + "id": "DeepMount00/Llama-3-8b-Ita", + "name": "Llama-3-8b-Ita", + "developer": "DeepMount00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.753, + "hfopenllm_v2/BBH": 0.4936, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4268, + "hfopenllm_v2/MMLU-PRO": 0.3852 + } + }, + { + "id": "DeepMount00/Llama-3.1-8b-ITA", + "name": "Llama-3.1-8b-ITA", + "developer": "DeepMount00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7917, + "hfopenllm_v2/BBH": 0.5109, + "hfopenllm_v2/MATH Level 5": 0.1088, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.4136, + "hfopenllm_v2/MMLU-PRO": 0.3876 + } + }, + { + "id": "DeepMount00/Llama-3.1-8b-Ita", + "name": "Llama-3.1-8b-Ita", + "developer": "DeepMount00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5365, + "hfopenllm_v2/BBH": 0.517, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4487, + "hfopenllm_v2/MMLU-PRO": 0.396 + } + }, + { + "id": "DeepMount00/Llama-3.1-Distilled", + "name": "Llama-3.1-Distilled", + "developer": "DeepMount00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7844, + "hfopenllm_v2/BBH": 0.5101, + "hfopenllm_v2/MATH Level 5": 0.2032, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4058, + "hfopenllm_v2/MMLU-PRO": 0.3782 + } + }, + { + "id": "DeepMount00/Qwen2-1.5B-Ita", + "name": "Qwen2-1.5B-Ita", + "developer": "DeepMount00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5173, + "hfopenllm_v2/BBH": 0.3981, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3504, + "hfopenllm_v2/MMLU-PRO": 0.2772 + } + }, + { + "id": "DeepMount00/Qwen2-1.5B-Ita_v2", + "name": "Qwen2-1.5B-Ita_v2", + "developer": "DeepMount00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5, + "hfopenllm_v2/BBH": 0.3954, + "hfopenllm_v2/MATH Level 5": 0.0967, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3702, + "hfopenllm_v2/MMLU-PRO": 0.3032 + } + }, + { + "id": "DeepMount00/Qwen2-1.5B-Ita_v3", + "name": "Qwen2-1.5B-Ita_v3", + "developer": "DeepMount00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.489, + "hfopenllm_v2/BBH": 0.3948, + "hfopenllm_v2/MATH Level 5": 0.1042, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3742, + "hfopenllm_v2/MMLU-PRO": 0.3018 + } + }, + { + "id": "DeepMount00/Qwen2-1.5B-Ita_v5", + "name": "Qwen2-1.5B-Ita_v5", + "developer": "DeepMount00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4987, + "hfopenllm_v2/BBH": 0.4032, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3422, + "hfopenllm_v2/MMLU-PRO": 0.2943 + } + }, + { + "id": "DeepMount00/Qwen2-1.5B-Ita_v6", + "name": "Qwen2-1.5B-Ita_v6", + "developer": "DeepMount00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2999, + "hfopenllm_v2/BBH": 0.4249, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3755, + "hfopenllm_v2/MMLU-PRO": 0.2872 + } + }, + { + "id": "DeepMount00/Qwen2.5-7B-Instruct-MathCoder", + "name": "Qwen2.5-7B-Instruct-MathCoder", + "developer": "DeepMount00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.153, + "hfopenllm_v2/BBH": 0.2998, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3806, + "hfopenllm_v2/MMLU-PRO": 0.1118 + } + }, + { + "id": "DeepMount00/mergekit-ties-okvgjfz", + "name": "mergekit-ties-okvgjfz", + "developer": "DeepMount00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.153, + "hfopenllm_v2/BBH": 0.2998, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3806, + "hfopenllm_v2/MMLU-PRO": 0.1118 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/DeepSeek.json b/data/developers/DeepSeek.json new file mode 100644 index 0000000000000000000000000000000000000000..b7172f25cd90b14cf61518ba803a99d37ec6b744 --- /dev/null +++ b/data/developers/DeepSeek.json @@ -0,0 +1,58 @@ +{ + "developer": "DeepSeek", + "models": [ + { + "id": "deepseek/chat-v3-0324", + "name": "deepseek/chat-v3-0324", + "developer": "DeepSeek", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.0, + "livecodebenchpro/Easy Problems": 0.19718309859154928 + } + }, + { + "id": "deepseek/deepseek-v3.2", + "name": "DeepSeek-V3.2", + "developer": "DeepSeek", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 39.6 + } + }, + { + "id": "deepseek/ep-20250214004308-p7n89", + "name": "ep-20250214004308-p7n89", + "developer": "DeepSeek", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.014084507042253521, + "livecodebenchpro/Easy Problems": 0.4225352112676056 + } + }, + { + "id": "deepseek/ep-20250228232227-z44x5", + "name": "ep-20250228232227-z44x5", + "developer": "DeepSeek", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.0, + "livecodebenchpro/Easy Problems": 0.1267605633802817 + } + }, + { + "id": "deepseek/ep-20250603132404-cgpjm", + "name": "ep-20250603132404-cgpjm", + "developer": "DeepSeek", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.08450704225352113, + "livecodebenchpro/Easy Problems": 0.5774647887323944 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Delta-Vector.json b/data/developers/Delta-Vector.json new file mode 100644 index 0000000000000000000000000000000000000000..f980692017f518fbbeec5b206efc8074ace9213c --- /dev/null +++ b/data/developers/Delta-Vector.json @@ -0,0 +1,103 @@ +{ + "developer": "Delta-Vector", + "models": [ + { + "id": "Delta-Vector/Baldur-8B", + "name": "Baldur-8B", + "developer": "Delta-Vector", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4782, + "hfopenllm_v2/BBH": 0.5306, + "hfopenllm_v2/MATH Level 5": 0.1435, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4372, + "hfopenllm_v2/MMLU-PRO": 0.3654 + } + }, + { + "id": "Delta-Vector/Control-8B", + "name": "Control-8B", + "developer": "Delta-Vector", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.549, + "hfopenllm_v2/BBH": 0.5041, + "hfopenllm_v2/MATH Level 5": 0.139, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4355, + "hfopenllm_v2/MMLU-PRO": 0.3732 + } + }, + { + "id": "Delta-Vector/Control-8B-V1.1", + "name": "Control-8B-V1.1", + "developer": "Delta-Vector", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5697, + "hfopenllm_v2/BBH": 0.4993, + "hfopenllm_v2/MATH Level 5": 0.1276, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4237, + "hfopenllm_v2/MMLU-PRO": 0.3745 + } + }, + { + "id": "Delta-Vector/Darkens-8B", + "name": "Darkens-8B", + "developer": "Delta-Vector", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2548, + "hfopenllm_v2/BBH": 0.5251, + "hfopenllm_v2/MATH Level 5": 0.0589, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4106, + "hfopenllm_v2/MMLU-PRO": 0.3736 + } + }, + { + "id": "Delta-Vector/Henbane-7b-attempt2", + "name": "Henbane-7b-attempt2", + "developer": "Delta-Vector", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4157, + "hfopenllm_v2/BBH": 0.5061, + "hfopenllm_v2/MATH Level 5": 0.2273, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3973, + "hfopenllm_v2/MMLU-PRO": 0.4028 + } + }, + { + "id": "Delta-Vector/Odin-9B", + "name": "Odin-9B", + "developer": "Delta-Vector", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3692, + "hfopenllm_v2/BBH": 0.544, + "hfopenllm_v2/MATH Level 5": 0.145, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.4648, + "hfopenllm_v2/MMLU-PRO": 0.4047 + } + }, + { + "id": "Delta-Vector/Tor-8B", + "name": "Tor-8B", + "developer": "Delta-Vector", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2382, + "hfopenllm_v2/BBH": 0.5209, + "hfopenllm_v2/MATH Level 5": 0.0589, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4092, + "hfopenllm_v2/MMLU-PRO": 0.373 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/DevQuasar.json b/data/developers/DevQuasar.json new file mode 100644 index 0000000000000000000000000000000000000000..d207cd7873b393f5c3f18840cd65f8a70406f7dc --- /dev/null +++ b/data/developers/DevQuasar.json @@ -0,0 +1,19 @@ +{ + "developer": "DevQuasar", + "models": [ + { + "id": "DevQuasar/DevQuasar-R1-Uncensored-Llama-8B", + "name": "DevQuasar-R1-Uncensored-Llama-8B", + "developer": "DevQuasar", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3849, + "hfopenllm_v2/BBH": 0.5118, + "hfopenllm_v2/MATH Level 5": 0.3308, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4436, + "hfopenllm_v2/MMLU-PRO": 0.3615 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Dongwei.json b/data/developers/Dongwei.json new file mode 100644 index 0000000000000000000000000000000000000000..c29c9cae20cb8626ffd076e468a3c74680f49c2d --- /dev/null +++ b/data/developers/Dongwei.json @@ -0,0 +1,19 @@ +{ + "developer": "Dongwei", + "models": [ + { + "id": "Dongwei/DeepSeek-R1-Distill-Qwen-7B-GRPO", + "name": "DeepSeek-R1-Distill-Qwen-7B-GRPO", + "developer": "Dongwei", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4038, + "hfopenllm_v2/BBH": 0.3443, + "hfopenllm_v2/MATH Level 5": 0.1956, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3663, + "hfopenllm_v2/MMLU-PRO": 0.2322 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/DoppelReflEx.json b/data/developers/DoppelReflEx.json new file mode 100644 index 0000000000000000000000000000000000000000..0fd168d9d9c3692abf18af3ecd64df1b25a53288 --- /dev/null +++ b/data/developers/DoppelReflEx.json @@ -0,0 +1,411 @@ +{ + "developer": "DoppelReflEx", + "models": [ + { + "id": "DoppelReflEx/L3-8B-R1-WolfCore", + "name": "L3-8B-R1-WolfCore", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3775, + "hfopenllm_v2/BBH": 0.5318, + "hfopenllm_v2/MATH Level 5": 0.1631, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4277, + "hfopenllm_v2/MMLU-PRO": 0.3717 + } + }, + { + "id": "DoppelReflEx/L3-8B-R1-WolfCore-V1.5-test", + "name": "L3-8B-R1-WolfCore-V1.5-test", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3955, + "hfopenllm_v2/BBH": 0.5315, + "hfopenllm_v2/MATH Level 5": 0.1231, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.3841, + "hfopenllm_v2/MMLU-PRO": 0.3728 + } + }, + { + "id": "DoppelReflEx/L3-8B-WolfCore", + "name": "L3-8B-WolfCore", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4022, + "hfopenllm_v2/BBH": 0.5182, + "hfopenllm_v2/MATH Level 5": 0.0982, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.3973, + "hfopenllm_v2/MMLU-PRO": 0.3705 + } + }, + { + "id": "DoppelReflEx/MN-12B-FoxFrame-test", + "name": "MN-12B-FoxFrame-test", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4222, + "hfopenllm_v2/BBH": 0.5456, + "hfopenllm_v2/MATH Level 5": 0.1397, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4254, + "hfopenllm_v2/MMLU-PRO": 0.3503 + } + }, + { + "id": "DoppelReflEx/MN-12B-FoxFrame2-test", + "name": "MN-12B-FoxFrame2-test", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4319, + "hfopenllm_v2/BBH": 0.5485, + "hfopenllm_v2/MATH Level 5": 0.1405, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4252, + "hfopenllm_v2/MMLU-PRO": 0.3569 + } + }, + { + "id": "DoppelReflEx/MN-12B-FoxFrame3-test", + "name": "MN-12B-FoxFrame3-test", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4323, + "hfopenllm_v2/BBH": 0.5395, + "hfopenllm_v2/MATH Level 5": 0.1322, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4598, + "hfopenllm_v2/MMLU-PRO": 0.3529 + } + }, + { + "id": "DoppelReflEx/MN-12B-Kakigori", + "name": "MN-12B-Kakigori", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3593, + "hfopenllm_v2/BBH": 0.5416, + "hfopenllm_v2/MATH Level 5": 0.1193, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4052, + "hfopenllm_v2/MMLU-PRO": 0.3581 + } + }, + { + "id": "DoppelReflEx/MN-12B-LilithFrame", + "name": "MN-12B-LilithFrame", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.451, + "hfopenllm_v2/BBH": 0.4944, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.3896, + "hfopenllm_v2/MMLU-PRO": 0.3256 + } + }, + { + "id": "DoppelReflEx/MN-12B-LilithFrame-Experiment-2", + "name": "MN-12B-LilithFrame-Experiment-2", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4299, + "hfopenllm_v2/BBH": 0.4983, + "hfopenllm_v2/MATH Level 5": 0.1073, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.3804, + "hfopenllm_v2/MMLU-PRO": 0.3276 + } + }, + { + "id": "DoppelReflEx/MN-12B-LilithFrame-Experiment-3", + "name": "MN-12B-LilithFrame-Experiment-3", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4128, + "hfopenllm_v2/BBH": 0.5468, + "hfopenllm_v2/MATH Level 5": 0.1344, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4039, + "hfopenllm_v2/MMLU-PRO": 0.3604 + } + }, + { + "id": "DoppelReflEx/MN-12B-LilithFrame-Experiment-4", + "name": "MN-12B-LilithFrame-Experiment-4", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3981, + "hfopenllm_v2/BBH": 0.5534, + "hfopenllm_v2/MATH Level 5": 0.1224, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4371, + "hfopenllm_v2/MMLU-PRO": 0.3649 + } + }, + { + "id": "DoppelReflEx/MN-12B-Mimicore-GreenSnake", + "name": "MN-12B-Mimicore-GreenSnake", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.478, + "hfopenllm_v2/BBH": 0.5481, + "hfopenllm_v2/MATH Level 5": 0.139, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4306, + "hfopenllm_v2/MMLU-PRO": 0.3651 + } + }, + { + "id": "DoppelReflEx/MN-12B-Mimicore-Nocturne", + "name": "MN-12B-Mimicore-Nocturne", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3957, + "hfopenllm_v2/BBH": 0.5703, + "hfopenllm_v2/MATH Level 5": 0.1057, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4569, + "hfopenllm_v2/MMLU-PRO": 0.3634 + } + }, + { + "id": "DoppelReflEx/MN-12B-Mimicore-Orochi", + "name": "MN-12B-Mimicore-Orochi", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.462, + "hfopenllm_v2/BBH": 0.5498, + "hfopenllm_v2/MATH Level 5": 0.136, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4546, + "hfopenllm_v2/MMLU-PRO": 0.3447 + } + }, + { + "id": "DoppelReflEx/MN-12B-Mimicore-Orochi-v2-Experiment", + "name": "MN-12B-Mimicore-Orochi-v2-Experiment", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2842, + "hfopenllm_v2/BBH": 0.5323, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4574, + "hfopenllm_v2/MMLU-PRO": 0.3423 + } + }, + { + "id": "DoppelReflEx/MN-12B-Mimicore-Orochi-v3-Experiment", + "name": "MN-12B-Mimicore-Orochi-v3-Experiment", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4102, + "hfopenllm_v2/BBH": 0.5438, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4438, + "hfopenllm_v2/MMLU-PRO": 0.3396 + } + }, + { + "id": "DoppelReflEx/MN-12B-Mimicore-Orochi-v4-Experiment", + "name": "MN-12B-Mimicore-Orochi-v4-Experiment", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4321, + "hfopenllm_v2/BBH": 0.5463, + "hfopenllm_v2/MATH Level 5": 0.1208, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4449, + "hfopenllm_v2/MMLU-PRO": 0.352 + } + }, + { + "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake", + "name": "MN-12B-Mimicore-WhiteSnake", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4438, + "hfopenllm_v2/BBH": 0.5605, + "hfopenllm_v2/MATH Level 5": 0.1314, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.4569, + "hfopenllm_v2/MMLU-PRO": 0.3658 + } + }, + { + "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-1", + "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-1", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3909, + "hfopenllm_v2/BBH": 0.4866, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.379, + "hfopenllm_v2/MMLU-PRO": 0.3114 + } + }, + { + "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-2", + "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-2", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3124, + "hfopenllm_v2/BBH": 0.5126, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.3975, + "hfopenllm_v2/MMLU-PRO": 0.3314 + } + }, + { + "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-3", + "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-3", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4302, + "hfopenllm_v2/BBH": 0.4812, + "hfopenllm_v2/MATH Level 5": 0.0899, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.3684, + "hfopenllm_v2/MMLU-PRO": 0.3198 + } + }, + { + "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-4", + "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-4", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4241, + "hfopenllm_v2/BBH": 0.5185, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4002, + "hfopenllm_v2/MMLU-PRO": 0.3342 + } + }, + { + "id": "DoppelReflEx/MN-12B-Unleashed-Twilight", + "name": "MN-12B-Unleashed-Twilight", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3505, + "hfopenllm_v2/BBH": 0.5521, + "hfopenllm_v2/MATH Level 5": 0.0959, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4384, + "hfopenllm_v2/MMLU-PRO": 0.3678 + } + }, + { + "id": "DoppelReflEx/MN-12B-WolFrame", + "name": "MN-12B-WolFrame", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4397, + "hfopenllm_v2/BBH": 0.5117, + "hfopenllm_v2/MATH Level 5": 0.1314, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4015, + "hfopenllm_v2/MMLU-PRO": 0.3393 + } + }, + { + "id": "DoppelReflEx/MiniusLight-24B", + "name": "MiniusLight-24B", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2577, + "hfopenllm_v2/BBH": 0.6256, + "hfopenllm_v2/MATH Level 5": 0.1261, + "hfopenllm_v2/GPQA": 0.3582, + "hfopenllm_v2/MUSR": 0.4319, + "hfopenllm_v2/MMLU-PRO": 0.5091 + } + }, + { + "id": "DoppelReflEx/MiniusLight-24B-test", + "name": "MiniusLight-24B-test", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0394, + "hfopenllm_v2/BBH": 0.6334, + "hfopenllm_v2/MATH Level 5": 0.0257, + "hfopenllm_v2/GPQA": 0.3683, + "hfopenllm_v2/MUSR": 0.4093, + "hfopenllm_v2/MMLU-PRO": 0.5182 + } + }, + { + "id": "DoppelReflEx/MiniusLight-24B-v1b-test", + "name": "MiniusLight-24B-v1b-test", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3791, + "hfopenllm_v2/BBH": 0.6617, + "hfopenllm_v2/MATH Level 5": 0.2394, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.4557, + "hfopenllm_v2/MMLU-PRO": 0.5365 + } + }, + { + "id": "DoppelReflEx/MiniusLight-24B-v1c-test", + "name": "MiniusLight-24B-v1c-test", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3786, + "hfopenllm_v2/BBH": 0.6753, + "hfopenllm_v2/MATH Level 5": 0.2968, + "hfopenllm_v2/GPQA": 0.3951, + "hfopenllm_v2/MUSR": 0.4634, + "hfopenllm_v2/MMLU-PRO": 0.5487 + } + }, + { + "id": "DoppelReflEx/MiniusLight-24B-v1d-test", + "name": "MiniusLight-24B-v1d-test", + "developer": "DoppelReflEx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4032, + "hfopenllm_v2/BBH": 0.6712, + "hfopenllm_v2/MATH Level 5": 0.2946, + "hfopenllm_v2/GPQA": 0.3951, + "hfopenllm_v2/MUSR": 0.4621, + "hfopenllm_v2/MMLU-PRO": 0.5489 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/DreadPoor.json b/data/developers/DreadPoor.json new file mode 100644 index 0000000000000000000000000000000000000000..a04f08d3f4dfcf0d0a7d1b7795cd6336096c1a5e --- /dev/null +++ b/data/developers/DreadPoor.json @@ -0,0 +1,1671 @@ +{ + "developer": "DreadPoor", + "models": [ + { + "id": "DreadPoor/Again-8B-Model_Stock", + "name": "Again-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6724, + "hfopenllm_v2/BBH": 0.531, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.3987, + "hfopenllm_v2/MMLU-PRO": 0.3518 + } + }, + { + "id": "DreadPoor/Alita99-8B-LINEAR", + "name": "Alita99-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.719, + "hfopenllm_v2/BBH": 0.5442, + "hfopenllm_v2/MATH Level 5": 0.1647, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4266, + "hfopenllm_v2/MMLU-PRO": 0.3809 + } + }, + { + "id": "DreadPoor/AnotherTest", + "name": "AnotherTest", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4701, + "hfopenllm_v2/BBH": 0.4683, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4213, + "hfopenllm_v2/MMLU-PRO": 0.2875 + } + }, + { + "id": "DreadPoor/Aspire-8B-model_stock", + "name": "Aspire-8B-model_stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7141, + "hfopenllm_v2/BBH": 0.5278, + "hfopenllm_v2/MATH Level 5": 0.1495, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4212, + "hfopenllm_v2/MMLU-PRO": 0.3763 + } + }, + { + "id": "DreadPoor/Aspire_1.3-8B_model-stock", + "name": "Aspire_1.3-8B_model-stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7062, + "hfopenllm_v2/BBH": 0.5302, + "hfopenllm_v2/MATH Level 5": 0.1692, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4105, + "hfopenllm_v2/MMLU-PRO": 0.3716 + } + }, + { + "id": "DreadPoor/Aspire_V2-8B-Model_Stock", + "name": "Aspire_V2-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7371, + "hfopenllm_v2/BBH": 0.533, + "hfopenllm_v2/MATH Level 5": 0.176, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.3894, + "hfopenllm_v2/MMLU-PRO": 0.3697 + } + }, + { + "id": "DreadPoor/Aspire_V2.1-8B-Model_Stock", + "name": "Aspire_V2.1-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7238, + "hfopenllm_v2/BBH": 0.5236, + "hfopenllm_v2/MATH Level 5": 0.1767, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4136, + "hfopenllm_v2/MMLU-PRO": 0.3801 + } + }, + { + "id": "DreadPoor/Aspire_V2_ALT-8B-Model_Stock", + "name": "Aspire_V2_ALT-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7381, + "hfopenllm_v2/BBH": 0.5266, + "hfopenllm_v2/MATH Level 5": 0.173, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.3975, + "hfopenllm_v2/MMLU-PRO": 0.3727 + } + }, + { + "id": "DreadPoor/Aspire_V2_ALT_ROW-8B-Model_Stock", + "name": "Aspire_V2_ALT_ROW-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7381, + "hfopenllm_v2/BBH": 0.5266, + "hfopenllm_v2/MATH Level 5": 0.173, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.3975, + "hfopenllm_v2/MMLU-PRO": 0.3727 + } + }, + { + "id": "DreadPoor/Aspire_V3-8B-Model_Stock", + "name": "Aspire_V3-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5119, + "hfopenllm_v2/BBH": 0.5268, + "hfopenllm_v2/MATH Level 5": 0.1858, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4015, + "hfopenllm_v2/MMLU-PRO": 0.3642 + } + }, + { + "id": "DreadPoor/Aspire_V4-8B-Model_Stock", + "name": "Aspire_V4-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7694, + "hfopenllm_v2/BBH": 0.5314, + "hfopenllm_v2/MATH Level 5": 0.1926, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.3867, + "hfopenllm_v2/MMLU-PRO": 0.3708 + } + }, + { + "id": "DreadPoor/Aspire_V4_ALT-8B-Model_Stock", + "name": "Aspire_V4_ALT-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7366, + "hfopenllm_v2/BBH": 0.5268, + "hfopenllm_v2/MATH Level 5": 0.1813, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.392, + "hfopenllm_v2/MMLU-PRO": 0.3682 + } + }, + { + "id": "DreadPoor/Asymmetric_Linearity-8B-Model_Stock", + "name": "Asymmetric_Linearity-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7174, + "hfopenllm_v2/BBH": 0.5465, + "hfopenllm_v2/MATH Level 5": 0.1647, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4199, + "hfopenllm_v2/MMLU-PRO": 0.3844 + } + }, + { + "id": "DreadPoor/Aurora_faustus-8B-LINEAR", + "name": "Aurora_faustus-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7281, + "hfopenllm_v2/BBH": 0.5516, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4146, + "hfopenllm_v2/MMLU-PRO": 0.3842 + } + }, + { + "id": "DreadPoor/Aurora_faustus-8B-LORABLATED", + "name": "Aurora_faustus-8B-LORABLATED", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7527, + "hfopenllm_v2/BBH": 0.5392, + "hfopenllm_v2/MATH Level 5": 0.1488, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4239, + "hfopenllm_v2/MMLU-PRO": 0.3673 + } + }, + { + "id": "DreadPoor/Aurora_faustus-8B-LORABLATED_ALT", + "name": "Aurora_faustus-8B-LORABLATED_ALT", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7378, + "hfopenllm_v2/BBH": 0.5388, + "hfopenllm_v2/MATH Level 5": 0.1586, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4225, + "hfopenllm_v2/MMLU-PRO": 0.3694 + } + }, + { + "id": "DreadPoor/Autumn_Dawn-8B-LINEAR", + "name": "Autumn_Dawn-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7293, + "hfopenllm_v2/BBH": 0.5459, + "hfopenllm_v2/MATH Level 5": 0.1858, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.3968 + } + }, + { + "id": "DreadPoor/BaeZel-8B-LINEAR", + "name": "BaeZel-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7378, + "hfopenllm_v2/BBH": 0.5464, + "hfopenllm_v2/MATH Level 5": 0.1813, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4227, + "hfopenllm_v2/MMLU-PRO": 0.3861 + } + }, + { + "id": "DreadPoor/BaeZel-8B-Model_Stock", + "name": "BaeZel-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7713, + "hfopenllm_v2/BBH": 0.5408, + "hfopenllm_v2/MATH Level 5": 0.1639, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4199, + "hfopenllm_v2/MMLU-PRO": 0.388 + } + }, + { + "id": "DreadPoor/BaeZel_V2-8B-Model_Stock", + "name": "BaeZel_V2-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7677, + "hfopenllm_v2/BBH": 0.5374, + "hfopenllm_v2/MATH Level 5": 0.1798, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.3947 + } + }, + { + "id": "DreadPoor/BaeZel_V2_ALT-8B-Model_Stock", + "name": "BaeZel_V2_ALT-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7677, + "hfopenllm_v2/BBH": 0.5374, + "hfopenllm_v2/MATH Level 5": 0.1798, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.3947 + } + }, + { + "id": "DreadPoor/BaeZel_V3-8B-Model_Stock", + "name": "BaeZel_V3-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7832, + "hfopenllm_v2/BBH": 0.5392, + "hfopenllm_v2/MATH Level 5": 0.1896, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4174, + "hfopenllm_v2/MMLU-PRO": 0.3888 + } + }, + { + "id": "DreadPoor/Blunt_Edge-8B-SLERP", + "name": "Blunt_Edge-8B-SLERP", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7497, + "hfopenllm_v2/BBH": 0.5389, + "hfopenllm_v2/MATH Level 5": 0.1858, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4174, + "hfopenllm_v2/MMLU-PRO": 0.3767 + } + }, + { + "id": "DreadPoor/BulkUp", + "name": "BulkUp", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1778, + "hfopenllm_v2/BBH": 0.287, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2475, + "hfopenllm_v2/MUSR": 0.3447, + "hfopenllm_v2/MMLU-PRO": 0.111 + } + }, + { + "id": "DreadPoor/Cadence-8B-LINEAR", + "name": "Cadence-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7682, + "hfopenllm_v2/BBH": 0.5433, + "hfopenllm_v2/MATH Level 5": 0.1677, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4173, + "hfopenllm_v2/MMLU-PRO": 0.3803 + } + }, + { + "id": "DreadPoor/Caelid-8B-Model_Stock", + "name": "Caelid-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7247, + "hfopenllm_v2/BBH": 0.546, + "hfopenllm_v2/MATH Level 5": 0.1511, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4001, + "hfopenllm_v2/MMLU-PRO": 0.3816 + } + }, + { + "id": "DreadPoor/Casuar-9B-Model_Stock", + "name": "Casuar-9B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7765, + "hfopenllm_v2/BBH": 0.6107, + "hfopenllm_v2/MATH Level 5": 0.213, + "hfopenllm_v2/GPQA": 0.3448, + "hfopenllm_v2/MUSR": 0.4165, + "hfopenllm_v2/MMLU-PRO": 0.4156 + } + }, + { + "id": "DreadPoor/Condensed_Milk-8B-Model_Stock", + "name": "Condensed_Milk-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7536, + "hfopenllm_v2/BBH": 0.5435, + "hfopenllm_v2/MATH Level 5": 0.1745, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.416, + "hfopenllm_v2/MMLU-PRO": 0.3876 + } + }, + { + "id": "DreadPoor/CoolerCoder-8B-LINEAR", + "name": "CoolerCoder-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4519, + "hfopenllm_v2/BBH": 0.4762, + "hfopenllm_v2/MATH Level 5": 0.0793, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3964, + "hfopenllm_v2/MMLU-PRO": 0.3159 + } + }, + { + "id": "DreadPoor/Damasteel-8B-LINEAR", + "name": "Damasteel-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7384, + "hfopenllm_v2/BBH": 0.5388, + "hfopenllm_v2/MATH Level 5": 0.1669, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4212, + "hfopenllm_v2/MMLU-PRO": 0.3779 + } + }, + { + "id": "DreadPoor/Dearly_Beloved-8B-TIES", + "name": "Dearly_Beloved-8B-TIES", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8267, + "hfopenllm_v2/BBH": 0.405, + "hfopenllm_v2/MATH Level 5": 0.2115, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4175, + "hfopenllm_v2/MMLU-PRO": 0.2827 + } + }, + { + "id": "DreadPoor/Decayed-8B-LINEAR", + "name": "Decayed-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7676, + "hfopenllm_v2/BBH": 0.5417, + "hfopenllm_v2/MATH Level 5": 0.1715, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.3763 + } + }, + { + "id": "DreadPoor/Derivative-8B-Model_Stock", + "name": "Derivative-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7667, + "hfopenllm_v2/BBH": 0.5395, + "hfopenllm_v2/MATH Level 5": 0.179, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.42, + "hfopenllm_v2/MMLU-PRO": 0.3811 + } + }, + { + "id": "DreadPoor/Derivative_V2-8B-Model_Stock", + "name": "Derivative_V2-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7537, + "hfopenllm_v2/BBH": 0.5393, + "hfopenllm_v2/MATH Level 5": 0.1798, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4123, + "hfopenllm_v2/MMLU-PRO": 0.3856 + } + }, + { + "id": "DreadPoor/Derivative_V2_ALT-8B-Model_Stock", + "name": "Derivative_V2_ALT-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.772, + "hfopenllm_v2/BBH": 0.5365, + "hfopenllm_v2/MATH Level 5": 0.1881, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4135, + "hfopenllm_v2/MMLU-PRO": 0.3882 + } + }, + { + "id": "DreadPoor/Derivative_V3-8B-Model_Stock", + "name": "Derivative_V3-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6964, + "hfopenllm_v2/BBH": 0.5243, + "hfopenllm_v2/MATH Level 5": 0.1465, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.415, + "hfopenllm_v2/MMLU-PRO": 0.3502 + } + }, + { + "id": "DreadPoor/Elusive_Dragon_Heart-8B-LINEAR", + "name": "Elusive_Dragon_Heart-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7131, + "hfopenllm_v2/BBH": 0.5456, + "hfopenllm_v2/MATH Level 5": 0.148, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4146, + "hfopenllm_v2/MMLU-PRO": 0.3814 + } + }, + { + "id": "DreadPoor/Emu_Eggs-9B-Model_Stock", + "name": "Emu_Eggs-9B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7607, + "hfopenllm_v2/BBH": 0.6052, + "hfopenllm_v2/MATH Level 5": 0.21, + "hfopenllm_v2/GPQA": 0.3331, + "hfopenllm_v2/MUSR": 0.4071, + "hfopenllm_v2/MMLU-PRO": 0.4227 + } + }, + { + "id": "DreadPoor/Eunoia_Vespera-8B-LINEAR", + "name": "Eunoia_Vespera-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7235, + "hfopenllm_v2/BBH": 0.5399, + "hfopenllm_v2/MATH Level 5": 0.1541, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4185, + "hfopenllm_v2/MMLU-PRO": 0.3839 + } + }, + { + "id": "DreadPoor/Fu_sion_HA-8B-SLERP", + "name": "Fu_sion_HA-8B-SLERP", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7609, + "hfopenllm_v2/BBH": 0.5373, + "hfopenllm_v2/MATH Level 5": 0.1752, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.416, + "hfopenllm_v2/MMLU-PRO": 0.3825 + } + }, + { + "id": "DreadPoor/HOT_STINKING_GARBAGE", + "name": "HOT_STINKING_GARBAGE", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5754, + "hfopenllm_v2/BBH": 0.4884, + "hfopenllm_v2/MATH Level 5": 0.0672, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.425, + "hfopenllm_v2/MMLU-PRO": 0.3017 + } + }, + { + "id": "DreadPoor/H_the_eighth-8B-LINEAR", + "name": "H_the_eighth-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7469, + "hfopenllm_v2/BBH": 0.5384, + "hfopenllm_v2/MATH Level 5": 0.1775, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4173, + "hfopenllm_v2/MMLU-PRO": 0.3824 + } + }, + { + "id": "DreadPoor/Happy_New_Year-8B-Model_Stock", + "name": "Happy_New_Year-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7616, + "hfopenllm_v2/BBH": 0.5368, + "hfopenllm_v2/MATH Level 5": 0.1594, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.3879 + } + }, + { + "id": "DreadPoor/Heart_Stolen-8B-Model_Stock", + "name": "Heart_Stolen-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7245, + "hfopenllm_v2/BBH": 0.5395, + "hfopenllm_v2/MATH Level 5": 0.1722, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4162, + "hfopenllm_v2/MMLU-PRO": 0.3794 + } + }, + { + "id": "DreadPoor/Heart_Stolen-ALT-8B-Model_Stock", + "name": "Heart_Stolen-ALT-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7184, + "hfopenllm_v2/BBH": 0.5263, + "hfopenllm_v2/MATH Level 5": 0.1563, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4055, + "hfopenllm_v2/MMLU-PRO": 0.3772 + } + }, + { + "id": "DreadPoor/Here_We_Go_Again-8B-SLERP", + "name": "Here_We_Go_Again-8B-SLERP", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7442, + "hfopenllm_v2/BBH": 0.546, + "hfopenllm_v2/MATH Level 5": 0.173, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4187, + "hfopenllm_v2/MMLU-PRO": 0.3873 + } + }, + { + "id": "DreadPoor/Howdy-8B-LINEAR", + "name": "Howdy-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7378, + "hfopenllm_v2/BBH": 0.5384, + "hfopenllm_v2/MATH Level 5": 0.1775, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4121, + "hfopenllm_v2/MMLU-PRO": 0.3807 + } + }, + { + "id": "DreadPoor/Incidental-8B-Model_Stock", + "name": "Incidental-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7482, + "hfopenllm_v2/BBH": 0.5452, + "hfopenllm_v2/MATH Level 5": 0.1616, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.424, + "hfopenllm_v2/MMLU-PRO": 0.3873 + } + }, + { + "id": "DreadPoor/Irina-8B-model_stock", + "name": "Irina-8B-model_stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6799, + "hfopenllm_v2/BBH": 0.5237, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4003, + "hfopenllm_v2/MMLU-PRO": 0.3574 + } + }, + { + "id": "DreadPoor/Kindling-8B-Model_Stock", + "name": "Kindling-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7308, + "hfopenllm_v2/BBH": 0.5492, + "hfopenllm_v2/MATH Level 5": 0.1752, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.4068, + "hfopenllm_v2/MMLU-PRO": 0.383 + } + }, + { + "id": "DreadPoor/L3.1-BaeZel-8B-Della", + "name": "L3.1-BaeZel-8B-Della", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.518, + "hfopenllm_v2/BBH": 0.5448, + "hfopenllm_v2/MATH Level 5": 0.1745, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.42, + "hfopenllm_v2/MMLU-PRO": 0.3902 + } + }, + { + "id": "DreadPoor/Laughing_Stock-8B-Model_Stock", + "name": "Laughing_Stock-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.719, + "hfopenllm_v2/BBH": 0.5449, + "hfopenllm_v2/MATH Level 5": 0.1579, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4146, + "hfopenllm_v2/MMLU-PRO": 0.3764 + } + }, + { + "id": "DreadPoor/Lava_Lamp-8B-SLERP", + "name": "Lava_Lamp-8B-SLERP", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7381, + "hfopenllm_v2/BBH": 0.5368, + "hfopenllm_v2/MATH Level 5": 0.1737, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4187, + "hfopenllm_v2/MMLU-PRO": 0.375 + } + }, + { + "id": "DreadPoor/LemonP-8B-Model_Stock", + "name": "LemonP-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7676, + "hfopenllm_v2/BBH": 0.5439, + "hfopenllm_v2/MATH Level 5": 0.1767, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4081, + "hfopenllm_v2/MMLU-PRO": 0.4004 + } + }, + { + "id": "DreadPoor/Lydia_of_Whiterun-8B-LINEAR", + "name": "Lydia_of_Whiterun-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7603, + "hfopenllm_v2/BBH": 0.538, + "hfopenllm_v2/MATH Level 5": 0.1767, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4251, + "hfopenllm_v2/MMLU-PRO": 0.3801 + } + }, + { + "id": "DreadPoor/Matryoshka-8B-LINEAR", + "name": "Matryoshka-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7263, + "hfopenllm_v2/BBH": 0.5444, + "hfopenllm_v2/MATH Level 5": 0.1752, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4252, + "hfopenllm_v2/MMLU-PRO": 0.3866 + } + }, + { + "id": "DreadPoor/Mercury_In_Retrograde-8b-Model-Stock", + "name": "Mercury_In_Retrograde-8b-Model-Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7296, + "hfopenllm_v2/BBH": 0.5391, + "hfopenllm_v2/MATH Level 5": 0.1647, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4199, + "hfopenllm_v2/MMLU-PRO": 0.3829 + } + }, + { + "id": "DreadPoor/Minthy-8B-Model_Stock", + "name": "Minthy-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7658, + "hfopenllm_v2/BBH": 0.5353, + "hfopenllm_v2/MATH Level 5": 0.1918, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4094, + "hfopenllm_v2/MMLU-PRO": 0.3993 + } + }, + { + "id": "DreadPoor/Minthy_ALT-8B-Model_Stock", + "name": "Minthy_ALT-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6992, + "hfopenllm_v2/BBH": 0.5375, + "hfopenllm_v2/MATH Level 5": 0.176, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4225, + "hfopenllm_v2/MMLU-PRO": 0.3674 + } + }, + { + "id": "DreadPoor/Minthy_V2-8B-Model_Stock", + "name": "Minthy_V2-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7126, + "hfopenllm_v2/BBH": 0.5491, + "hfopenllm_v2/MATH Level 5": 0.1594, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4199, + "hfopenllm_v2/MMLU-PRO": 0.3737 + } + }, + { + "id": "DreadPoor/Minus_Penus-8B-Model_Stock", + "name": "Minus_Penus-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7311, + "hfopenllm_v2/BBH": 0.5344, + "hfopenllm_v2/MATH Level 5": 0.2002, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4019, + "hfopenllm_v2/MMLU-PRO": 0.3752 + } + }, + { + "id": "DreadPoor/Morphing-8B-Model_Stock", + "name": "Morphing-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7445, + "hfopenllm_v2/BBH": 0.5397, + "hfopenllm_v2/MATH Level 5": 0.1888, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4069, + "hfopenllm_v2/MMLU-PRO": 0.3852 + } + }, + { + "id": "DreadPoor/Not_Even_My_Final_Form-8B-Model_Stock", + "name": "Not_Even_My_Final_Form-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7722, + "hfopenllm_v2/BBH": 0.5351, + "hfopenllm_v2/MATH Level 5": 0.176, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4147, + "hfopenllm_v2/MMLU-PRO": 0.384 + } + }, + { + "id": "DreadPoor/Nother_One-8B-Model_Stock", + "name": "Nother_One-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6863, + "hfopenllm_v2/BBH": 0.5205, + "hfopenllm_v2/MATH Level 5": 0.1518, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.387, + "hfopenllm_v2/MMLU-PRO": 0.3595 + } + }, + { + "id": "DreadPoor/Noxis-8B-LINEAR", + "name": "Noxis-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6913, + "hfopenllm_v2/BBH": 0.5421, + "hfopenllm_v2/MATH Level 5": 0.1979, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4231, + "hfopenllm_v2/MMLU-PRO": 0.366 + } + }, + { + "id": "DreadPoor/Nullsworn-12B-LINEAR", + "name": "Nullsworn-12B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4436, + "hfopenllm_v2/BBH": 0.5483, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.435, + "hfopenllm_v2/MMLU-PRO": 0.3645 + } + }, + { + "id": "DreadPoor/Nwah-8B-Model_Stock", + "name": "Nwah-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7716, + "hfopenllm_v2/BBH": 0.5384, + "hfopenllm_v2/MATH Level 5": 0.1798, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4039, + "hfopenllm_v2/MMLU-PRO": 0.3807 + } + }, + { + "id": "DreadPoor/ONeil-model_stock-8B", + "name": "ONeil-model_stock-8B", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6786, + "hfopenllm_v2/BBH": 0.5548, + "hfopenllm_v2/MATH Level 5": 0.1012, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4173, + "hfopenllm_v2/MMLU-PRO": 0.3599 + } + }, + { + "id": "DreadPoor/Oh_Boy-8B-LINEAR", + "name": "Oh_Boy-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7503, + "hfopenllm_v2/BBH": 0.5375, + "hfopenllm_v2/MATH Level 5": 0.1782, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4108, + "hfopenllm_v2/MMLU-PRO": 0.3849 + } + }, + { + "id": "DreadPoor/OrangeJ-8B-Model_Stock", + "name": "OrangeJ-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7841, + "hfopenllm_v2/BBH": 0.5413, + "hfopenllm_v2/MATH Level 5": 0.176, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4028, + "hfopenllm_v2/MMLU-PRO": 0.3969 + } + }, + { + "id": "DreadPoor/Promissum_Mane-8B-LINEAR", + "name": "Promissum_Mane-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.715, + "hfopenllm_v2/BBH": 0.5458, + "hfopenllm_v2/MATH Level 5": 0.1556, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.42, + "hfopenllm_v2/MMLU-PRO": 0.3851 + } + }, + { + "id": "DreadPoor/Promissum_Mane-8B-LINEAR-lorablated", + "name": "Promissum_Mane-8B-LINEAR-lorablated", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7156, + "hfopenllm_v2/BBH": 0.5435, + "hfopenllm_v2/MATH Level 5": 0.1533, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4198, + "hfopenllm_v2/MMLU-PRO": 0.3739 + } + }, + { + "id": "DreadPoor/RPMash-8B-Model_Stock", + "name": "RPMash-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4564, + "hfopenllm_v2/BBH": 0.5169, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4054, + "hfopenllm_v2/MMLU-PRO": 0.3604 + } + }, + { + "id": "DreadPoor/RPMash_V3-8B-Model_Stock", + "name": "RPMash_V3-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7049, + "hfopenllm_v2/BBH": 0.5217, + "hfopenllm_v2/MATH Level 5": 0.1042, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.3778, + "hfopenllm_v2/MMLU-PRO": 0.3614 + } + }, + { + "id": "DreadPoor/Rusted_Gold-8B-LINEAR", + "name": "Rusted_Gold-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7296, + "hfopenllm_v2/BBH": 0.5387, + "hfopenllm_v2/MATH Level 5": 0.1934, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4178, + "hfopenllm_v2/MMLU-PRO": 0.378 + } + }, + { + "id": "DreadPoor/Rusted_Platinum-8B-LINEAR", + "name": "Rusted_Platinum-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.718, + "hfopenllm_v2/BBH": 0.5428, + "hfopenllm_v2/MATH Level 5": 0.1722, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3967, + "hfopenllm_v2/MMLU-PRO": 0.373 + } + }, + { + "id": "DreadPoor/Rusted_Platinum-8B-Model_Stock", + "name": "Rusted_Platinum-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4408, + "hfopenllm_v2/BBH": 0.5243, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3741, + "hfopenllm_v2/MMLU-PRO": 0.3546 + } + }, + { + "id": "DreadPoor/Sellen-8B-model_stock", + "name": "Sellen-8B-model_stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7113, + "hfopenllm_v2/BBH": 0.5232, + "hfopenllm_v2/MATH Level 5": 0.1337, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.396, + "hfopenllm_v2/MMLU-PRO": 0.357 + } + }, + { + "id": "DreadPoor/Something-8B-Model_Stock", + "name": "Something-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5043, + "hfopenllm_v2/BBH": 0.5395, + "hfopenllm_v2/MATH Level 5": 0.1798, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4187, + "hfopenllm_v2/MMLU-PRO": 0.3885 + } + }, + { + "id": "DreadPoor/Spring_Dusk-8B-SCE", + "name": "Spring_Dusk-8B-SCE", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6515, + "hfopenllm_v2/BBH": 0.5635, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.46, + "hfopenllm_v2/MMLU-PRO": 0.3436 + } + }, + { + "id": "DreadPoor/Summer_Dawn-8B-SCE", + "name": "Summer_Dawn-8B-SCE", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6642, + "hfopenllm_v2/BBH": 0.5391, + "hfopenllm_v2/MATH Level 5": 0.1722, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.412, + "hfopenllm_v2/MMLU-PRO": 0.3753 + } + }, + { + "id": "DreadPoor/Summer_Dusk-8B-TIES", + "name": "Summer_Dusk-8B-TIES", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4922, + "hfopenllm_v2/BBH": 0.536, + "hfopenllm_v2/MATH Level 5": 0.1805, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4267, + "hfopenllm_v2/MMLU-PRO": 0.3856 + } + }, + { + "id": "DreadPoor/Summer_Rain-8B-SCE", + "name": "Summer_Rain-8B-SCE", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5459, + "hfopenllm_v2/BBH": 0.5846, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4477, + "hfopenllm_v2/MMLU-PRO": 0.3551 + } + }, + { + "id": "DreadPoor/Summer_Rain-8B-TIES", + "name": "Summer_Rain-8B-TIES", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5444, + "hfopenllm_v2/BBH": 0.5846, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4477, + "hfopenllm_v2/MMLU-PRO": 0.3551 + } + }, + { + "id": "DreadPoor/Sun-8B-Model_Stock", + "name": "Sun-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7758, + "hfopenllm_v2/BBH": 0.5264, + "hfopenllm_v2/MATH Level 5": 0.21, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4098, + "hfopenllm_v2/MMLU-PRO": 0.3835 + } + }, + { + "id": "DreadPoor/Sweetened_Condensed_Milk-8B-Model_Stock", + "name": "Sweetened_Condensed_Milk-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7417, + "hfopenllm_v2/BBH": 0.5406, + "hfopenllm_v2/MATH Level 5": 0.1873, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4107, + "hfopenllm_v2/MMLU-PRO": 0.3848 + } + }, + { + "id": "DreadPoor/TEST02-Ignore", + "name": "TEST02-Ignore", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6119, + "hfopenllm_v2/BBH": 0.5602, + "hfopenllm_v2/MATH Level 5": 0.0869, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4199, + "hfopenllm_v2/MMLU-PRO": 0.3468 + } + }, + { + "id": "DreadPoor/TEST03-ignore", + "name": "TEST03-ignore", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6967, + "hfopenllm_v2/BBH": 0.5383, + "hfopenllm_v2/MATH Level 5": 0.1654, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.3789 + } + }, + { + "id": "DreadPoor/TEST06-ignore", + "name": "TEST06-ignore", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7323, + "hfopenllm_v2/BBH": 0.5509, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4225, + "hfopenllm_v2/MMLU-PRO": 0.3615 + } + }, + { + "id": "DreadPoor/TEST07-ignore", + "name": "TEST07-ignore", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.74, + "hfopenllm_v2/BBH": 0.5561, + "hfopenllm_v2/MATH Level 5": 0.1662, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4094, + "hfopenllm_v2/MMLU-PRO": 0.388 + } + }, + { + "id": "DreadPoor/TEST08-ignore", + "name": "TEST08-ignore", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7467, + "hfopenllm_v2/BBH": 0.5454, + "hfopenllm_v2/MATH Level 5": 0.182, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4081, + "hfopenllm_v2/MMLU-PRO": 0.3853 + } + }, + { + "id": "DreadPoor/Trinas_Nectar-8B-model_stock", + "name": "Trinas_Nectar-8B-model_stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7259, + "hfopenllm_v2/BBH": 0.5256, + "hfopenllm_v2/MATH Level 5": 0.1526, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4068, + "hfopenllm_v2/MMLU-PRO": 0.3618 + } + }, + { + "id": "DreadPoor/UNTESTED-VENN_1.2-8B-Model_Stock", + "name": "UNTESTED-VENN_1.2-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4718, + "hfopenllm_v2/BBH": 0.5475, + "hfopenllm_v2/MATH Level 5": 0.1541, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.4449, + "hfopenllm_v2/MMLU-PRO": 0.3787 + } + }, + { + "id": "DreadPoor/VENN_1.2-8B-Model_Stock", + "name": "VENN_1.2-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7226, + "hfopenllm_v2/BBH": 0.5459, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.42, + "hfopenllm_v2/MMLU-PRO": 0.3721 + } + }, + { + "id": "DreadPoor/WIP-Acacia-8B-Model_Stock", + "name": "WIP-Acacia-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6246, + "hfopenllm_v2/BBH": 0.5195, + "hfopenllm_v2/MATH Level 5": 0.1669, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4226, + "hfopenllm_v2/MMLU-PRO": 0.3737 + } + }, + { + "id": "DreadPoor/WIP_Damascus-8B-TIES", + "name": "WIP_Damascus-8B-TIES", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4776, + "hfopenllm_v2/BBH": 0.5411, + "hfopenllm_v2/MATH Level 5": 0.1654, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4119, + "hfopenllm_v2/MMLU-PRO": 0.3761 + } + }, + { + "id": "DreadPoor/Wannabe-8B-Model_Stock", + "name": "Wannabe-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7205, + "hfopenllm_v2/BBH": 0.539, + "hfopenllm_v2/MATH Level 5": 0.1775, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4135, + "hfopenllm_v2/MMLU-PRO": 0.3831 + } + }, + { + "id": "DreadPoor/What_A_Thrill-8B-Model_Stock", + "name": "What_A_Thrill-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7064, + "hfopenllm_v2/BBH": 0.5311, + "hfopenllm_v2/MATH Level 5": 0.182, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.408, + "hfopenllm_v2/MMLU-PRO": 0.3615 + } + }, + { + "id": "DreadPoor/Winter-8B-SCE", + "name": "Winter-8B-SCE", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7536, + "hfopenllm_v2/BBH": 0.5262, + "hfopenllm_v2/MATH Level 5": 0.1918, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4071, + "hfopenllm_v2/MMLU-PRO": 0.3839 + } + }, + { + "id": "DreadPoor/Winter_Dawn-8B-TIES", + "name": "Winter_Dawn-8B-TIES", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5496, + "hfopenllm_v2/BBH": 0.5309, + "hfopenllm_v2/MATH Level 5": 0.1858, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4279, + "hfopenllm_v2/MMLU-PRO": 0.391 + } + }, + { + "id": "DreadPoor/Winter_Dusk-8B-TIES", + "name": "Winter_Dusk-8B-TIES", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7153, + "hfopenllm_v2/BBH": 0.4952, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.3688, + "hfopenllm_v2/MMLU-PRO": 0.3478 + } + }, + { + "id": "DreadPoor/Winter_Night-8B-Model_Stock", + "name": "Winter_Night-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.704, + "hfopenllm_v2/BBH": 0.5185, + "hfopenllm_v2/MATH Level 5": 0.1458, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.3914, + "hfopenllm_v2/MMLU-PRO": 0.3666 + } + }, + { + "id": "DreadPoor/Yafune-8B-Model_Stock", + "name": "Yafune-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7533, + "hfopenllm_v2/BBH": 0.5467, + "hfopenllm_v2/MATH Level 5": 0.1662, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.4173, + "hfopenllm_v2/MMLU-PRO": 0.3851 + } + }, + { + "id": "DreadPoor/Yearn_V3-8B-Model_Stock", + "name": "Yearn_V3-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.729, + "hfopenllm_v2/BBH": 0.5322, + "hfopenllm_v2/MATH Level 5": 0.1896, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.3909, + "hfopenllm_v2/MMLU-PRO": 0.3802 + } + }, + { + "id": "DreadPoor/ZEUS-8B-V17-Abliterated_ALT", + "name": "ZEUS-8B-V17-Abliterated_ALT", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5511, + "hfopenllm_v2/BBH": 0.5231, + "hfopenllm_v2/MATH Level 5": 0.1903, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4149, + "hfopenllm_v2/MMLU-PRO": 0.389 + } + }, + { + "id": "DreadPoor/Zelus-8B-Model_Stock", + "name": "Zelus-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7788, + "hfopenllm_v2/BBH": 0.5307, + "hfopenllm_v2/MATH Level 5": 0.1647, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4214, + "hfopenllm_v2/MMLU-PRO": 0.3841 + } + }, + { + "id": "DreadPoor/Zelus_V2-8B-Model_Stock", + "name": "Zelus_V2-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7898, + "hfopenllm_v2/BBH": 0.5345, + "hfopenllm_v2/MATH Level 5": 0.2054, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.3961, + "hfopenllm_v2/MMLU-PRO": 0.3833 + } + }, + { + "id": "DreadPoor/felix_dies-mistral-7B-model_stock", + "name": "felix_dies-mistral-7B-model_stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3008, + "hfopenllm_v2/BBH": 0.4901, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4518, + "hfopenllm_v2/MMLU-PRO": 0.3109 + } + }, + { + "id": "DreadPoor/hakuchido-8B-MODEL_STOCK", + "name": "hakuchido-8B-MODEL_STOCK", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7375, + "hfopenllm_v2/BBH": 0.5398, + "hfopenllm_v2/MATH Level 5": 0.1949, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4175, + "hfopenllm_v2/MMLU-PRO": 0.3782 + } + }, + { + "id": "DreadPoor/ichor-8B-Model_Stock", + "name": "ichor-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5386, + "hfopenllm_v2/BBH": 0.5084, + "hfopenllm_v2/MATH Level 5": 0.1088, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4212, + "hfopenllm_v2/MMLU-PRO": 0.3151 + } + }, + { + "id": "DreadPoor/ichor_1.1-8B-Model_Stock", + "name": "ichor_1.1-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8096, + "hfopenllm_v2/BBH": 0.5281, + "hfopenllm_v2/MATH Level 5": 0.1775, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4068, + "hfopenllm_v2/MMLU-PRO": 0.3856 + } + }, + { + "id": "DreadPoor/inexpertus-8B-Model_Stock", + "name": "inexpertus-8B-Model_Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7795, + "hfopenllm_v2/BBH": 0.528, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4118, + "hfopenllm_v2/MMLU-PRO": 0.3791 + } + }, + { + "id": "DreadPoor/inexpertus_1.1-8B-LINEAR", + "name": "inexpertus_1.1-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7527, + "hfopenllm_v2/BBH": 0.5525, + "hfopenllm_v2/MATH Level 5": 0.173, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4173, + "hfopenllm_v2/MMLU-PRO": 0.3827 + } + }, + { + "id": "DreadPoor/inexpertus_1.2-8B-LINEAR", + "name": "inexpertus_1.2-8B-LINEAR", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7348, + "hfopenllm_v2/BBH": 0.5523, + "hfopenllm_v2/MATH Level 5": 0.1586, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4133, + "hfopenllm_v2/MMLU-PRO": 0.3788 + } + }, + { + "id": "DreadPoor/mergekit-nuslerp-nqzkedi", + "name": "mergekit-nuslerp-nqzkedi", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7765, + "hfopenllm_v2/BBH": 0.5362, + "hfopenllm_v2/MATH Level 5": 0.1881, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4225, + "hfopenllm_v2/MMLU-PRO": 0.3919 + } + }, + { + "id": "DreadPoor/remember_to_breathe-8b-Model-Stock", + "name": "remember_to_breathe-8b-Model-Stock", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7104, + "hfopenllm_v2/BBH": 0.5412, + "hfopenllm_v2/MATH Level 5": 0.1488, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4145, + "hfopenllm_v2/MMLU-PRO": 0.3761 + } + }, + { + "id": "DreadPoor/test", + "name": "test", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4937, + "hfopenllm_v2/BBH": 0.5372, + "hfopenllm_v2/MATH Level 5": 0.1934, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.4351, + "hfopenllm_v2/MMLU-PRO": 0.3647 + } + }, + { + "id": "DreadPoor/test_ALT", + "name": "test_ALT", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4997, + "hfopenllm_v2/BBH": 0.537, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.4363, + "hfopenllm_v2/MMLU-PRO": 0.3492 + } + }, + { + "id": "DreadPoor/tests_pending-do_not_use_yet", + "name": "tests_pending-do_not_use_yet", + "developer": "DreadPoor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7691, + "hfopenllm_v2/BBH": 0.5408, + "hfopenllm_v2/MATH Level 5": 0.1979, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4005, + "hfopenllm_v2/MMLU-PRO": 0.3827 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ECE-ILAB-PRYMMAL.json b/data/developers/ECE-ILAB-PRYMMAL.json new file mode 100644 index 0000000000000000000000000000000000000000..e19af4e3564183feba04bb48b33074e8d50df87d --- /dev/null +++ b/data/developers/ECE-ILAB-PRYMMAL.json @@ -0,0 +1,19 @@ +{ + "developer": "ECE-ILAB-PRYMMAL", + "models": [ + { + "id": "ECE-ILAB-PRYMMAL/ILAB-Merging-3B-V2", + "name": "ILAB-Merging-3B-V2", + "developer": "ECE-ILAB-PRYMMAL", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4029, + "hfopenllm_v2/BBH": 0.5402, + "hfopenllm_v2/MATH Level 5": 0.1518, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4332, + "hfopenllm_v2/MMLU-PRO": 0.3861 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/EVA-UNIT-01.json b/data/developers/EVA-UNIT-01.json new file mode 100644 index 0000000000000000000000000000000000000000..95e29158f9545d9cb83c5139331a60df8a36ee69 --- /dev/null +++ b/data/developers/EVA-UNIT-01.json @@ -0,0 +1,33 @@ +{ + "developer": "EVA-UNIT-01", + "models": [ + { + "id": "EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2", + "name": "EVA-Qwen2.5-14B-v0.2", + "developer": "EVA-UNIT-01", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4038, + "hfopenllm_v2/BBH": 0.609, + "hfopenllm_v2/MATH Level 5": 0.3406, + "hfopenllm_v2/GPQA": 0.3943, + "hfopenllm_v2/MUSR": 0.4794, + "hfopenllm_v2/MMLU-PRO": 0.5135 + } + }, + { + "id": "EVA-UNIT-01/EVA-Qwen2.5-72B-v0.2", + "name": "EVA-Qwen2.5-72B-v0.2", + "developer": "EVA-UNIT-01", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6879, + "hfopenllm_v2/BBH": 0.7088, + "hfopenllm_v2/MATH Level 5": 0.4313, + "hfopenllm_v2/GPQA": 0.4086, + "hfopenllm_v2/MUSR": 0.472, + "hfopenllm_v2/MMLU-PRO": 0.5813 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Edgerunners.json b/data/developers/Edgerunners.json new file mode 100644 index 0000000000000000000000000000000000000000..4246bbe52b0a73a5b3b0b86aa437a799af1db85b --- /dev/null +++ b/data/developers/Edgerunners.json @@ -0,0 +1,19 @@ +{ + "developer": "Edgerunners", + "models": [ + { + "id": "Edgerunners/meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16", + "name": "meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16", + "developer": "Edgerunners", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7147, + "hfopenllm_v2/BBH": 0.498, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3342, + "hfopenllm_v2/MMLU-PRO": 0.3636 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/EleutherAI.json b/data/developers/EleutherAI.json new file mode 100644 index 0000000000000000000000000000000000000000..3581e281f97cadb3a960db5014920f2d801e0606 --- /dev/null +++ b/data/developers/EleutherAI.json @@ -0,0 +1,173 @@ +{ + "developer": "EleutherAI", + "models": [ + { + "id": "EleutherAI/gpt-j-6b", + "name": "gpt-j-6b", + "developer": "EleutherAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2522, + "hfopenllm_v2/BBH": 0.3191, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.3658, + "hfopenllm_v2/MMLU-PRO": 0.1241 + } + }, + { + "id": "EleutherAI/gpt-neo-1.3B", + "name": "gpt-neo-1.3B", + "developer": "EleutherAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2079, + "hfopenllm_v2/BBH": 0.3039, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3817, + "hfopenllm_v2/MMLU-PRO": 0.1164 + } + }, + { + "id": "EleutherAI/gpt-neo-125m", + "name": "gpt-neo-125m", + "developer": "EleutherAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1905, + "hfopenllm_v2/BBH": 0.3115, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3593, + "hfopenllm_v2/MMLU-PRO": 0.1026 + } + }, + { + "id": "EleutherAI/gpt-neo-2.7B", + "name": "gpt-neo-2.7B", + "developer": "EleutherAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.259, + "hfopenllm_v2/BBH": 0.314, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3554, + "hfopenllm_v2/MMLU-PRO": 0.1163 + } + }, + { + "id": "EleutherAI/gpt-neox-20b", + "name": "gpt-neox-20b", + "developer": "EleutherAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2587, + "hfopenllm_v2/BBH": 0.3165, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2433, + "hfopenllm_v2/MUSR": 0.3647, + "hfopenllm_v2/MMLU-PRO": 0.1155 + } + }, + { + "id": "EleutherAI/pythia-1.4b", + "name": "pythia-1.4b", + "developer": "EleutherAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2371, + "hfopenllm_v2/BBH": 0.315, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3538, + "hfopenllm_v2/MMLU-PRO": 0.1123 + } + }, + { + "id": "EleutherAI/pythia-12b", + "name": "pythia-12b", + "developer": "EleutherAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2471, + "hfopenllm_v2/BBH": 0.318, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2466, + "hfopenllm_v2/MUSR": 0.3647, + "hfopenllm_v2/MMLU-PRO": 0.1109 + } + }, + { + "id": "EleutherAI/pythia-160m", + "name": "pythia-160m", + "developer": "EleutherAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1816, + "hfopenllm_v2/BBH": 0.297, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.4179, + "hfopenllm_v2/MMLU-PRO": 0.112 + } + }, + { + "id": "EleutherAI/pythia-1b", + "name": "pythia-1b", + "developer": "EleutherAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2208, + "hfopenllm_v2/BBH": 0.3004, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3552, + "hfopenllm_v2/MMLU-PRO": 0.1136 + } + }, + { + "id": "EleutherAI/pythia-2.8b", + "name": "pythia-2.8b", + "developer": "EleutherAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2173, + "hfopenllm_v2/BBH": 0.3224, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3486, + "hfopenllm_v2/MMLU-PRO": 0.1137 + } + }, + { + "id": "EleutherAI/pythia-410m", + "name": "pythia-410m", + "developer": "EleutherAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2195, + "hfopenllm_v2/BBH": 0.3028, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3578, + "hfopenllm_v2/MMLU-PRO": 0.1128 + } + }, + { + "id": "EleutherAI/pythia-6.9b", + "name": "pythia-6.9b", + "developer": "EleutherAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2281, + "hfopenllm_v2/BBH": 0.3232, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3591, + "hfopenllm_v2/MMLU-PRO": 0.1147 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Enno-Ai.json b/data/developers/Enno-Ai.json new file mode 100644 index 0000000000000000000000000000000000000000..81beef5d546a44b48c4a4a47dc0960691b5c38e9 --- /dev/null +++ b/data/developers/Enno-Ai.json @@ -0,0 +1,61 @@ +{ + "developer": "Enno-Ai", + "models": [ + { + "id": "Enno-Ai/EnnoAi-Pro-French-Llama-3-8B-v0.4", + "name": "EnnoAi-Pro-French-Llama-3-8B-v0.4", + "developer": "Enno-Ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4189, + "hfopenllm_v2/BBH": 0.4075, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.417, + "hfopenllm_v2/MMLU-PRO": 0.2635 + } + }, + { + "id": "Enno-Ai/EnnoAi-Pro-Llama-3-8B", + "name": "EnnoAi-Pro-Llama-3-8B", + "developer": "Enno-Ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3195, + "hfopenllm_v2/BBH": 0.4152, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.4071, + "hfopenllm_v2/MMLU-PRO": 0.2151 + } + }, + { + "id": "Enno-Ai/EnnoAi-Pro-Llama-3-8B-v0.3", + "name": "EnnoAi-Pro-Llama-3-8B-v0.3", + "developer": "Enno-Ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5083, + "hfopenllm_v2/BBH": 0.4101, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.4236, + "hfopenllm_v2/MMLU-PRO": 0.299 + } + }, + { + "id": "Enno-Ai/EnnoAi-Pro-Llama-3.1-8B-v0.9", + "name": "EnnoAi-Pro-Llama-3.1-8B-v0.9", + "developer": "Enno-Ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4689, + "hfopenllm_v2/BBH": 0.416, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3832, + "hfopenllm_v2/MMLU-PRO": 0.2596 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/EnnoAi.json b/data/developers/EnnoAi.json new file mode 100644 index 0000000000000000000000000000000000000000..bb9a9b10584d6bf298bca59b742b8303b6f54bec --- /dev/null +++ b/data/developers/EnnoAi.json @@ -0,0 +1,33 @@ +{ + "developer": "EnnoAi", + "models": [ + { + "id": "EnnoAi/EnnoAi-7B-French-Instruct-202502", + "name": "EnnoAi-7B-French-Instruct-202502", + "developer": "EnnoAi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5564, + "hfopenllm_v2/BBH": 0.5575, + "hfopenllm_v2/MATH Level 5": 0.3724, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.46, + "hfopenllm_v2/MMLU-PRO": 0.4013 + } + }, + { + "id": "EnnoAi/EnnoAi-Pro-Llama-3.1-8B-v1.0", + "name": "EnnoAi-Pro-Llama-3.1-8B-v1.0", + "developer": "EnnoAi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4704, + "hfopenllm_v2/BBH": 0.416, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3832, + "hfopenllm_v2/MMLU-PRO": 0.2596 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Epiculous.json b/data/developers/Epiculous.json new file mode 100644 index 0000000000000000000000000000000000000000..c8128b33bb4fd177a979cf60b3ec6be0ca17039a --- /dev/null +++ b/data/developers/Epiculous.json @@ -0,0 +1,61 @@ +{ + "developer": "Epiculous", + "models": [ + { + "id": "Epiculous/Azure_Dusk-v0.2", + "name": "Azure_Dusk-v0.2", + "developer": "Epiculous", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3467, + "hfopenllm_v2/BBH": 0.412, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3835, + "hfopenllm_v2/MMLU-PRO": 0.3034 + } + }, + { + "id": "Epiculous/Crimson_Dawn-v0.2", + "name": "Crimson_Dawn-v0.2", + "developer": "Epiculous", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3103, + "hfopenllm_v2/BBH": 0.4482, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.4152, + "hfopenllm_v2/MMLU-PRO": 0.2721 + } + }, + { + "id": "Epiculous/NovaSpark", + "name": "NovaSpark", + "developer": "Epiculous", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6408, + "hfopenllm_v2/BBH": 0.5064, + "hfopenllm_v2/MATH Level 5": 0.1518, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.3882, + "hfopenllm_v2/MMLU-PRO": 0.3649 + } + }, + { + "id": "Epiculous/Violet_Twilight-v0.2", + "name": "Violet_Twilight-v0.2", + "developer": "Epiculous", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4532, + "hfopenllm_v2/BBH": 0.4615, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.4299, + "hfopenllm_v2/MMLU-PRO": 0.3111 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/EpistemeAI.json b/data/developers/EpistemeAI.json new file mode 100644 index 0000000000000000000000000000000000000000..3ee1bd0082a65f7ca5933f1f33b7d33e9aa5bf42 --- /dev/null +++ b/data/developers/EpistemeAI.json @@ -0,0 +1,663 @@ +{ + "developer": "EpistemeAI", + "models": [ + { + "id": "EpistemeAI/Alpaca-Llama3.1-8B", + "name": "Alpaca-Llama3.1-8B", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1599, + "hfopenllm_v2/BBH": 0.4755, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3403, + "hfopenllm_v2/MMLU-PRO": 0.3246 + } + }, + { + "id": "EpistemeAI/Athena-gemma-2-2b-it", + "name": "Athena-gemma-2-2b-it", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3134, + "hfopenllm_v2/BBH": 0.4264, + "hfopenllm_v2/MATH Level 5": 0.0491, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.4351, + "hfopenllm_v2/MMLU-PRO": 0.2422 + } + }, + { + "id": "EpistemeAI/Athena-gemma-2-2b-it-Philos", + "name": "Athena-gemma-2-2b-it-Philos", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4621, + "hfopenllm_v2/BBH": 0.3795, + "hfopenllm_v2/MATH Level 5": 0.037, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4314, + "hfopenllm_v2/MMLU-PRO": 0.2248 + } + }, + { + "id": "EpistemeAI/Athene-codegemma-2-7b-it-alpaca-v1.3", + "name": "Athene-codegemma-2-7b-it-alpaca-v1.3", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.403, + "hfopenllm_v2/BBH": 0.4332, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.4503, + "hfopenllm_v2/MMLU-PRO": 0.2587 + } + }, + { + "id": "EpistemeAI/DeepPhi-3.5-mini-instruct", + "name": "DeepPhi-3.5-mini-instruct", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1326, + "hfopenllm_v2/BBH": 0.2882, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2332, + "hfopenllm_v2/MUSR": 0.3656, + "hfopenllm_v2/MMLU-PRO": 0.1103 + } + }, + { + "id": "EpistemeAI/DeepThinkers-Phi4", + "name": "DeepThinkers-Phi4", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.694, + "hfopenllm_v2/BBH": 0.679, + "hfopenllm_v2/MATH Level 5": 0.4585, + "hfopenllm_v2/GPQA": 0.3406, + "hfopenllm_v2/MUSR": 0.3981, + "hfopenllm_v2/MMLU-PRO": 0.5258 + } + }, + { + "id": "EpistemeAI/FineLlama3.1-8B-Instruct", + "name": "FineLlama3.1-8B-Instruct", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.08, + "hfopenllm_v2/BBH": 0.4557, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3482, + "hfopenllm_v2/MMLU-PRO": 0.3113 + } + }, + { + "id": "EpistemeAI/Fireball-12B", + "name": "Fireball-12B", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1834, + "hfopenllm_v2/BBH": 0.5111, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.4236, + "hfopenllm_v2/MMLU-PRO": 0.3344 + } + }, + { + "id": "EpistemeAI/Fireball-12B-v1.13a-philosophers", + "name": "Fireball-12B-v1.13a-philosophers", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0876, + "hfopenllm_v2/BBH": 0.5103, + "hfopenllm_v2/MATH Level 5": 0.0461, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4081, + "hfopenllm_v2/MMLU-PRO": 0.3367 + } + }, + { + "id": "EpistemeAI/Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200", + "name": "Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4577, + "hfopenllm_v2/BBH": 0.4838, + "hfopenllm_v2/MATH Level 5": 0.1231, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.3945, + "hfopenllm_v2/MMLU-PRO": 0.3583 + } + }, + { + "id": "EpistemeAI/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta", + "name": "Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7274, + "hfopenllm_v2/BBH": 0.4865, + "hfopenllm_v2/MATH Level 5": 0.1526, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3619, + "hfopenllm_v2/MMLU-PRO": 0.3543 + } + }, + { + "id": "EpistemeAI/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2", + "name": "Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4673, + "hfopenllm_v2/BBH": 0.4932, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4624, + "hfopenllm_v2/MMLU-PRO": 0.3352 + } + }, + { + "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4432, + "hfopenllm_v2/BBH": 0.4824, + "hfopenllm_v2/MATH Level 5": 0.1329, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4066, + "hfopenllm_v2/MMLU-PRO": 0.3516 + } + }, + { + "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4457, + "hfopenllm_v2/BBH": 0.4897, + "hfopenllm_v2/MATH Level 5": 0.1208, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3762, + "hfopenllm_v2/MMLU-PRO": 0.3543 + } + }, + { + "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5975, + "hfopenllm_v2/BBH": 0.4904, + "hfopenllm_v2/MATH Level 5": 0.1337, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.401, + "hfopenllm_v2/MMLU-PRO": 0.3423 + } + }, + { + "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6691, + "hfopenllm_v2/BBH": 0.4668, + "hfopenllm_v2/MATH Level 5": 0.1337, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3418, + "hfopenllm_v2/MMLU-PRO": 0.3389 + } + }, + { + "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7305, + "hfopenllm_v2/BBH": 0.4649, + "hfopenllm_v2/MATH Level 5": 0.1397, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3209, + "hfopenllm_v2/MMLU-PRO": 0.348 + } + }, + { + "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4578, + "hfopenllm_v2/BBH": 0.4761, + "hfopenllm_v2/MATH Level 5": 0.1382, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.3881, + "hfopenllm_v2/MMLU-PRO": 0.3471 + } + }, + { + "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7205, + "hfopenllm_v2/BBH": 0.4818, + "hfopenllm_v2/MATH Level 5": 0.1435, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.33, + "hfopenllm_v2/MMLU-PRO": 0.3548 + } + }, + { + "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Math", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Math", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4623, + "hfopenllm_v2/BBH": 0.4983, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.3641, + "hfopenllm_v2/MMLU-PRO": 0.3331 + } + }, + { + "id": "EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO", + "name": "Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4611, + "hfopenllm_v2/BBH": 0.4801, + "hfopenllm_v2/MATH Level 5": 0.1254, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.3998, + "hfopenllm_v2/MMLU-PRO": 0.3521 + } + }, + { + "id": "EpistemeAI/Fireball-Mistral-Nemo-Base-2407-v1-DPO2", + "name": "Fireball-Mistral-Nemo-Base-2407-v1-DPO2", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1861, + "hfopenllm_v2/BBH": 0.4968, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.404, + "hfopenllm_v2/MMLU-PRO": 0.3353 + } + }, + { + "id": "EpistemeAI/Fireball-R1-Llama-3.1-8B", + "name": "Fireball-R1-Llama-3.1-8B", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4427, + "hfopenllm_v2/BBH": 0.3643, + "hfopenllm_v2/MATH Level 5": 0.3112, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.3288, + "hfopenllm_v2/MMLU-PRO": 0.1115 + } + }, + { + "id": "EpistemeAI/Fireball-R1-Llama-3.1-8B-Medical-COT", + "name": "Fireball-R1-Llama-3.1-8B-Medical-COT", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3216, + "hfopenllm_v2/BBH": 0.3716, + "hfopenllm_v2/MATH Level 5": 0.327, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3114, + "hfopenllm_v2/MMLU-PRO": 0.1402 + } + }, + { + "id": "EpistemeAI/Fireball-R1.1-Llama-3.1-8B", + "name": "Fireball-R1.1-Llama-3.1-8B", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3676, + "hfopenllm_v2/BBH": 0.3326, + "hfopenllm_v2/MATH Level 5": 0.1382, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3419, + "hfopenllm_v2/MMLU-PRO": 0.1115 + } + }, + { + "id": "EpistemeAI/Llama-3.2-3B-Agent007-Coder", + "name": "Llama-3.2-3B-Agent007-Coder", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.54, + "hfopenllm_v2/BBH": 0.4304, + "hfopenllm_v2/MATH Level 5": 0.111, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3668, + "hfopenllm_v2/MMLU-PRO": 0.2852 + } + }, + { + "id": "EpistemeAI/Mistral-Nemo-Instruct-12B-Philosophy-Math", + "name": "Mistral-Nemo-Instruct-12B-Philosophy-Math", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0695, + "hfopenllm_v2/BBH": 0.5365, + "hfopenllm_v2/MATH Level 5": 0.0959, + "hfopenllm_v2/GPQA": 0.3314, + "hfopenllm_v2/MUSR": 0.4292, + "hfopenllm_v2/MMLU-PRO": 0.3296 + } + }, + { + "id": "EpistemeAI/OpenReasoner-Llama-3.2-3B-rs1.0", + "name": "OpenReasoner-Llama-3.2-3B-rs1.0", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7274, + "hfopenllm_v2/BBH": 0.4519, + "hfopenllm_v2/MATH Level 5": 0.1344, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3461, + "hfopenllm_v2/MMLU-PRO": 0.3134 + } + }, + { + "id": "EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy", + "name": "Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7101, + "hfopenllm_v2/BBH": 0.4628, + "hfopenllm_v2/MATH Level 5": 0.1397, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.3195, + "hfopenllm_v2/MMLU-PRO": 0.3311 + } + }, + { + "id": "EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic", + "name": "Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7122, + "hfopenllm_v2/BBH": 0.4566, + "hfopenllm_v2/MATH Level 5": 0.1246, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.3235, + "hfopenllm_v2/MMLU-PRO": 0.335 + } + }, + { + "id": "EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent", + "name": "Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6915, + "hfopenllm_v2/BBH": 0.4525, + "hfopenllm_v2/MATH Level 5": 0.1292, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3578, + "hfopenllm_v2/MMLU-PRO": 0.329 + } + }, + { + "id": "EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT", + "name": "Reasoning-Llama-3.1-CoT-RE1-NMT", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4829, + "hfopenllm_v2/BBH": 0.4736, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3182, + "hfopenllm_v2/MMLU-PRO": 0.3343 + } + }, + { + "id": "EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO", + "name": "Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4553, + "hfopenllm_v2/BBH": 0.4804, + "hfopenllm_v2/MATH Level 5": 0.1292, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.3931, + "hfopenllm_v2/MMLU-PRO": 0.3598 + } + }, + { + "id": "EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.2", + "name": "Reasoning-Llama-3.2-1B-Instruct-v1.2", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4087, + "hfopenllm_v2/BBH": 0.3324, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3222, + "hfopenllm_v2/MMLU-PRO": 0.1179 + } + }, + { + "id": "EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.3", + "name": "Reasoning-Llama-3.2-1B-Instruct-v1.3", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3273, + "hfopenllm_v2/BBH": 0.3263, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.326, + "hfopenllm_v2/MMLU-PRO": 0.1173 + } + }, + { + "id": "EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1", + "name": "Reasoning-Llama-3.2-3B-Math-Instruct-RE1", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.512, + "hfopenllm_v2/BBH": 0.4381, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3435, + "hfopenllm_v2/MMLU-PRO": 0.2789 + } + }, + { + "id": "EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO", + "name": "Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.729, + "hfopenllm_v2/BBH": 0.4518, + "hfopenllm_v2/MATH Level 5": 0.1533, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3487, + "hfopenllm_v2/MMLU-PRO": 0.31 + } + }, + { + "id": "EpistemeAI/ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math", + "name": "ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5903, + "hfopenllm_v2/BBH": 0.4364, + "hfopenllm_v2/MATH Level 5": 0.148, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3314, + "hfopenllm_v2/MMLU-PRO": 0.2823 + } + }, + { + "id": "EpistemeAI/ReasoningCore-3B-0", + "name": "ReasoningCore-3B-0", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7341, + "hfopenllm_v2/BBH": 0.4446, + "hfopenllm_v2/MATH Level 5": 0.1586, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3554, + "hfopenllm_v2/MMLU-PRO": 0.3172 + } + }, + { + "id": "EpistemeAI/ReasoningCore-3B-Instruct-r01-Reflect", + "name": "ReasoningCore-3B-Instruct-r01-Reflect", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7335, + "hfopenllm_v2/BBH": 0.445, + "hfopenllm_v2/MATH Level 5": 0.1541, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3527, + "hfopenllm_v2/MMLU-PRO": 0.3144 + } + }, + { + "id": "EpistemeAI/ReasoningCore-3B-R01", + "name": "ReasoningCore-3B-R01", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2976, + "hfopenllm_v2/BBH": 0.4373, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3195, + "hfopenllm_v2/MMLU-PRO": 0.2591 + } + }, + { + "id": "EpistemeAI/ReasoningCore-3B-RE1-V2", + "name": "ReasoningCore-3B-RE1-V2", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7393, + "hfopenllm_v2/BBH": 0.4462, + "hfopenllm_v2/MATH Level 5": 0.1563, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3541, + "hfopenllm_v2/MMLU-PRO": 0.3181 + } + }, + { + "id": "EpistemeAI/ReasoningCore-3B-RE1-V2A", + "name": "ReasoningCore-3B-RE1-V2A", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5733, + "hfopenllm_v2/BBH": 0.419, + "hfopenllm_v2/MATH Level 5": 0.0929, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3352, + "hfopenllm_v2/MMLU-PRO": 0.2736 + } + }, + { + "id": "EpistemeAI/ReasoningCore-3B-RE1-V2B", + "name": "ReasoningCore-3B-RE1-V2B", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5051, + "hfopenllm_v2/BBH": 0.4168, + "hfopenllm_v2/MATH Level 5": 0.1073, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3448, + "hfopenllm_v2/MMLU-PRO": 0.2673 + } + }, + { + "id": "EpistemeAI/ReasoningCore-3B-RE1-V2C", + "name": "ReasoningCore-3B-RE1-V2C", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5057, + "hfopenllm_v2/BBH": 0.4177, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3422, + "hfopenllm_v2/MMLU-PRO": 0.2691 + } + }, + { + "id": "EpistemeAI/ReasoningCore-3B-T1-V1", + "name": "ReasoningCore-3B-T1-V1", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7208, + "hfopenllm_v2/BBH": 0.4517, + "hfopenllm_v2/MATH Level 5": 0.1458, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.354, + "hfopenllm_v2/MMLU-PRO": 0.312 + } + }, + { + "id": "EpistemeAI/ReasoningCore-3B-T1_1", + "name": "ReasoningCore-3B-T1_1", + "developer": "EpistemeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7275, + "hfopenllm_v2/BBH": 0.4524, + "hfopenllm_v2/MATH Level 5": 0.1541, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3554, + "hfopenllm_v2/MMLU-PRO": 0.3117 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/EpistemeAI2.json b/data/developers/EpistemeAI2.json new file mode 100644 index 0000000000000000000000000000000000000000..da4444af114bf575cc00088ac628146148412404 --- /dev/null +++ b/data/developers/EpistemeAI2.json @@ -0,0 +1,215 @@ +{ + "developer": "EpistemeAI2", + "models": [ + { + "id": "EpistemeAI2/Athene-codegemma-2-7b-it-alpaca-v1.2", + "name": "Athene-codegemma-2-7b-it-alpaca-v1.2", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4351, + "hfopenllm_v2/BBH": 0.4175, + "hfopenllm_v2/MATH Level 5": 0.0423, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.417, + "hfopenllm_v2/MMLU-PRO": 0.2297 + } + }, + { + "id": "EpistemeAI2/Fireball-12B-v1.2", + "name": "Fireball-12B-v1.2", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1355, + "hfopenllm_v2/BBH": 0.5019, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4173, + "hfopenllm_v2/MMLU-PRO": 0.3337 + } + }, + { + "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1-8B-Philos", + "name": "Fireball-Alpaca-Llama3.1-8B-Philos", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4986, + "hfopenllm_v2/BBH": 0.4978, + "hfopenllm_v2/MATH Level 5": 0.1186, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4277, + "hfopenllm_v2/MMLU-PRO": 0.3406 + } + }, + { + "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.01-8B-Philos", + "name": "Fireball-Alpaca-Llama3.1.01-8B-Philos", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4212, + "hfopenllm_v2/BBH": 0.4956, + "hfopenllm_v2/MATH Level 5": 0.136, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4371, + "hfopenllm_v2/MMLU-PRO": 0.3383 + } + }, + { + "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.03-8B-Philos", + "name": "Fireball-Alpaca-Llama3.1.03-8B-Philos", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3881, + "hfopenllm_v2/BBH": 0.4951, + "hfopenllm_v2/MATH Level 5": 0.1284, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.428, + "hfopenllm_v2/MMLU-PRO": 0.3355 + } + }, + { + "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.04-8B-Philos", + "name": "Fireball-Alpaca-Llama3.1.04-8B-Philos", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4084, + "hfopenllm_v2/BBH": 0.493, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4372, + "hfopenllm_v2/MMLU-PRO": 0.3403 + } + }, + { + "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo", + "name": "Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4866, + "hfopenllm_v2/BBH": 0.4881, + "hfopenllm_v2/MATH Level 5": 0.1307, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.3932, + "hfopenllm_v2/MMLU-PRO": 0.3615 + } + }, + { + "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math", + "name": "Fireball-Alpaca-Llama3.1.07-8B-Philos-Math", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5079, + "hfopenllm_v2/BBH": 0.4847, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4063, + "hfopenllm_v2/MMLU-PRO": 0.3531 + } + }, + { + "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection", + "name": "Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3952, + "hfopenllm_v2/BBH": 0.4955, + "hfopenllm_v2/MATH Level 5": 0.1246, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4048, + "hfopenllm_v2/MMLU-PRO": 0.3593 + } + }, + { + "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1", + "name": "Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5316, + "hfopenllm_v2/BBH": 0.4828, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4103, + "hfopenllm_v2/MMLU-PRO": 0.3523 + } + }, + { + "id": "EpistemeAI2/Fireball-Llama-3.1-8B-Philos-Reflection", + "name": "Fireball-Llama-3.1-8B-Philos-Reflection", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3596, + "hfopenllm_v2/BBH": 0.4898, + "hfopenllm_v2/MATH Level 5": 0.1284, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.3957, + "hfopenllm_v2/MMLU-PRO": 0.3551 + } + }, + { + "id": "EpistemeAI2/Fireball-MathMistral-Nemo-Base-2407-v2dpo", + "name": "Fireball-MathMistral-Nemo-Base-2407-v2dpo", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3097, + "hfopenllm_v2/BBH": 0.4328, + "hfopenllm_v2/MATH Level 5": 0.037, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.403, + "hfopenllm_v2/MMLU-PRO": 0.1148 + } + }, + { + "id": "EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5515, + "hfopenllm_v2/BBH": 0.4808, + "hfopenllm_v2/MATH Level 5": 0.1352, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.3693, + "hfopenllm_v2/MMLU-PRO": 0.342 + } + }, + { + "id": "EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT", + "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4633, + "hfopenllm_v2/BBH": 0.4791, + "hfopenllm_v2/MATH Level 5": 0.1171, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.3774, + "hfopenllm_v2/MMLU-PRO": 0.3565 + } + }, + { + "id": "EpistemeAI2/Fireball-Phi-3-medium-4k-inst-Philos", + "name": "Fireball-Phi-3-medium-4k-inst-Philos", + "developer": "EpistemeAI2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5313, + "hfopenllm_v2/BBH": 0.6178, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.4139, + "hfopenllm_v2/MMLU-PRO": 0.4599 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Eric111.json b/data/developers/Eric111.json new file mode 100644 index 0000000000000000000000000000000000000000..1258d247c8aad971653dcdf78b4107a103cd0ece --- /dev/null +++ b/data/developers/Eric111.json @@ -0,0 +1,33 @@ +{ + "developer": "Eric111", + "models": [ + { + "id": "Eric111/CatunaMayo", + "name": "CatunaMayo", + "developer": "Eric111", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4074, + "hfopenllm_v2/BBH": 0.5244, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.454, + "hfopenllm_v2/MMLU-PRO": 0.3178 + } + }, + { + "id": "Eric111/CatunaMayo-DPO", + "name": "CatunaMayo-DPO", + "developer": "Eric111", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4215, + "hfopenllm_v2/BBH": 0.5224, + "hfopenllm_v2/MATH Level 5": 0.0816, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.445, + "hfopenllm_v2/MMLU-PRO": 0.317 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Etherll.json b/data/developers/Etherll.json new file mode 100644 index 0000000000000000000000000000000000000000..2be2455f8558b72fcbd342c72cc671c443f3155e --- /dev/null +++ b/data/developers/Etherll.json @@ -0,0 +1,117 @@ +{ + "developer": "Etherll", + "models": [ + { + "id": "Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties", + "name": "Chocolatine-3B-Instruct-DPO-Revised-Ties", + "developer": "Etherll", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3725, + "hfopenllm_v2/BBH": 0.5411, + "hfopenllm_v2/MATH Level 5": 0.1631, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4649, + "hfopenllm_v2/MMLU-PRO": 0.3978 + } + }, + { + "id": "Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties-v2", + "name": "Chocolatine-3B-Instruct-DPO-Revised-Ties-v2", + "developer": "Etherll", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.374, + "hfopenllm_v2/BBH": 0.5411, + "hfopenllm_v2/MATH Level 5": 0.1631, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4649, + "hfopenllm_v2/MMLU-PRO": 0.3978 + } + }, + { + "id": "Etherll/Herplete-LLM-Llama-3.1-8b", + "name": "Herplete-LLM-Llama-3.1-8b", + "developer": "Etherll", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4672, + "hfopenllm_v2/BBH": 0.5013, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.386, + "hfopenllm_v2/MMLU-PRO": 0.3482 + } + }, + { + "id": "Etherll/Herplete-LLM-Llama-3.1-8b-Ties", + "name": "Herplete-LLM-Llama-3.1-8b-Ties", + "developer": "Etherll", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6164, + "hfopenllm_v2/BBH": 0.5338, + "hfopenllm_v2/MATH Level 5": 0.1601, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4017, + "hfopenllm_v2/MMLU-PRO": 0.3752 + } + }, + { + "id": "Etherll/Qwen2.5-7B-della-test", + "name": "Qwen2.5-7B-della-test", + "developer": "Etherll", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7625, + "hfopenllm_v2/BBH": 0.5447, + "hfopenllm_v2/MATH Level 5": 0.4894, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4047, + "hfopenllm_v2/MMLU-PRO": 0.4361 + } + }, + { + "id": "Etherll/Qwen2.5-Coder-7B-Instruct-Ties", + "name": "Qwen2.5-Coder-7B-Instruct-Ties", + "developer": "Etherll", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5005, + "hfopenllm_v2/BBH": 0.4895, + "hfopenllm_v2/MATH Level 5": 0.2915, + "hfopenllm_v2/GPQA": 0.3297, + "hfopenllm_v2/MUSR": 0.4373, + "hfopenllm_v2/MMLU-PRO": 0.3503 + } + }, + { + "id": "Etherll/Replete-LLM-V3-Llama-3.1-8b", + "name": "Replete-LLM-V3-Llama-3.1-8b", + "developer": "Etherll", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5263, + "hfopenllm_v2/BBH": 0.4543, + "hfopenllm_v2/MATH Level 5": 0.2273, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3516, + "hfopenllm_v2/MMLU-PRO": 0.347 + } + }, + { + "id": "Etherll/SuperHermes", + "name": "SuperHermes", + "developer": "Etherll", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5459, + "hfopenllm_v2/BBH": 0.529, + "hfopenllm_v2/MATH Level 5": 0.1654, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.44, + "hfopenllm_v2/MMLU-PRO": 0.3949 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Eurdem.json b/data/developers/Eurdem.json new file mode 100644 index 0000000000000000000000000000000000000000..09f01cf2fb5e4403a5b1d238151f0376eeb7f57f --- /dev/null +++ b/data/developers/Eurdem.json @@ -0,0 +1,19 @@ +{ + "developer": "Eurdem", + "models": [ + { + "id": "Eurdem/Defne-llama3.1-8B", + "name": "Defne-llama3.1-8B", + "developer": "Eurdem", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5036, + "hfopenllm_v2/BBH": 0.5321, + "hfopenllm_v2/MATH Level 5": 0.1601, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4331, + "hfopenllm_v2/MMLU-PRO": 0.3866 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/FINGU-AI.json b/data/developers/FINGU-AI.json new file mode 100644 index 0000000000000000000000000000000000000000..a9cb10b2300b37999737dcd9dee254ff2a6ae446 --- /dev/null +++ b/data/developers/FINGU-AI.json @@ -0,0 +1,103 @@ +{ + "developer": "FINGU-AI", + "models": [ + { + "id": "FINGU-AI/Chocolatine-Fusion-14B", + "name": "Chocolatine-Fusion-14B", + "developer": "FINGU-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6949, + "hfopenllm_v2/BBH": 0.6413, + "hfopenllm_v2/MATH Level 5": 0.3852, + "hfopenllm_v2/GPQA": 0.3716, + "hfopenllm_v2/MUSR": 0.494, + "hfopenllm_v2/MMLU-PRO": 0.5262 + } + }, + { + "id": "FINGU-AI/L3-8B", + "name": "L3-8B", + "developer": "FINGU-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7517, + "hfopenllm_v2/BBH": 0.4986, + "hfopenllm_v2/MATH Level 5": 0.2545, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.3828, + "hfopenllm_v2/MMLU-PRO": 0.3639 + } + }, + { + "id": "FINGU-AI/Phi-4-RRStock", + "name": "Phi-4-RRStock", + "developer": "FINGU-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2855, + "hfopenllm_v2/BBH": 0.6443, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.38, + "hfopenllm_v2/MUSR": 0.4479, + "hfopenllm_v2/MMLU-PRO": 0.4883 + } + }, + { + "id": "FINGU-AI/Q-Small-3B", + "name": "Q-Small-3B", + "developer": "FINGU-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4145, + "hfopenllm_v2/BBH": 0.4319, + "hfopenllm_v2/MATH Level 5": 0.0831, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.4005, + "hfopenllm_v2/MMLU-PRO": 0.279 + } + }, + { + "id": "FINGU-AI/QwQ-Buddy-32B-Alpha", + "name": "QwQ-Buddy-32B-Alpha", + "developer": "FINGU-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3446, + "hfopenllm_v2/BBH": 0.6424, + "hfopenllm_v2/MATH Level 5": 0.3852, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.506, + "hfopenllm_v2/MMLU-PRO": 0.5294 + } + }, + { + "id": "FINGU-AI/RomboUltima-32B", + "name": "RomboUltima-32B", + "developer": "FINGU-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6672, + "hfopenllm_v2/BBH": 0.6938, + "hfopenllm_v2/MATH Level 5": 0.5385, + "hfopenllm_v2/GPQA": 0.3716, + "hfopenllm_v2/MUSR": 0.4836, + "hfopenllm_v2/MMLU-PRO": 0.5789 + } + }, + { + "id": "FINGU-AI/Ultimos-32B", + "name": "Ultimos-32B", + "developer": "FINGU-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1592, + "hfopenllm_v2/BBH": 0.2906, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.3286, + "hfopenllm_v2/MMLU-PRO": 0.1111 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/FallenMerick.json b/data/developers/FallenMerick.json new file mode 100644 index 0000000000000000000000000000000000000000..6b820742ad66046952037b8b731298080b8025de --- /dev/null +++ b/data/developers/FallenMerick.json @@ -0,0 +1,19 @@ +{ + "developer": "FallenMerick", + "models": [ + { + "id": "FallenMerick/Chewy-Lemon-Cookie-11B", + "name": "Chewy-Lemon-Cookie-11B", + "developer": "FallenMerick", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4875, + "hfopenllm_v2/BBH": 0.5251, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.4546, + "hfopenllm_v2/MMLU-PRO": 0.3267 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Felladrin.json b/data/developers/Felladrin.json new file mode 100644 index 0000000000000000000000000000000000000000..0ed0c980df3982a097fdb38c763aa3c946af3e34 --- /dev/null +++ b/data/developers/Felladrin.json @@ -0,0 +1,33 @@ +{ + "developer": "Felladrin", + "models": [ + { + "id": "Felladrin/Llama-160M-Chat-v1", + "name": "Llama-160M-Chat-v1", + "developer": "Felladrin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1575, + "hfopenllm_v2/BBH": 0.3036, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3661, + "hfopenllm_v2/MMLU-PRO": 0.1136 + } + }, + { + "id": "Felladrin/Minueza-32M-UltraChat", + "name": "Minueza-32M-UltraChat", + "developer": "Felladrin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1376, + "hfopenllm_v2/BBH": 0.2941, + "hfopenllm_v2/MATH Level 5": 0.0045, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3742, + "hfopenllm_v2/MMLU-PRO": 0.1133 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/FlofloB.json b/data/developers/FlofloB.json new file mode 100644 index 0000000000000000000000000000000000000000..ddf73ba12b92d64a299d70def26521c7e9deb31f --- /dev/null +++ b/data/developers/FlofloB.json @@ -0,0 +1,383 @@ +{ + "developer": "FlofloB", + "models": [ + { + "id": "FlofloB/100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "name": "100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3083, + "hfopenllm_v2/BBH": 0.3323, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3302, + "hfopenllm_v2/MMLU-PRO": 0.1498 + } + }, + { + "id": "FlofloB/10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", + "name": "10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5097, + "hfopenllm_v2/BBH": 0.5215, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.431, + "hfopenllm_v2/MMLU-PRO": 0.3769 + } + }, + { + "id": "FlofloB/10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "name": "10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2815, + "hfopenllm_v2/BBH": 0.3306, + "hfopenllm_v2/MATH Level 5": 0.031, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3302, + "hfopenllm_v2/MMLU-PRO": 0.1541 + } + }, + { + "id": "FlofloB/40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "name": "40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3016, + "hfopenllm_v2/BBH": 0.3325, + "hfopenllm_v2/MATH Level 5": 0.0332, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3408, + "hfopenllm_v2/MMLU-PRO": 0.1485 + } + }, + { + "id": "FlofloB/83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "name": "83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2869, + "hfopenllm_v2/BBH": 0.3347, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3289, + "hfopenllm_v2/MMLU-PRO": 0.1555 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_1000k_fineweb", + "name": "smollm2-135M_pretrained_1000k_fineweb", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1485, + "hfopenllm_v2/BBH": 0.2918, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3581, + "hfopenllm_v2/MMLU-PRO": 0.1164 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed", + "name": "smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1554, + "hfopenllm_v2/BBH": 0.3066, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.358, + "hfopenllm_v2/MMLU-PRO": 0.1143 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_selected", + "name": "smollm2-135M_pretrained_1000k_fineweb_uncovai_selected", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1468, + "hfopenllm_v2/BBH": 0.2932, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.4048, + "hfopenllm_v2/MMLU-PRO": 0.1157 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_1200k_fineweb", + "name": "smollm2-135M_pretrained_1200k_fineweb", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1581, + "hfopenllm_v2/BBH": 0.2941, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3714, + "hfopenllm_v2/MMLU-PRO": 0.1076 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed", + "name": "smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1578, + "hfopenllm_v2/BBH": 0.295, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.37, + "hfopenllm_v2/MMLU-PRO": 0.1139 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_selected", + "name": "smollm2-135M_pretrained_1200k_fineweb_uncovai_selected", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1585, + "hfopenllm_v2/BBH": 0.296, + "hfopenllm_v2/MATH Level 5": 0.0076, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3567, + "hfopenllm_v2/MMLU-PRO": 0.1164 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_1400k_fineweb", + "name": "smollm2-135M_pretrained_1400k_fineweb", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1764, + "hfopenllm_v2/BBH": 0.2922, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3873, + "hfopenllm_v2/MMLU-PRO": 0.108 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed", + "name": "smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1707, + "hfopenllm_v2/BBH": 0.2992, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3939, + "hfopenllm_v2/MMLU-PRO": 0.1105 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_selected", + "name": "smollm2-135M_pretrained_1400k_fineweb_uncovai_selected", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1538, + "hfopenllm_v2/BBH": 0.2917, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3741, + "hfopenllm_v2/MMLU-PRO": 0.1137 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed", + "name": "smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1475, + "hfopenllm_v2/BBH": 0.3029, + "hfopenllm_v2/MATH Level 5": 0.0038, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3578, + "hfopenllm_v2/MMLU-PRO": 0.112 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_selected", + "name": "smollm2-135M_pretrained_200k_fineweb_uncovai_selected", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1345, + "hfopenllm_v2/BBH": 0.2927, + "hfopenllm_v2/MATH Level 5": 0.0076, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.366, + "hfopenllm_v2/MMLU-PRO": 0.1131 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_400k_fineweb", + "name": "smollm2-135M_pretrained_400k_fineweb", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1511, + "hfopenllm_v2/BBH": 0.2972, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3794, + "hfopenllm_v2/MMLU-PRO": 0.1163 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed", + "name": "smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1556, + "hfopenllm_v2/BBH": 0.3049, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.255, + "hfopenllm_v2/MUSR": 0.386, + "hfopenllm_v2/MMLU-PRO": 0.1138 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_selected", + "name": "smollm2-135M_pretrained_400k_fineweb_uncovai_selected", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1584, + "hfopenllm_v2/BBH": 0.2925, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.382, + "hfopenllm_v2/MMLU-PRO": 0.1158 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_600k_fineweb", + "name": "smollm2-135M_pretrained_600k_fineweb", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1639, + "hfopenllm_v2/BBH": 0.3014, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3809, + "hfopenllm_v2/MMLU-PRO": 0.1126 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed", + "name": "smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1641, + "hfopenllm_v2/BBH": 0.3, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3793, + "hfopenllm_v2/MMLU-PRO": 0.1147 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_selected", + "name": "smollm2-135M_pretrained_600k_fineweb_uncovai_selected", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1606, + "hfopenllm_v2/BBH": 0.2983, + "hfopenllm_v2/MATH Level 5": 0.0076, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3846, + "hfopenllm_v2/MMLU-PRO": 0.1162 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_800k_fineweb", + "name": "smollm2-135M_pretrained_800k_fineweb", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1641, + "hfopenllm_v2/BBH": 0.2959, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.3701, + "hfopenllm_v2/MMLU-PRO": 0.1152 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed", + "name": "smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1623, + "hfopenllm_v2/BBH": 0.3038, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3993, + "hfopenllm_v2/MMLU-PRO": 0.1138 + } + }, + { + "id": "FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_selected", + "name": "smollm2-135M_pretrained_800k_fineweb_uncovai_selected", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1474, + "hfopenllm_v2/BBH": 0.2943, + "hfopenllm_v2/MATH Level 5": 0.0045, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3766, + "hfopenllm_v2/MMLU-PRO": 0.113 + } + }, + { + "id": "FlofloB/smollm2_pretrained_200k_fineweb", + "name": "smollm2_pretrained_200k_fineweb", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1527, + "hfopenllm_v2/BBH": 0.2995, + "hfopenllm_v2/MATH Level 5": 0.0038, + "hfopenllm_v2/GPQA": 0.2475, + "hfopenllm_v2/MUSR": 0.3699, + "hfopenllm_v2/MMLU-PRO": 0.1159 + } + }, + { + "id": "FlofloB/test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", + "name": "test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", + "developer": "FlofloB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5215, + "hfopenllm_v2/BBH": 0.5241, + "hfopenllm_v2/MATH Level 5": 0.1103, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4244, + "hfopenllm_v2/MMLU-PRO": 0.3721 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/FuJhen.json b/data/developers/FuJhen.json new file mode 100644 index 0000000000000000000000000000000000000000..18df6b5a615decb777fa7c14bf593368991d43d7 --- /dev/null +++ b/data/developers/FuJhen.json @@ -0,0 +1,61 @@ +{ + "developer": "FuJhen", + "models": [ + { + "id": "FuJhen/ft-openhermes-25-mistral-7b-irca-dpo-pairs", + "name": "ft-openhermes-25-mistral-7b-irca-dpo-pairs", + "developer": "FuJhen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.542, + "hfopenllm_v2/BBH": 0.4773, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.4174, + "hfopenllm_v2/MMLU-PRO": 0.2956 + } + }, + { + "id": "FuJhen/mistral-instruct-7B-DPO", + "name": "mistral-instruct-7B-DPO", + "developer": "FuJhen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4968, + "hfopenllm_v2/BBH": 0.4624, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4016, + "hfopenllm_v2/MMLU-PRO": 0.3034 + } + }, + { + "id": "FuJhen/mistral_7b_v0.1_structedData_e2e", + "name": "mistral_7b_v0.1_structedData_e2e", + "developer": "FuJhen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1727, + "hfopenllm_v2/BBH": 0.4114, + "hfopenllm_v2/MATH Level 5": 0.0045, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3723, + "hfopenllm_v2/MMLU-PRO": 0.2811 + } + }, + { + "id": "FuJhen/mistral_7b_v0.1_structedData_viggo", + "name": "mistral_7b_v0.1_structedData_viggo", + "developer": "FuJhen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1783, + "hfopenllm_v2/BBH": 0.4524, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3738, + "hfopenllm_v2/MMLU-PRO": 0.2942 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/FuseAI.json b/data/developers/FuseAI.json new file mode 100644 index 0000000000000000000000000000000000000000..13a53260c5d7b9d584f47f65b16b17de97cfb59e --- /dev/null +++ b/data/developers/FuseAI.json @@ -0,0 +1,61 @@ +{ + "developer": "FuseAI", + "models": [ + { + "id": "FuseAI/FuseChat-7B-v2.0", + "name": "FuseChat-7B-v2.0", + "developer": "FuseAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3423, + "hfopenllm_v2/BBH": 0.4954, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4797, + "hfopenllm_v2/MMLU-PRO": 0.3162 + } + }, + { + "id": "FuseAI/FuseChat-Llama-3.1-8B-Instruct", + "name": "FuseChat-Llama-3.1-8B-Instruct", + "developer": "FuseAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7205, + "hfopenllm_v2/BBH": 0.512, + "hfopenllm_v2/MATH Level 5": 0.2477, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.382, + "hfopenllm_v2/MMLU-PRO": 0.3733 + } + }, + { + "id": "FuseAI/FuseChat-Llama-3.2-3B-Instruct", + "name": "FuseChat-Llama-3.2-3B-Instruct", + "developer": "FuseAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6849, + "hfopenllm_v2/BBH": 0.4658, + "hfopenllm_v2/MATH Level 5": 0.2424, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.3914, + "hfopenllm_v2/MMLU-PRO": 0.3132 + } + }, + { + "id": "FuseAI/FuseChat-Qwen-2.5-7B-Instruct", + "name": "FuseChat-Qwen-2.5-7B-Instruct", + "developer": "FuseAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5906, + "hfopenllm_v2/BBH": 0.5526, + "hfopenllm_v2/MATH Level 5": 0.4562, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.3874, + "hfopenllm_v2/MMLU-PRO": 0.4118 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/GalrionSoftworks.json b/data/developers/GalrionSoftworks.json new file mode 100644 index 0000000000000000000000000000000000000000..8a639dd28ce440e3182ce6365f0822b0e4debac5 --- /dev/null +++ b/data/developers/GalrionSoftworks.json @@ -0,0 +1,33 @@ +{ + "developer": "GalrionSoftworks", + "models": [ + { + "id": "GalrionSoftworks/MN-LooseCannon-12B-v1", + "name": "MN-LooseCannon-12B-v1", + "developer": "GalrionSoftworks", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5418, + "hfopenllm_v2/BBH": 0.5128, + "hfopenllm_v2/MATH Level 5": 0.0853, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4138, + "hfopenllm_v2/MMLU-PRO": 0.3196 + } + }, + { + "id": "GalrionSoftworks/MagnusIntellectus-12B-v1", + "name": "MagnusIntellectus-12B-v1", + "developer": "GalrionSoftworks", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4421, + "hfopenllm_v2/BBH": 0.5323, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4428, + "hfopenllm_v2/MMLU-PRO": 0.3421 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/GenVRadmin.json b/data/developers/GenVRadmin.json new file mode 100644 index 0000000000000000000000000000000000000000..ba52e8492d8f5c76d78f479e48e0f756596b4bac --- /dev/null +++ b/data/developers/GenVRadmin.json @@ -0,0 +1,61 @@ +{ + "developer": "GenVRadmin", + "models": [ + { + "id": "GenVRadmin/AryaBhatta-GemmaOrca-2-Merged", + "name": "AryaBhatta-GemmaOrca-2-Merged", + "developer": "GenVRadmin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3064, + "hfopenllm_v2/BBH": 0.3887, + "hfopenllm_v2/MATH Level 5": 0.0498, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.455, + "hfopenllm_v2/MMLU-PRO": 0.2384 + } + }, + { + "id": "GenVRadmin/AryaBhatta-GemmaOrca-Merged", + "name": "AryaBhatta-GemmaOrca-Merged", + "developer": "GenVRadmin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3064, + "hfopenllm_v2/BBH": 0.4131, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3524, + "hfopenllm_v2/MMLU-PRO": 0.2228 + } + }, + { + "id": "GenVRadmin/AryaBhatta-GemmaUltra-Merged", + "name": "AryaBhatta-GemmaUltra-Merged", + "developer": "GenVRadmin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3021, + "hfopenllm_v2/BBH": 0.4141, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.4279, + "hfopenllm_v2/MMLU-PRO": 0.2266 + } + }, + { + "id": "GenVRadmin/llama38bGenZ_Vikas-Merged", + "name": "llama38bGenZ_Vikas-Merged", + "developer": "GenVRadmin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3, + "hfopenllm_v2/BBH": 0.4536, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4402, + "hfopenllm_v2/MMLU-PRO": 0.2622 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/GoToCompany.json b/data/developers/GoToCompany.json new file mode 100644 index 0000000000000000000000000000000000000000..4d0524802da3cdd1c3199a75ac08df458128a62c --- /dev/null +++ b/data/developers/GoToCompany.json @@ -0,0 +1,33 @@ +{ + "developer": "GoToCompany", + "models": [ + { + "id": "GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct", + "name": "gemma2-9b-cpt-sahabatai-v1-instruct", + "developer": "GoToCompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6551, + "hfopenllm_v2/BBH": 0.5955, + "hfopenllm_v2/MATH Level 5": 0.2054, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4779, + "hfopenllm_v2/MMLU-PRO": 0.4264 + } + }, + { + "id": "GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct", + "name": "llama3-8b-cpt-sahabatai-v1-instruct", + "developer": "GoToCompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5238, + "hfopenllm_v2/BBH": 0.4951, + "hfopenllm_v2/MATH Level 5": 0.1276, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.4488, + "hfopenllm_v2/MMLU-PRO": 0.3453 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Goekdeniz-Guelmez.json b/data/developers/Goekdeniz-Guelmez.json new file mode 100644 index 0000000000000000000000000000000000000000..55f817ee11d95051df287aacc2d790d8c57602b9 --- /dev/null +++ b/data/developers/Goekdeniz-Guelmez.json @@ -0,0 +1,145 @@ +{ + "developer": "Goekdeniz-Guelmez", + "models": [ + { + "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1", + "name": "Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1", + "developer": "Goekdeniz-Guelmez", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3472, + "hfopenllm_v2/BBH": 0.3268, + "hfopenllm_v2/MATH Level 5": 0.0891, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1641 + } + }, + { + "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1", + "name": "Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1", + "developer": "Goekdeniz-Guelmez", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4769, + "hfopenllm_v2/BBH": 0.4186, + "hfopenllm_v2/MATH Level 5": 0.2085, + "hfopenllm_v2/GPQA": 0.2433, + "hfopenllm_v2/MUSR": 0.3675, + "hfopenllm_v2/MMLU-PRO": 0.2783 + } + }, + { + "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2", + "name": "Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2", + "developer": "Goekdeniz-Guelmez", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4216, + "hfopenllm_v2/BBH": 0.4042, + "hfopenllm_v2/MATH Level 5": 0.1269, + "hfopenllm_v2/GPQA": 0.2399, + "hfopenllm_v2/MUSR": 0.3769, + "hfopenllm_v2/MMLU-PRO": 0.2562 + } + }, + { + "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3", + "name": "Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3", + "developer": "Goekdeniz-Guelmez", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4253, + "hfopenllm_v2/BBH": 0.4053, + "hfopenllm_v2/MATH Level 5": 0.1307, + "hfopenllm_v2/GPQA": 0.2433, + "hfopenllm_v2/MUSR": 0.3702, + "hfopenllm_v2/MMLU-PRO": 0.2556 + } + }, + { + "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-14B-Instruct-abliterated-v4", + "name": "Josiefied-Qwen2.5-14B-Instruct-abliterated-v4", + "developer": "Goekdeniz-Guelmez", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8292, + "hfopenllm_v2/BBH": 0.6356, + "hfopenllm_v2/MATH Level 5": 0.5423, + "hfopenllm_v2/GPQA": 0.3423, + "hfopenllm_v2/MUSR": 0.4287, + "hfopenllm_v2/MMLU-PRO": 0.5018 + } + }, + { + "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", + "name": "Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", + "developer": "Goekdeniz-Guelmez", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7814, + "hfopenllm_v2/BBH": 0.531, + "hfopenllm_v2/MATH Level 5": 0.4532, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4354, + "hfopenllm_v2/MMLU-PRO": 0.412 + } + }, + { + "id": "Goekdeniz-Guelmez/j.o.s.i.e.v4o-1.5b-dpo-stage1-v1", + "name": "j.o.s.i.e.v4o-1.5b-dpo-stage1-v1", + "developer": "Goekdeniz-Guelmez", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4188, + "hfopenllm_v2/BBH": 0.4124, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3529, + "hfopenllm_v2/MMLU-PRO": 0.2555 + } + }, + { + "id": "Goekdeniz-Guelmez/josie-3b-v6.0", + "name": "josie-3b-v6.0", + "developer": "Goekdeniz-Guelmez", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.601, + "hfopenllm_v2/BBH": 0.4496, + "hfopenllm_v2/MATH Level 5": 0.2938, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3861, + "hfopenllm_v2/MMLU-PRO": 0.322 + } + }, + { + "id": "Goekdeniz-Guelmez/josie-7b-v6.0", + "name": "josie-7b-v6.0", + "developer": "Goekdeniz-Guelmez", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7412, + "hfopenllm_v2/BBH": 0.5105, + "hfopenllm_v2/MATH Level 5": 0.4358, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4154, + "hfopenllm_v2/MMLU-PRO": 0.3807 + } + }, + { + "id": "Goekdeniz-Guelmez/josie-7b-v6.0-step2000", + "name": "josie-7b-v6.0-step2000", + "developer": "Goekdeniz-Guelmez", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7598, + "hfopenllm_v2/BBH": 0.5107, + "hfopenllm_v2/MATH Level 5": 0.4237, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.4539, + "hfopenllm_v2/MMLU-PRO": 0.4012 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Google.json b/data/developers/Google.json new file mode 100644 index 0000000000000000000000000000000000000000..a72418ac229d3b0c23d277e602231dc9424cf54c --- /dev/null +++ b/data/developers/Google.json @@ -0,0 +1,65 @@ +{ + "developer": "Google", + "models": [ + { + "id": "google/gemini-3-flash", + "name": "Gemini 3 Flash", + "developer": "Google", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 47.4 + } + }, + { + "id": "google/gemini-3-pro", + "name": "Gemini 3 Pro", + "developer": "Google", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 62.2 + } + }, + { + "id": "google/gemini-3-pro-preview", + "name": "gemini-3-pro-preview", + "developer": "Google", + "evaluator_relationship": null, + "benchmark_scores": { + "appworld_test_normal/appworld/test_normal": 0.505, + "browsecompplus/browsecompplus": 0.48, + "global-mmlu-lite/Global MMLU Lite": 0.9453, + "global-mmlu-lite/Culturally Sensitive": 0.9397, + "global-mmlu-lite/Culturally Agnostic": 0.9509, + "global-mmlu-lite/Arabic": 0.9475, + "global-mmlu-lite/English": 0.9425, + "global-mmlu-lite/Bengali": 0.9425, + "global-mmlu-lite/German": 0.94, + "global-mmlu-lite/French": 0.9575, + "global-mmlu-lite/Hindi": 0.9425, + "global-mmlu-lite/Indonesian": 0.955, + "global-mmlu-lite/Italian": 0.955, + "global-mmlu-lite/Japanese": 0.94, + "global-mmlu-lite/Korean": 0.94, + "global-mmlu-lite/Portuguese": 0.9425, + "global-mmlu-lite/Spanish": 0.9475, + "global-mmlu-lite/Swahili": 0.94, + "global-mmlu-lite/Yoruba": 0.9425, + "global-mmlu-lite/Chinese": 0.9475, + "global-mmlu-lite/Burmese": 0.9425, + "swe-bench/swe-bench": 0.7234, + "tau-bench-2_airline/tau-bench-2/airline": 0.68, + "tau-bench-2_retail/tau-bench-2/retail": 0.7805, + "tau-bench-2_telecom/tau-bench-2/telecom": 0.73 + } + }, + { + "id": "google/gemini-3.1-pro", + "name": "Gemini 3.1 Pro", + "developer": "Google", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 78.4 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/GreenNode.json b/data/developers/GreenNode.json new file mode 100644 index 0000000000000000000000000000000000000000..384914502d43cf5d86dc21c52d2b35cf0ad5800b --- /dev/null +++ b/data/developers/GreenNode.json @@ -0,0 +1,19 @@ +{ + "developer": "GreenNode", + "models": [ + { + "id": "GreenNode/GreenNode-small-9B-it", + "name": "GreenNode-small-9B-it", + "developer": "GreenNode", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7436, + "hfopenllm_v2/BBH": 0.5994, + "hfopenllm_v2/MATH Level 5": 0.1745, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4204, + "hfopenllm_v2/MMLU-PRO": 0.3927 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/GritLM.json b/data/developers/GritLM.json new file mode 100644 index 0000000000000000000000000000000000000000..5600a5186bede48b0c614f8ee4392c434310cd12 --- /dev/null +++ b/data/developers/GritLM.json @@ -0,0 +1,33 @@ +{ + "developer": "GritLM", + "models": [ + { + "id": "GritLM/GritLM-7B-KTO", + "name": "GritLM-7B-KTO", + "developer": "GritLM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.531, + "hfopenllm_v2/BBH": 0.4853, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.371, + "hfopenllm_v2/MMLU-PRO": 0.268 + } + }, + { + "id": "GritLM/GritLM-8x7B-KTO", + "name": "GritLM-8x7B-KTO", + "developer": "GritLM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5714, + "hfopenllm_v2/BBH": 0.582, + "hfopenllm_v2/MATH Level 5": 0.1224, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4217, + "hfopenllm_v2/MMLU-PRO": 0.3648 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Groq.json b/data/developers/Groq.json new file mode 100644 index 0000000000000000000000000000000000000000..63e2893de00b9ebd1d7c878df5a729bcfa230e2a --- /dev/null +++ b/data/developers/Groq.json @@ -0,0 +1,19 @@ +{ + "developer": "Groq", + "models": [ + { + "id": "Groq/Llama-3-Groq-8B-Tool-Use", + "name": "Llama-3-Groq-8B-Tool-Use", + "developer": "Groq", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6098, + "hfopenllm_v2/BBH": 0.4863, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.366, + "hfopenllm_v2/MMLU-PRO": 0.3399 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Gryphe.json b/data/developers/Gryphe.json new file mode 100644 index 0000000000000000000000000000000000000000..fb499a575b0a6874127b742b0e6e79f47ba72055 --- /dev/null +++ b/data/developers/Gryphe.json @@ -0,0 +1,75 @@ +{ + "developer": "Gryphe", + "models": [ + { + "id": "Gryphe/Pantheon-RP-1.0-8b-Llama-3", + "name": "Pantheon-RP-1.0-8b-Llama-3", + "developer": "Gryphe", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3933, + "hfopenllm_v2/BBH": 0.4539, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3832, + "hfopenllm_v2/MMLU-PRO": 0.3067 + } + }, + { + "id": "Gryphe/Pantheon-RP-1.5-12b-Nemo", + "name": "Pantheon-RP-1.5-12b-Nemo", + "developer": "Gryphe", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4763, + "hfopenllm_v2/BBH": 0.5196, + "hfopenllm_v2/MATH Level 5": 0.0491, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.442, + "hfopenllm_v2/MMLU-PRO": 0.3302 + } + }, + { + "id": "Gryphe/Pantheon-RP-1.6-12b-Nemo", + "name": "Pantheon-RP-1.6-12b-Nemo", + "developer": "Gryphe", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4481, + "hfopenllm_v2/BBH": 0.5204, + "hfopenllm_v2/MATH Level 5": 0.0461, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4288, + "hfopenllm_v2/MMLU-PRO": 0.3311 + } + }, + { + "id": "Gryphe/Pantheon-RP-1.6-12b-Nemo-KTO", + "name": "Pantheon-RP-1.6-12b-Nemo-KTO", + "developer": "Gryphe", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4636, + "hfopenllm_v2/BBH": 0.5277, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4248, + "hfopenllm_v2/MMLU-PRO": 0.3382 + } + }, + { + "id": "Gryphe/Pantheon-RP-Pure-1.6.2-22b-Small", + "name": "Pantheon-RP-Pure-1.6.2-22b-Small", + "developer": "Gryphe", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6931, + "hfopenllm_v2/BBH": 0.5305, + "hfopenllm_v2/MATH Level 5": 0.2024, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.3765, + "hfopenllm_v2/MMLU-PRO": 0.3942 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/GuilhermeNaturaUmana.json b/data/developers/GuilhermeNaturaUmana.json new file mode 100644 index 0000000000000000000000000000000000000000..42443971cc3612439e60073a685d6fc6ddf14fe4 --- /dev/null +++ b/data/developers/GuilhermeNaturaUmana.json @@ -0,0 +1,19 @@ +{ + "developer": "GuilhermeNaturaUmana", + "models": [ + { + "id": "GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall", + "name": "Nature-Reason-1.2-reallysmall", + "developer": "GuilhermeNaturaUmana", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4791, + "hfopenllm_v2/BBH": 0.5649, + "hfopenllm_v2/MATH Level 5": 0.25, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4439, + "hfopenllm_v2/MMLU-PRO": 0.4408 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Gunulhona.json b/data/developers/Gunulhona.json new file mode 100644 index 0000000000000000000000000000000000000000..3d63c85d80df83c6632e31f815d8dc78510d19a0 --- /dev/null +++ b/data/developers/Gunulhona.json @@ -0,0 +1,33 @@ +{ + "developer": "Gunulhona", + "models": [ + { + "id": "Gunulhona/Gemma-Ko-Merge", + "name": "Gemma-Ko-Merge", + "developer": "Gunulhona", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6416, + "hfopenllm_v2/BBH": 0.5813, + "hfopenllm_v2/MATH Level 5": 0.1881, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.4047, + "hfopenllm_v2/MMLU-PRO": 0.3879 + } + }, + { + "id": "Gunulhona/Gemma-Ko-Merge-PEFT", + "name": "Gemma-Ko-Merge-PEFT", + "developer": "Gunulhona", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.288, + "hfopenllm_v2/BBH": 0.5154, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.408, + "hfopenllm_v2/MMLU-PRO": 0.3817 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/HFXM.json b/data/developers/HFXM.json new file mode 100644 index 0000000000000000000000000000000000000000..0cc645611ef3598fb6468fc4c126c9a8eb5680a6 --- /dev/null +++ b/data/developers/HFXM.json @@ -0,0 +1,20 @@ +{ + "developer": "HFXM", + "models": [ + { + "id": "HFXM/RAMO-Llama3.1-8B", + "name": "HFXM/RAMO-Llama3.1-8B", + "developer": "HFXM", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6917, + "reward-bench/Factuality": 0.6547, + "reward-bench/Precise IF": 0.375, + "reward-bench/Math": 0.5628, + "reward-bench/Safety": 0.9756, + "reward-bench/Focus": 0.9071, + "reward-bench/Ties": 0.6752 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/HPAI-BSC.json b/data/developers/HPAI-BSC.json new file mode 100644 index 0000000000000000000000000000000000000000..8e5820265e260824ba737e99923df14285fb7e0e --- /dev/null +++ b/data/developers/HPAI-BSC.json @@ -0,0 +1,47 @@ +{ + "developer": "HPAI-BSC", + "models": [ + { + "id": "HPAI-BSC/Llama3-Aloe-8B-Alpha", + "name": "Llama3-Aloe-8B-Alpha", + "developer": "HPAI-BSC", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5081, + "hfopenllm_v2/BBH": 0.4831, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3673, + "hfopenllm_v2/MMLU-PRO": 0.3295 + } + }, + { + "id": "HPAI-BSC/Llama3.1-Aloe-Beta-8B", + "name": "Llama3.1-Aloe-Beta-8B", + "developer": "HPAI-BSC", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7253, + "hfopenllm_v2/BBH": 0.5093, + "hfopenllm_v2/MATH Level 5": 0.1828, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3835, + "hfopenllm_v2/MMLU-PRO": 0.358 + } + }, + { + "id": "HPAI-BSC/Qwen2.5-Aloe-Beta-7B", + "name": "Qwen2.5-Aloe-Beta-7B", + "developer": "HPAI-BSC", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4554, + "hfopenllm_v2/BBH": 0.5049, + "hfopenllm_v2/MATH Level 5": 0.3542, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.426, + "hfopenllm_v2/MMLU-PRO": 0.4354 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/HarbingerX.json b/data/developers/HarbingerX.json new file mode 100644 index 0000000000000000000000000000000000000000..caedf22e726fadfc7fba92e0aa30e9f3d314bbdc --- /dev/null +++ b/data/developers/HarbingerX.json @@ -0,0 +1,33 @@ +{ + "developer": "HarbingerX", + "models": [ + { + "id": "HarbingerX/Zeitgeist-3b-V1", + "name": "Zeitgeist-3b-V1", + "developer": "HarbingerX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6712, + "hfopenllm_v2/BBH": 0.4441, + "hfopenllm_v2/MATH Level 5": 0.1035, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.3579, + "hfopenllm_v2/MMLU-PRO": 0.3009 + } + }, + { + "id": "HarbingerX/Zeitgeist-3b-V1.2", + "name": "Zeitgeist-3b-V1.2", + "developer": "HarbingerX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6754, + "hfopenllm_v2/BBH": 0.4441, + "hfopenllm_v2/MATH Level 5": 0.1012, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3579, + "hfopenllm_v2/MMLU-PRO": 0.3056 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Hastagaras.json b/data/developers/Hastagaras.json new file mode 100644 index 0000000000000000000000000000000000000000..4a3c55d43f86eaa49c1c3182528673849d0a1b88 --- /dev/null +++ b/data/developers/Hastagaras.json @@ -0,0 +1,47 @@ +{ + "developer": "Hastagaras", + "models": [ + { + "id": "Hastagaras/L3.2-JametMini-3B-MK.III", + "name": "L3.2-JametMini-3B-MK.III", + "developer": "Hastagaras", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6183, + "hfopenllm_v2/BBH": 0.4539, + "hfopenllm_v2/MATH Level 5": 0.1458, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3686, + "hfopenllm_v2/MMLU-PRO": 0.2983 + } + }, + { + "id": "Hastagaras/Llama-3.1-Jamet-8B-MK.I", + "name": "Llama-3.1-Jamet-8B-MK.I", + "developer": "Hastagaras", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7338, + "hfopenllm_v2/BBH": 0.5049, + "hfopenllm_v2/MATH Level 5": 0.1269, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3726, + "hfopenllm_v2/MMLU-PRO": 0.3482 + } + }, + { + "id": "Hastagaras/Zabuza-8B-Llama-3.1", + "name": "Zabuza-8B-Llama-3.1", + "developer": "Hastagaras", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6265, + "hfopenllm_v2/BBH": 0.4539, + "hfopenllm_v2/MATH Level 5": 0.0551, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3568, + "hfopenllm_v2/MMLU-PRO": 0.2923 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/HelpingAI.json b/data/developers/HelpingAI.json new file mode 100644 index 0000000000000000000000000000000000000000..085694447262ec4c5f7e4be954741a8e8210e8e2 --- /dev/null +++ b/data/developers/HelpingAI.json @@ -0,0 +1,61 @@ +{ + "developer": "HelpingAI", + "models": [ + { + "id": "HelpingAI/Cipher-20B", + "name": "Cipher-20B", + "developer": "HelpingAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5378, + "hfopenllm_v2/BBH": 0.6032, + "hfopenllm_v2/MATH Level 5": 0.1994, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4003, + "hfopenllm_v2/MMLU-PRO": 0.3744 + } + }, + { + "id": "HelpingAI/Dhanishtha-Large", + "name": "Dhanishtha-Large", + "developer": "HelpingAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2457, + "hfopenllm_v2/BBH": 0.4604, + "hfopenllm_v2/MATH Level 5": 0.3852, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.3845, + "hfopenllm_v2/MMLU-PRO": 0.2755 + } + }, + { + "id": "HelpingAI/Priya-10B", + "name": "Priya-10B", + "developer": "HelpingAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4043, + "hfopenllm_v2/BBH": 0.4441, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3793, + "hfopenllm_v2/MMLU-PRO": 0.2493 + } + }, + { + "id": "HelpingAI/Priya-3B", + "name": "Priya-3B", + "developer": "HelpingAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4526, + "hfopenllm_v2/BBH": 0.3961, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3713, + "hfopenllm_v2/MMLU-PRO": 0.2339 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/HeraiHench.json b/data/developers/HeraiHench.json new file mode 100644 index 0000000000000000000000000000000000000000..bc8b8bfdab897d4156c4ef68a19ecb0f65fe6582 --- /dev/null +++ b/data/developers/HeraiHench.json @@ -0,0 +1,61 @@ +{ + "developer": "HeraiHench", + "models": [ + { + "id": "HeraiHench/DeepSeek-R1-Qwen-Coder-8B", + "name": "DeepSeek-R1-Qwen-Coder-8B", + "developer": "HeraiHench", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1869, + "hfopenllm_v2/BBH": 0.2913, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3738, + "hfopenllm_v2/MMLU-PRO": 0.1123 + } + }, + { + "id": "HeraiHench/Double-Down-Qwen-Math-7B", + "name": "Double-Down-Qwen-Math-7B", + "developer": "HeraiHench", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.167, + "hfopenllm_v2/BBH": 0.2845, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3737, + "hfopenllm_v2/MMLU-PRO": 0.1112 + } + }, + { + "id": "HeraiHench/Marge-Qwen-Math-7B", + "name": "Marge-Qwen-Math-7B", + "developer": "HeraiHench", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1262, + "hfopenllm_v2/BBH": 0.3069, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2391, + "hfopenllm_v2/MUSR": 0.3939, + "hfopenllm_v2/MMLU-PRO": 0.1056 + } + }, + { + "id": "HeraiHench/Phi-4-slerp-ReasoningRP-14B", + "name": "Phi-4-slerp-ReasoningRP-14B", + "developer": "HeraiHench", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1575, + "hfopenllm_v2/BBH": 0.4196, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.3116, + "hfopenllm_v2/MMLU-PRO": 0.19 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/HiroseKoichi.json b/data/developers/HiroseKoichi.json new file mode 100644 index 0000000000000000000000000000000000000000..3c3233e49f4e56f29c0e270ad345bced0ce862bb --- /dev/null +++ b/data/developers/HiroseKoichi.json @@ -0,0 +1,19 @@ +{ + "developer": "HiroseKoichi", + "models": [ + { + "id": "HiroseKoichi/Llama-Salad-4x8B-V3", + "name": "Llama-Salad-4x8B-V3", + "developer": "HiroseKoichi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6654, + "hfopenllm_v2/BBH": 0.5245, + "hfopenllm_v2/MATH Level 5": 0.0959, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.374, + "hfopenllm_v2/MMLU-PRO": 0.3518 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/HoangHa.json b/data/developers/HoangHa.json new file mode 100644 index 0000000000000000000000000000000000000000..8ad548c64bf7c7d209db7b3b149c712f0eb52220 --- /dev/null +++ b/data/developers/HoangHa.json @@ -0,0 +1,19 @@ +{ + "developer": "HoangHa", + "models": [ + { + "id": "HoangHa/Pensez-Llama3.1-8B", + "name": "Pensez-Llama3.1-8B", + "developer": "HoangHa", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3887, + "hfopenllm_v2/BBH": 0.4669, + "hfopenllm_v2/MATH Level 5": 0.1148, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3597, + "hfopenllm_v2/MMLU-PRO": 0.3126 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/HuggingFaceH4.json b/data/developers/HuggingFaceH4.json new file mode 100644 index 0000000000000000000000000000000000000000..d983a21bea96bc2344d9d2a5c3498bcf14b73acc --- /dev/null +++ b/data/developers/HuggingFaceH4.json @@ -0,0 +1,93 @@ +{ + "developer": "HuggingFaceH4", + "models": [ + { + "id": "HuggingFaceH4/starchat2-15b-v0.1", + "name": "HuggingFaceH4/starchat2-15b-v0.1", + "developer": "HuggingFaceH4", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7322, + "reward-bench/Chat": 0.9385, + "reward-bench/Chat Hard": 0.5548, + "reward-bench/Safety": 0.7095, + "reward-bench/Reasoning": 0.8159, + "reward-bench/Prior Sets (0.5 weight)": 0.5525 + } + }, + { + "id": "HuggingFaceH4/zephyr-7b-alpha", + "name": "zephyr-7b-alpha", + "developer": "HuggingFaceH4", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5191, + "hfopenllm_v2/BBH": 0.4583, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.395, + "hfopenllm_v2/MMLU-PRO": 0.2795, + "reward-bench/Score": 0.7392, + "reward-bench/Chat": 0.9162, + "reward-bench/Chat Hard": 0.625, + "reward-bench/Safety": 0.7662, + "reward-bench/Reasoning": 0.7514, + "reward-bench/Prior Sets (0.5 weight)": 0.5353 + } + }, + { + "id": "HuggingFaceH4/zephyr-7b-beta", + "name": "zephyr-7b-beta", + "developer": "HuggingFaceH4", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.495, + "hfopenllm_v2/BBH": 0.4316, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3925, + "hfopenllm_v2/MMLU-PRO": 0.2781, + "reward-bench/Score": 0.7281, + "reward-bench/Chat": 0.9525, + "reward-bench/Chat Hard": 0.6272, + "reward-bench/Safety": 0.6568, + "reward-bench/Reasoning": 0.7789, + "reward-bench/Prior Sets (0.5 weight)": 0.5216 + } + }, + { + "id": "HuggingFaceH4/zephyr-7b-gemma-v0.1", + "name": "zephyr-7b-gemma-v0.1", + "developer": "HuggingFaceH4", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3364, + "hfopenllm_v2/BBH": 0.4624, + "hfopenllm_v2/MATH Level 5": 0.0816, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.374, + "hfopenllm_v2/MMLU-PRO": 0.2847, + "reward-bench/Score": 0.6758, + "reward-bench/Chat": 0.9581, + "reward-bench/Chat Hard": 0.4956, + "reward-bench/Safety": 0.5824, + "reward-bench/Reasoning": 0.7463, + "reward-bench/Prior Sets (0.5 weight)": 0.5171 + } + }, + { + "id": "HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1", + "name": "zephyr-orpo-141b-A35b-v0.1", + "developer": "HuggingFaceH4", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6511, + "hfopenllm_v2/BBH": 0.629, + "hfopenllm_v2/MATH Level 5": 0.2047, + "hfopenllm_v2/GPQA": 0.3784, + "hfopenllm_v2/MUSR": 0.4465, + "hfopenllm_v2/MMLU-PRO": 0.4586 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/HuggingFaceTB.json b/data/developers/HuggingFaceTB.json new file mode 100644 index 0000000000000000000000000000000000000000..bed31781473fb30427be579aab45ef01bf5054ce --- /dev/null +++ b/data/developers/HuggingFaceTB.json @@ -0,0 +1,173 @@ +{ + "developer": "HuggingFaceTB", + "models": [ + { + "id": "HuggingFaceTB/SmolLM-1.7B", + "name": "SmolLM-1.7B", + "developer": "HuggingFaceTB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2362, + "hfopenllm_v2/BBH": 0.3181, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2416, + "hfopenllm_v2/MUSR": 0.3421, + "hfopenllm_v2/MMLU-PRO": 0.1148 + } + }, + { + "id": "HuggingFaceTB/SmolLM-1.7B-Instruct", + "name": "SmolLM-1.7B-Instruct", + "developer": "HuggingFaceTB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2348, + "hfopenllm_v2/BBH": 0.2885, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3487, + "hfopenllm_v2/MMLU-PRO": 0.1166 + } + }, + { + "id": "HuggingFaceTB/SmolLM-135M", + "name": "SmolLM-135M", + "developer": "HuggingFaceTB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2125, + "hfopenllm_v2/BBH": 0.3046, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.4366, + "hfopenllm_v2/MMLU-PRO": 0.1122 + } + }, + { + "id": "HuggingFaceTB/SmolLM-135M-Instruct", + "name": "SmolLM-135M-Instruct", + "developer": "HuggingFaceTB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1214, + "hfopenllm_v2/BBH": 0.3015, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3635, + "hfopenllm_v2/MMLU-PRO": 0.1176 + } + }, + { + "id": "HuggingFaceTB/SmolLM-360M", + "name": "SmolLM-360M", + "developer": "HuggingFaceTB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2134, + "hfopenllm_v2/BBH": 0.3065, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.4018, + "hfopenllm_v2/MMLU-PRO": 0.1124 + } + }, + { + "id": "HuggingFaceTB/SmolLM-360M-Instruct", + "name": "SmolLM-360M-Instruct", + "developer": "HuggingFaceTB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1952, + "hfopenllm_v2/BBH": 0.2885, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3472, + "hfopenllm_v2/MMLU-PRO": 0.1166 + } + }, + { + "id": "HuggingFaceTB/SmolLM2-1.7B", + "name": "SmolLM2-1.7B", + "developer": "HuggingFaceTB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.244, + "hfopenllm_v2/BBH": 0.3453, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3485, + "hfopenllm_v2/MMLU-PRO": 0.2138 + } + }, + { + "id": "HuggingFaceTB/SmolLM2-1.7B-Instruct", + "name": "SmolLM2-1.7B-Instruct", + "developer": "HuggingFaceTB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5368, + "hfopenllm_v2/BBH": 0.3599, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3421, + "hfopenllm_v2/MMLU-PRO": 0.2054 + } + }, + { + "id": "HuggingFaceTB/SmolLM2-135M", + "name": "SmolLM2-135M", + "developer": "HuggingFaceTB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1818, + "hfopenllm_v2/BBH": 0.3044, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.4112, + "hfopenllm_v2/MMLU-PRO": 0.1095 + } + }, + { + "id": "HuggingFaceTB/SmolLM2-135M-Instruct", + "name": "SmolLM2-135M-Instruct", + "developer": "HuggingFaceTB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0593, + "hfopenllm_v2/BBH": 0.3135, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2341, + "hfopenllm_v2/MUSR": 0.3871, + "hfopenllm_v2/MMLU-PRO": 0.1092 + } + }, + { + "id": "HuggingFaceTB/SmolLM2-360M", + "name": "SmolLM2-360M", + "developer": "HuggingFaceTB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2115, + "hfopenllm_v2/BBH": 0.3233, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.3954, + "hfopenllm_v2/MMLU-PRO": 0.1169 + } + }, + { + "id": "HuggingFaceTB/SmolLM2-360M-Instruct", + "name": "SmolLM2-360M-Instruct", + "developer": "HuggingFaceTB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.083, + "hfopenllm_v2/BBH": 0.3053, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3423, + "hfopenllm_v2/MMLU-PRO": 0.1126 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/HumanLLMs.json b/data/developers/HumanLLMs.json new file mode 100644 index 0000000000000000000000000000000000000000..d846fdb1c31c64fe2700aca95b192ad5efdb2343 --- /dev/null +++ b/data/developers/HumanLLMs.json @@ -0,0 +1,47 @@ +{ + "developer": "HumanLLMs", + "models": [ + { + "id": "HumanLLMs/Humanish-LLama3-8B-Instruct", + "name": "Humanish-LLama3-8B-Instruct", + "developer": "HumanLLMs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6498, + "hfopenllm_v2/BBH": 0.4968, + "hfopenllm_v2/MATH Level 5": 0.1027, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3582, + "hfopenllm_v2/MMLU-PRO": 0.3702 + } + }, + { + "id": "HumanLLMs/Humanish-Mistral-Nemo-Instruct-2407", + "name": "Humanish-Mistral-Nemo-Instruct-2407", + "developer": "HumanLLMs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5451, + "hfopenllm_v2/BBH": 0.5262, + "hfopenllm_v2/MATH Level 5": 0.1367, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3968, + "hfopenllm_v2/MMLU-PRO": 0.3521 + } + }, + { + "id": "HumanLLMs/Humanish-Qwen2.5-7B-Instruct", + "name": "Humanish-Qwen2.5-7B-Instruct", + "developer": "HumanLLMs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7284, + "hfopenllm_v2/BBH": 0.5364, + "hfopenllm_v2/MATH Level 5": 0.5, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.3981, + "hfopenllm_v2/MMLU-PRO": 0.4398 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/IDEA-CCNL.json b/data/developers/IDEA-CCNL.json new file mode 100644 index 0000000000000000000000000000000000000000..6f1c086643897d19e96a9d5282c4cb8fb5a80256 --- /dev/null +++ b/data/developers/IDEA-CCNL.json @@ -0,0 +1,33 @@ +{ + "developer": "IDEA-CCNL", + "models": [ + { + "id": "IDEA-CCNL/Ziya-LLaMA-13B-v1", + "name": "Ziya-LLaMA-13B-v1", + "developer": "IDEA-CCNL", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1697, + "hfopenllm_v2/BBH": 0.2877, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.3751, + "hfopenllm_v2/MMLU-PRO": 0.1101 + } + }, + { + "id": "IDEA-CCNL/Ziya-LLaMA-7B-Reward", + "name": "IDEA-CCNL/Ziya-LLaMA-7B-Reward", + "developer": "IDEA-CCNL", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6378, + "reward-bench/Chat": 0.8687, + "reward-bench/Chat Hard": 0.4605, + "reward-bench/Safety": 0.6405, + "reward-bench/Reasoning": 0.5775, + "reward-bench/Prior Sets (0.5 weight)": 0.6461 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/INSAIT-Institute.json b/data/developers/INSAIT-Institute.json new file mode 100644 index 0000000000000000000000000000000000000000..3eb93e42a7e5a4cb23315af5f5a331cdf6b4feca --- /dev/null +++ b/data/developers/INSAIT-Institute.json @@ -0,0 +1,19 @@ +{ + "developer": "INSAIT-Institute", + "models": [ + { + "id": "INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0", + "name": "BgGPT-Gemma-2-27B-IT-v1.0", + "developer": "INSAIT-Institute", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0, + "hfopenllm_v2/BBH": 0.2912, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3575, + "hfopenllm_v2/MMLU-PRO": 0.1167 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/IlyaGusev.json b/data/developers/IlyaGusev.json new file mode 100644 index 0000000000000000000000000000000000000000..5fa31a756a7150f9e665c6180fc5857924738f82 --- /dev/null +++ b/data/developers/IlyaGusev.json @@ -0,0 +1,33 @@ +{ + "developer": "IlyaGusev", + "models": [ + { + "id": "IlyaGusev/gemma-2-2b-it-abliterated", + "name": "gemma-2-2b-it-abliterated", + "developer": "IlyaGusev", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5331, + "hfopenllm_v2/BBH": 0.4119, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3782, + "hfopenllm_v2/MMLU-PRO": 0.2538 + } + }, + { + "id": "IlyaGusev/gemma-2-9b-it-abliterated", + "name": "gemma-2-9b-it-abliterated", + "developer": "IlyaGusev", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7473, + "hfopenllm_v2/BBH": 0.5906, + "hfopenllm_v2/MATH Level 5": 0.1775, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4034, + "hfopenllm_v2/MMLU-PRO": 0.3915 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Infinirc.json b/data/developers/Infinirc.json new file mode 100644 index 0000000000000000000000000000000000000000..f08721bade7b8a610483249e7a508d48a51c77e6 --- /dev/null +++ b/data/developers/Infinirc.json @@ -0,0 +1,19 @@ +{ + "developer": "Infinirc", + "models": [ + { + "id": "Infinirc/Infinirc-Llama3-8B-2G-Release-v1.0", + "name": "Infinirc-Llama3-8B-2G-Release-v1.0", + "developer": "Infinirc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2024, + "hfopenllm_v2/BBH": 0.4351, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4609, + "hfopenllm_v2/MMLU-PRO": 0.216 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Intel.json b/data/developers/Intel.json new file mode 100644 index 0000000000000000000000000000000000000000..3c307773fc949afd4c3654b02313182481348954 --- /dev/null +++ b/data/developers/Intel.json @@ -0,0 +1,61 @@ +{ + "developer": "Intel", + "models": [ + { + "id": "Intel/neural-chat-7b-v3", + "name": "neural-chat-7b-v3", + "developer": "Intel", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2778, + "hfopenllm_v2/BBH": 0.5048, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.5055, + "hfopenllm_v2/MMLU-PRO": 0.2699 + } + }, + { + "id": "Intel/neural-chat-7b-v3-1", + "name": "neural-chat-7b-v3-1", + "developer": "Intel", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4687, + "hfopenllm_v2/BBH": 0.5052, + "hfopenllm_v2/MATH Level 5": 0.0355, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4979, + "hfopenllm_v2/MMLU-PRO": 0.2678 + } + }, + { + "id": "Intel/neural-chat-7b-v3-2", + "name": "neural-chat-7b-v3-2", + "developer": "Intel", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4988, + "hfopenllm_v2/BBH": 0.5032, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4895, + "hfopenllm_v2/MMLU-PRO": 0.2667 + } + }, + { + "id": "Intel/neural-chat-7b-v3-3", + "name": "neural-chat-7b-v3-3", + "developer": "Intel", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4763, + "hfopenllm_v2/BBH": 0.4877, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.486, + "hfopenllm_v2/MMLU-PRO": 0.2625 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/IntervitensInc.json b/data/developers/IntervitensInc.json new file mode 100644 index 0000000000000000000000000000000000000000..0ea9aa4a8d0ee5254edca8c7b51c3fd96c16439e --- /dev/null +++ b/data/developers/IntervitensInc.json @@ -0,0 +1,19 @@ +{ + "developer": "IntervitensInc", + "models": [ + { + "id": "IntervitensInc/internlm2_5-20b-llamafied", + "name": "internlm2_5-20b-llamafied", + "developer": "IntervitensInc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.341, + "hfopenllm_v2/BBH": 0.7478, + "hfopenllm_v2/MATH Level 5": 0.1715, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4475, + "hfopenllm_v2/MMLU-PRO": 0.4051 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Invalid-Null.json b/data/developers/Invalid-Null.json new file mode 100644 index 0000000000000000000000000000000000000000..092ae9a1b72cf4aa02d8fb66f5516fadfff0d956 --- /dev/null +++ b/data/developers/Invalid-Null.json @@ -0,0 +1,33 @@ +{ + "developer": "Invalid-Null", + "models": [ + { + "id": "Invalid-Null/PeiYangMe-0.5", + "name": "PeiYangMe-0.5", + "developer": "Invalid-Null", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1409, + "hfopenllm_v2/BBH": 0.2791, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2441, + "hfopenllm_v2/MUSR": 0.3738, + "hfopenllm_v2/MMLU-PRO": 0.1109 + } + }, + { + "id": "Invalid-Null/PeiYangMe-0.7", + "name": "PeiYangMe-0.7", + "developer": "Invalid-Null", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1491, + "hfopenllm_v2/BBH": 0.3028, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2332, + "hfopenllm_v2/MUSR": 0.3857, + "hfopenllm_v2/MMLU-PRO": 0.1101 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Isaak-Carter.json b/data/developers/Isaak-Carter.json new file mode 100644 index 0000000000000000000000000000000000000000..efbc1cacd0b6bbfbc287a808cc9dcc2a866d50e8 --- /dev/null +++ b/data/developers/Isaak-Carter.json @@ -0,0 +1,47 @@ +{ + "developer": "Isaak-Carter", + "models": [ + { + "id": "Isaak-Carter/JOSIEv4o-8b-stage1-v4", + "name": "JOSIEv4o-8b-stage1-v4", + "developer": "Isaak-Carter", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2553, + "hfopenllm_v2/BBH": 0.4725, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3654, + "hfopenllm_v2/MMLU-PRO": 0.3316 + } + }, + { + "id": "Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated", + "name": "Josiefied-Qwen2.5-7B-Instruct-abliterated", + "developer": "Isaak-Carter", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7317, + "hfopenllm_v2/BBH": 0.5396, + "hfopenllm_v2/MATH Level 5": 0.4924, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4087, + "hfopenllm_v2/MMLU-PRO": 0.4276 + } + }, + { + "id": "Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", + "name": "Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", + "developer": "Isaak-Carter", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7841, + "hfopenllm_v2/BBH": 0.5311, + "hfopenllm_v2/MATH Level 5": 0.4721, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4354, + "hfopenllm_v2/MMLU-PRO": 0.4128 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/J-LAB.json b/data/developers/J-LAB.json new file mode 100644 index 0000000000000000000000000000000000000000..1d3396bc0913242e032ccd1c20ecfd0982c674e7 --- /dev/null +++ b/data/developers/J-LAB.json @@ -0,0 +1,19 @@ +{ + "developer": "J-LAB", + "models": [ + { + "id": "J-LAB/Thynk_orpo", + "name": "Thynk_orpo", + "developer": "J-LAB", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2102, + "hfopenllm_v2/BBH": 0.4463, + "hfopenllm_v2/MATH Level 5": 0.148, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4515, + "hfopenllm_v2/MMLU-PRO": 0.3231 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/JackFram.json b/data/developers/JackFram.json new file mode 100644 index 0000000000000000000000000000000000000000..17a9ecc1505304f429e159e861c561da0bf6d73a --- /dev/null +++ b/data/developers/JackFram.json @@ -0,0 +1,33 @@ +{ + "developer": "JackFram", + "models": [ + { + "id": "JackFram/llama-160m", + "name": "llama-160m", + "developer": "JackFram", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1791, + "hfopenllm_v2/BBH": 0.2888, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3792, + "hfopenllm_v2/MMLU-PRO": 0.1128 + } + }, + { + "id": "JackFram/llama-68m", + "name": "llama-68m", + "developer": "JackFram", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1726, + "hfopenllm_v2/BBH": 0.2936, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.391, + "hfopenllm_v2/MMLU-PRO": 0.1144 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Jacoby746.json b/data/developers/Jacoby746.json new file mode 100644 index 0000000000000000000000000000000000000000..3b115885947526371d1548d3433037196ce9a203 --- /dev/null +++ b/data/developers/Jacoby746.json @@ -0,0 +1,103 @@ +{ + "developer": "Jacoby746", + "models": [ + { + "id": "Jacoby746/Casual-Magnum-34B", + "name": "Casual-Magnum-34B", + "developer": "Jacoby746", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.193, + "hfopenllm_v2/BBH": 0.6032, + "hfopenllm_v2/MATH Level 5": 0.0921, + "hfopenllm_v2/GPQA": 0.3725, + "hfopenllm_v2/MUSR": 0.4078, + "hfopenllm_v2/MMLU-PRO": 0.5184 + } + }, + { + "id": "Jacoby746/Inf-Silent-Kunoichi-v0.1-2x7B", + "name": "Inf-Silent-Kunoichi-v0.1-2x7B", + "developer": "Jacoby746", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.388, + "hfopenllm_v2/BBH": 0.5185, + "hfopenllm_v2/MATH Level 5": 0.071, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.428, + "hfopenllm_v2/MMLU-PRO": 0.3271 + } + }, + { + "id": "Jacoby746/Inf-Silent-Kunoichi-v0.2-2x7B", + "name": "Inf-Silent-Kunoichi-v0.2-2x7B", + "developer": "Jacoby746", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3636, + "hfopenllm_v2/BBH": 0.5209, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.432, + "hfopenllm_v2/MMLU-PRO": 0.3272 + } + }, + { + "id": "Jacoby746/Proto-Athena-4x7B", + "name": "Proto-Athena-4x7B", + "developer": "Jacoby746", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3703, + "hfopenllm_v2/BBH": 0.5107, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4348, + "hfopenllm_v2/MMLU-PRO": 0.3206 + } + }, + { + "id": "Jacoby746/Proto-Athena-v0.2-4x7B", + "name": "Proto-Athena-v0.2-4x7B", + "developer": "Jacoby746", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3752, + "hfopenllm_v2/BBH": 0.5068, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4213, + "hfopenllm_v2/MMLU-PRO": 0.3197 + } + }, + { + "id": "Jacoby746/Proto-Harpy-Blazing-Light-v0.1-2x7B", + "name": "Proto-Harpy-Blazing-Light-v0.1-2x7B", + "developer": "Jacoby746", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4905, + "hfopenllm_v2/BBH": 0.5187, + "hfopenllm_v2/MATH Level 5": 0.0748, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.445, + "hfopenllm_v2/MMLU-PRO": 0.3301 + } + }, + { + "id": "Jacoby746/Proto-Harpy-Spark-v0.1-7B", + "name": "Proto-Harpy-Spark-v0.1-7B", + "developer": "Jacoby746", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4333, + "hfopenllm_v2/BBH": 0.4736, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4317, + "hfopenllm_v2/MMLU-PRO": 0.3069 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/JayHyeon.json b/data/developers/JayHyeon.json new file mode 100644 index 0000000000000000000000000000000000000000..012d652b9f435b4d0cd8bf9ddabf2af337a9ae5b --- /dev/null +++ b/data/developers/JayHyeon.json @@ -0,0 +1,2441 @@ +{ + "developer": "JayHyeon", + "models": [ + { + "id": "JayHyeon/Qwen-0.5B-DPO-1epoch", + "name": "Qwen-0.5B-DPO-1epoch", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2647, + "hfopenllm_v2/BBH": 0.3191, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3352, + "hfopenllm_v2/MMLU-PRO": 0.1558 + } + }, + { + "id": "JayHyeon/Qwen-0.5B-DPO-5epoch", + "name": "Qwen-0.5B-DPO-5epoch", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.257, + "hfopenllm_v2/BBH": 0.3112, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.2433, + "hfopenllm_v2/MUSR": 0.338, + "hfopenllm_v2/MMLU-PRO": 0.1533 + } + }, + { + "id": "JayHyeon/Qwen-0.5B-IRPO-1epoch", + "name": "Qwen-0.5B-IRPO-1epoch", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2589, + "hfopenllm_v2/BBH": 0.3164, + "hfopenllm_v2/MATH Level 5": 0.0317, + "hfopenllm_v2/GPQA": 0.2466, + "hfopenllm_v2/MUSR": 0.3286, + "hfopenllm_v2/MMLU-PRO": 0.15 + } + }, + { + "id": "JayHyeon/Qwen-0.5B-IRPO-5epoch", + "name": "Qwen-0.5B-IRPO-5epoch", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2487, + "hfopenllm_v2/BBH": 0.3189, + "hfopenllm_v2/MATH Level 5": 0.0325, + "hfopenllm_v2/GPQA": 0.2399, + "hfopenllm_v2/MUSR": 0.3287, + "hfopenllm_v2/MMLU-PRO": 0.1507 + } + }, + { + "id": "JayHyeon/Qwen-0.5B-eDPO-1epoch", + "name": "Qwen-0.5B-eDPO-1epoch", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2623, + "hfopenllm_v2/BBH": 0.3181, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.2424, + "hfopenllm_v2/MUSR": 0.3327, + "hfopenllm_v2/MMLU-PRO": 0.1553 + } + }, + { + "id": "JayHyeon/Qwen-0.5B-eDPO-5epoch", + "name": "Qwen-0.5B-eDPO-5epoch", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2477, + "hfopenllm_v2/BBH": 0.3096, + "hfopenllm_v2/MATH Level 5": 0.0234, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.3326, + "hfopenllm_v2/MMLU-PRO": 0.1523 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT", + "name": "Qwen2.5-0.5B-Instruct-SFT", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2768, + "hfopenllm_v2/BBH": 0.3254, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3342, + "hfopenllm_v2/MMLU-PRO": 0.152 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1", + "name": "Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2469, + "hfopenllm_v2/BBH": 0.326, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3434, + "hfopenllm_v2/MMLU-PRO": 0.1575 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1", + "name": "Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2606, + "hfopenllm_v2/BBH": 0.3308, + "hfopenllm_v2/MATH Level 5": 0.0498, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3288, + "hfopenllm_v2/MMLU-PRO": 0.1626 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1", + "name": "Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2529, + "hfopenllm_v2/BBH": 0.3262, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3301, + "hfopenllm_v2/MMLU-PRO": 0.1576 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT", + "name": "Qwen2.5-0.5B-SFT", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1964, + "hfopenllm_v2/BBH": 0.3121, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.3394, + "hfopenllm_v2/MMLU-PRO": 0.1673 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4", + "name": "Qwen2.5-0.5B-SFT-1e-4", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.202, + "hfopenllm_v2/BBH": 0.3017, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3446, + "hfopenllm_v2/MMLU-PRO": 0.1619 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4-2ep", + "name": "Qwen2.5-0.5B-SFT-1e-4-2ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.214, + "hfopenllm_v2/BBH": 0.3172, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2466, + "hfopenllm_v2/MUSR": 0.3473, + "hfopenllm_v2/MMLU-PRO": 0.1537 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4-3ep", + "name": "Qwen2.5-0.5B-SFT-1e-4-3ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2257, + "hfopenllm_v2/BBH": 0.3064, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.3661, + "hfopenllm_v2/MMLU-PRO": 0.1532 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4-5ep", + "name": "Qwen2.5-0.5B-SFT-1e-4-5ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1987, + "hfopenllm_v2/BBH": 0.3104, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3407, + "hfopenllm_v2/MMLU-PRO": 0.1558 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5", + "name": "Qwen2.5-0.5B-SFT-1e-5", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1986, + "hfopenllm_v2/BBH": 0.314, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.346, + "hfopenllm_v2/MMLU-PRO": 0.1698 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5-2ep", + "name": "Qwen2.5-0.5B-SFT-1e-5-2ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1971, + "hfopenllm_v2/BBH": 0.3225, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3368, + "hfopenllm_v2/MMLU-PRO": 0.1651 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5-3ep", + "name": "Qwen2.5-0.5B-SFT-1e-5-3ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2241, + "hfopenllm_v2/BBH": 0.3247, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3353, + "hfopenllm_v2/MMLU-PRO": 0.1689 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5-5ep", + "name": "Qwen2.5-0.5B-SFT-1e-5-5ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2292, + "hfopenllm_v2/BBH": 0.3259, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3235, + "hfopenllm_v2/MMLU-PRO": 0.1688 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4", + "name": "Qwen2.5-0.5B-SFT-2e-4", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2034, + "hfopenllm_v2/BBH": 0.2936, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3434, + "hfopenllm_v2/MMLU-PRO": 0.1413 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4-2ep", + "name": "Qwen2.5-0.5B-SFT-2e-4-2ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1831, + "hfopenllm_v2/BBH": 0.2984, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2424, + "hfopenllm_v2/MUSR": 0.3568, + "hfopenllm_v2/MMLU-PRO": 0.1484 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4-3ep", + "name": "Qwen2.5-0.5B-SFT-2e-4-3ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.199, + "hfopenllm_v2/BBH": 0.311, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3449, + "hfopenllm_v2/MMLU-PRO": 0.1416 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4-5ep", + "name": "Qwen2.5-0.5B-SFT-2e-4-5ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1897, + "hfopenllm_v2/BBH": 0.2936, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3874, + "hfopenllm_v2/MMLU-PRO": 0.1336 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5", + "name": "Qwen2.5-0.5B-SFT-2e-5", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2068, + "hfopenllm_v2/BBH": 0.3204, + "hfopenllm_v2/MATH Level 5": 0.037, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3487, + "hfopenllm_v2/MMLU-PRO": 0.1678 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2201, + "hfopenllm_v2/BBH": 0.3217, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3367, + "hfopenllm_v2/MMLU-PRO": 0.171 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2411, + "hfopenllm_v2/BBH": 0.3167, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3301, + "hfopenllm_v2/MMLU-PRO": 0.1562 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2369, + "hfopenllm_v2/BBH": 0.326, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3355, + "hfopenllm_v2/MMLU-PRO": 0.157 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2262, + "hfopenllm_v2/BBH": 0.3262, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3408, + "hfopenllm_v2/MMLU-PRO": 0.1541 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2508, + "hfopenllm_v2/BBH": 0.3199, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3355, + "hfopenllm_v2/MMLU-PRO": 0.1555 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.239, + "hfopenllm_v2/BBH": 0.3182, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3328, + "hfopenllm_v2/MMLU-PRO": 0.156 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2423, + "hfopenllm_v2/BBH": 0.3154, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3328, + "hfopenllm_v2/MMLU-PRO": 0.1548 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2493, + "hfopenllm_v2/BBH": 0.319, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3341, + "hfopenllm_v2/MMLU-PRO": 0.1561 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2542, + "hfopenllm_v2/BBH": 0.3167, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3289, + "hfopenllm_v2/MMLU-PRO": 0.158 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2451, + "hfopenllm_v2/BBH": 0.316, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3302, + "hfopenllm_v2/MMLU-PRO": 0.1561 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2557, + "hfopenllm_v2/BBH": 0.3142, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3315, + "hfopenllm_v2/MMLU-PRO": 0.1575 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2605, + "hfopenllm_v2/BBH": 0.3167, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3341, + "hfopenllm_v2/MMLU-PRO": 0.1577 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2578, + "hfopenllm_v2/BBH": 0.3173, + "hfopenllm_v2/MATH Level 5": 0.0355, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3288, + "hfopenllm_v2/MMLU-PRO": 0.1583 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2335, + "hfopenllm_v2/BBH": 0.3198, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3276, + "hfopenllm_v2/MMLU-PRO": 0.1581 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2472, + "hfopenllm_v2/BBH": 0.3226, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1538 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2474, + "hfopenllm_v2/BBH": 0.3229, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3275, + "hfopenllm_v2/MMLU-PRO": 0.1539 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2403, + "hfopenllm_v2/BBH": 0.3245, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1573 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2368, + "hfopenllm_v2/BBH": 0.3224, + "hfopenllm_v2/MATH Level 5": 0.0461, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3355, + "hfopenllm_v2/MMLU-PRO": 0.1516 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2372, + "hfopenllm_v2/BBH": 0.3248, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3394, + "hfopenllm_v2/MMLU-PRO": 0.155 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2499, + "hfopenllm_v2/BBH": 0.3181, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3288, + "hfopenllm_v2/MMLU-PRO": 0.1574 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2381, + "hfopenllm_v2/BBH": 0.3242, + "hfopenllm_v2/MATH Level 5": 0.0498, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3328, + "hfopenllm_v2/MMLU-PRO": 0.1572 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2421, + "hfopenllm_v2/BBH": 0.3225, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3408, + "hfopenllm_v2/MMLU-PRO": 0.1496 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2381, + "hfopenllm_v2/BBH": 0.3265, + "hfopenllm_v2/MATH Level 5": 0.0446, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3408, + "hfopenllm_v2/MMLU-PRO": 0.1499 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2526, + "hfopenllm_v2/BBH": 0.3177, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3342, + "hfopenllm_v2/MMLU-PRO": 0.1572 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2457, + "hfopenllm_v2/BBH": 0.316, + "hfopenllm_v2/MATH Level 5": 0.0446, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3302, + "hfopenllm_v2/MMLU-PRO": 0.1572 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2442, + "hfopenllm_v2/BBH": 0.3194, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3315, + "hfopenllm_v2/MMLU-PRO": 0.1567 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2604, + "hfopenllm_v2/BBH": 0.3178, + "hfopenllm_v2/MATH Level 5": 0.0355, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3288, + "hfopenllm_v2/MMLU-PRO": 0.1567 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.249, + "hfopenllm_v2/BBH": 0.3173, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3302, + "hfopenllm_v2/MMLU-PRO": 0.1569 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2604, + "hfopenllm_v2/BBH": 0.315, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3342, + "hfopenllm_v2/MMLU-PRO": 0.1566 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.255, + "hfopenllm_v2/BBH": 0.3211, + "hfopenllm_v2/MATH Level 5": 0.0491, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3288, + "hfopenllm_v2/MMLU-PRO": 0.1571 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2478, + "hfopenllm_v2/BBH": 0.3198, + "hfopenllm_v2/MATH Level 5": 0.0423, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3315, + "hfopenllm_v2/MMLU-PRO": 0.1587 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2475, + "hfopenllm_v2/BBH": 0.3225, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3301, + "hfopenllm_v2/MMLU-PRO": 0.1556 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.259, + "hfopenllm_v2/BBH": 0.3185, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3275, + "hfopenllm_v2/MMLU-PRO": 0.1586 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2323, + "hfopenllm_v2/BBH": 0.3179, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1548 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2315, + "hfopenllm_v2/BBH": 0.326, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3383, + "hfopenllm_v2/MMLU-PRO": 0.1521 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2298, + "hfopenllm_v2/BBH": 0.332, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3329, + "hfopenllm_v2/MMLU-PRO": 0.1567 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2469, + "hfopenllm_v2/BBH": 0.3179, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3302, + "hfopenllm_v2/MMLU-PRO": 0.1575 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.252, + "hfopenllm_v2/BBH": 0.3168, + "hfopenllm_v2/MATH Level 5": 0.037, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3328, + "hfopenllm_v2/MMLU-PRO": 0.1576 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2666, + "hfopenllm_v2/BBH": 0.3191, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3289, + "hfopenllm_v2/MMLU-PRO": 0.1567 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2499, + "hfopenllm_v2/BBH": 0.3178, + "hfopenllm_v2/MATH Level 5": 0.037, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3341, + "hfopenllm_v2/MMLU-PRO": 0.1562 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2417, + "hfopenllm_v2/BBH": 0.3178, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3328, + "hfopenllm_v2/MMLU-PRO": 0.1575 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2562, + "hfopenllm_v2/BBH": 0.319, + "hfopenllm_v2/MATH Level 5": 0.0423, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3341, + "hfopenllm_v2/MMLU-PRO": 0.1576 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2408, + "hfopenllm_v2/BBH": 0.3165, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3315, + "hfopenllm_v2/MMLU-PRO": 0.1557 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2481, + "hfopenllm_v2/BBH": 0.3204, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3302, + "hfopenllm_v2/MMLU-PRO": 0.1592 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2545, + "hfopenllm_v2/BBH": 0.3186, + "hfopenllm_v2/MATH Level 5": 0.0498, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3289, + "hfopenllm_v2/MMLU-PRO": 0.1561 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.252, + "hfopenllm_v2/BBH": 0.3204, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1538 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2315, + "hfopenllm_v2/BBH": 0.3213, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3222, + "hfopenllm_v2/MMLU-PRO": 0.1582 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2515, + "hfopenllm_v2/BBH": 0.3187, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3289, + "hfopenllm_v2/MMLU-PRO": 0.1539 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2472, + "hfopenllm_v2/BBH": 0.3213, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1588 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.246, + "hfopenllm_v2/BBH": 0.3234, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3302, + "hfopenllm_v2/MMLU-PRO": 0.1533 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2524, + "hfopenllm_v2/BBH": 0.3256, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3368, + "hfopenllm_v2/MMLU-PRO": 0.1531 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2265, + "hfopenllm_v2/BBH": 0.3252, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1568 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2302, + "hfopenllm_v2/BBH": 0.3224, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.3408, + "hfopenllm_v2/MMLU-PRO": 0.15 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2524, + "hfopenllm_v2/BBH": 0.3278, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3395, + "hfopenllm_v2/MMLU-PRO": 0.1521 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2658, + "hfopenllm_v2/BBH": 0.3175, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3302, + "hfopenllm_v2/MMLU-PRO": 0.1575 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2487, + "hfopenllm_v2/BBH": 0.3189, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3275, + "hfopenllm_v2/MMLU-PRO": 0.1595 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.256, + "hfopenllm_v2/BBH": 0.3159, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.3275, + "hfopenllm_v2/MMLU-PRO": 0.1562 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2499, + "hfopenllm_v2/BBH": 0.3156, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3302, + "hfopenllm_v2/MMLU-PRO": 0.1556 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2496, + "hfopenllm_v2/BBH": 0.3177, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3315, + "hfopenllm_v2/MMLU-PRO": 0.1567 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2515, + "hfopenllm_v2/BBH": 0.3172, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3275, + "hfopenllm_v2/MMLU-PRO": 0.1553 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-3ep", + "name": "Qwen2.5-0.5B-SFT-2e-5-3ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2281, + "hfopenllm_v2/BBH": 0.324, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3301, + "hfopenllm_v2/MMLU-PRO": 0.1746 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep", + "name": "Qwen2.5-0.5B-SFT-2e-5-5ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2348, + "hfopenllm_v2/BBH": 0.3308, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3409, + "hfopenllm_v2/MMLU-PRO": 0.1695 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2526, + "hfopenllm_v2/BBH": 0.3238, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3528, + "hfopenllm_v2/MMLU-PRO": 0.1574 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep", + "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2481, + "hfopenllm_v2/BBH": 0.3175, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3475, + "hfopenllm_v2/MMLU-PRO": 0.1597 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep", + "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2548, + "hfopenllm_v2/BBH": 0.3199, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3435, + "hfopenllm_v2/MMLU-PRO": 0.1562 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam", + "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2423, + "hfopenllm_v2/BBH": 0.3219, + "hfopenllm_v2/MATH Level 5": 0.034, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3515, + "hfopenllm_v2/MMLU-PRO": 0.1563 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep", + "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2493, + "hfopenllm_v2/BBH": 0.3191, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3475, + "hfopenllm_v2/MMLU-PRO": 0.1592 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep", + "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2478, + "hfopenllm_v2/BBH": 0.3218, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3515, + "hfopenllm_v2/MMLU-PRO": 0.1556 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5", + "name": "Qwen2.5-0.5B-SFT-5e-5", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.201, + "hfopenllm_v2/BBH": 0.3109, + "hfopenllm_v2/MATH Level 5": 0.034, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3381, + "hfopenllm_v2/MMLU-PRO": 0.1672 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5-2ep", + "name": "Qwen2.5-0.5B-SFT-5e-5-2ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2175, + "hfopenllm_v2/BBH": 0.318, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3368, + "hfopenllm_v2/MMLU-PRO": 0.1627 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5-3ep", + "name": "Qwen2.5-0.5B-SFT-5e-5-3ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2199, + "hfopenllm_v2/BBH": 0.3297, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3593, + "hfopenllm_v2/MMLU-PRO": 0.1651 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5-5ep", + "name": "Qwen2.5-0.5B-SFT-5e-5-5ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2077, + "hfopenllm_v2/BBH": 0.3276, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3766, + "hfopenllm_v2/MMLU-PRO": 0.1587 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5", + "name": "Qwen2.5-0.5B-SFT-7e-5", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2093, + "hfopenllm_v2/BBH": 0.3158, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3367, + "hfopenllm_v2/MMLU-PRO": 0.1622 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5-2ep", + "name": "Qwen2.5-0.5B-SFT-7e-5-2ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2156, + "hfopenllm_v2/BBH": 0.31, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2424, + "hfopenllm_v2/MUSR": 0.3367, + "hfopenllm_v2/MMLU-PRO": 0.1567 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5-3ep", + "name": "Qwen2.5-0.5B-SFT-7e-5-3ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2381, + "hfopenllm_v2/BBH": 0.3199, + "hfopenllm_v2/MATH Level 5": 0.0332, + "hfopenllm_v2/GPQA": 0.2366, + "hfopenllm_v2/MUSR": 0.3554, + "hfopenllm_v2/MMLU-PRO": 0.1522 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5-5ep", + "name": "Qwen2.5-0.5B-SFT-7e-5-5ep", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.212, + "hfopenllm_v2/BBH": 0.32, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.3713, + "hfopenllm_v2/MMLU-PRO": 0.1628 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-DPO-1epoch_v1", + "name": "Qwen2.5-0.5B-SFT-DPO-1epoch_v1", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2025, + "hfopenllm_v2/BBH": 0.3268, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3209, + "hfopenllm_v2/MMLU-PRO": 0.133 + } + }, + { + "id": "JayHyeon/Qwen2.5-0.5B-SFT-MDPO-1epoch_v1", + "name": "Qwen2.5-0.5B-SFT-MDPO-1epoch_v1", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1964, + "hfopenllm_v2/BBH": 0.3293, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1337 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2532, + "hfopenllm_v2/BBH": 0.314, + "hfopenllm_v2/MATH Level 5": 0.0491, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3315, + "hfopenllm_v2/MMLU-PRO": 0.1566 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.267, + "hfopenllm_v2/BBH": 0.3189, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3288, + "hfopenllm_v2/MMLU-PRO": 0.1562 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2481, + "hfopenllm_v2/BBH": 0.3261, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3368, + "hfopenllm_v2/MMLU-PRO": 0.1565 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2383, + "hfopenllm_v2/BBH": 0.3218, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3342, + "hfopenllm_v2/MMLU-PRO": 0.1503 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2471, + "hfopenllm_v2/BBH": 0.3224, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3328, + "hfopenllm_v2/MMLU-PRO": 0.1533 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2447, + "hfopenllm_v2/BBH": 0.3181, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3341, + "hfopenllm_v2/MMLU-PRO": 0.1565 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2551, + "hfopenllm_v2/BBH": 0.3194, + "hfopenllm_v2/MATH Level 5": 0.0446, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1567 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2538, + "hfopenllm_v2/BBH": 0.3153, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3261, + "hfopenllm_v2/MMLU-PRO": 0.1583 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2402, + "hfopenllm_v2/BBH": 0.3168, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3328, + "hfopenllm_v2/MMLU-PRO": 0.1568 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2484, + "hfopenllm_v2/BBH": 0.3211, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3288, + "hfopenllm_v2/MMLU-PRO": 0.1573 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam", + "name": "Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2578, + "hfopenllm_v2/BBH": 0.3203, + "hfopenllm_v2/MATH Level 5": 0.0423, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3289, + "hfopenllm_v2/MMLU-PRO": 0.1583 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPO_1e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-DPO_1e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2316, + "hfopenllm_v2/BBH": 0.3258, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3221, + "hfopenllm_v2/MMLU-PRO": 0.158 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPO_1e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-DPO_1e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.236, + "hfopenllm_v2/BBH": 0.3225, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3222, + "hfopenllm_v2/MMLU-PRO": 0.1596 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPO_3e-6-1ep_0alp_0lam", + "name": "Qwen_0.5-DPO_3e-6-1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2337, + "hfopenllm_v2/BBH": 0.3132, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3235, + "hfopenllm_v2/MMLU-PRO": 0.1533 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPO_3e-6-2ep_0alp_0lam", + "name": "Qwen_0.5-DPO_3e-6-2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2569, + "hfopenllm_v2/BBH": 0.3276, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3156, + "hfopenllm_v2/MMLU-PRO": 0.1565 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPO_3e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-DPO_3e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.246, + "hfopenllm_v2/BBH": 0.3267, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3209, + "hfopenllm_v2/MMLU-PRO": 0.1543 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPO_3e-7-1ep_0alp_0lam", + "name": "Qwen_0.5-DPO_3e-7-1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2529, + "hfopenllm_v2/BBH": 0.3229, + "hfopenllm_v2/MATH Level 5": 0.0551, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3195, + "hfopenllm_v2/MMLU-PRO": 0.1597 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPO_3e-7-2ep_0alp_0lam", + "name": "Qwen_0.5-DPO_3e-7-2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2505, + "hfopenllm_v2/BBH": 0.3256, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3195, + "hfopenllm_v2/MMLU-PRO": 0.1599 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPO_3e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-DPO_3e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2387, + "hfopenllm_v2/BBH": 0.3258, + "hfopenllm_v2/MATH Level 5": 0.0446, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3169, + "hfopenllm_v2/MMLU-PRO": 0.1589 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPO_5e-7-1ep_0alp_0lam", + "name": "Qwen_0.5-DPO_5e-7-1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2532, + "hfopenllm_v2/BBH": 0.3218, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3209, + "hfopenllm_v2/MMLU-PRO": 0.1593 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPO_5e-7-2ep_0alp_0lam", + "name": "Qwen_0.5-DPO_5e-7-2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2456, + "hfopenllm_v2/BBH": 0.3299, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3181, + "hfopenllm_v2/MMLU-PRO": 0.1602 + } + }, + { + "id": "JayHyeon/Qwen_0.5-DPO_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-DPO_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2423, + "hfopenllm_v2/BBH": 0.3271, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3181, + "hfopenllm_v2/MMLU-PRO": 0.1595 + } + }, + { + "id": "JayHyeon/Qwen_0.5-IPO_5e-7-1ep_0alp_0lam", + "name": "Qwen_0.5-IPO_5e-7-1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2574, + "hfopenllm_v2/BBH": 0.3279, + "hfopenllm_v2/MATH Level 5": 0.0559, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3169, + "hfopenllm_v2/MMLU-PRO": 0.1651 + } + }, + { + "id": "JayHyeon/Qwen_0.5-IPO_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-IPO_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3072, + "hfopenllm_v2/BBH": 0.3264, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3156, + "hfopenllm_v2/MMLU-PRO": 0.1624 + } + }, + { + "id": "JayHyeon/Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2551, + "hfopenllm_v2/BBH": 0.3242, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3182, + "hfopenllm_v2/MMLU-PRO": 0.1574 + } + }, + { + "id": "JayHyeon/Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2636, + "hfopenllm_v2/BBH": 0.3198, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1586 + } + }, + { + "id": "JayHyeon/Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2323, + "hfopenllm_v2/BBH": 0.3255, + "hfopenllm_v2/MATH Level 5": 0.037, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3169, + "hfopenllm_v2/MMLU-PRO": 0.1612 + } + }, + { + "id": "JayHyeon/Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2414, + "hfopenllm_v2/BBH": 0.3314, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3342, + "hfopenllm_v2/MMLU-PRO": 0.1532 + } + }, + { + "id": "JayHyeon/Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2678, + "hfopenllm_v2/BBH": 0.3362, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3382, + "hfopenllm_v2/MMLU-PRO": 0.1561 + } + }, + { + "id": "JayHyeon/Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2561, + "hfopenllm_v2/BBH": 0.3231, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3196, + "hfopenllm_v2/MMLU-PRO": 0.1589 + } + }, + { + "id": "JayHyeon/Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2639, + "hfopenllm_v2/BBH": 0.3257, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3209, + "hfopenllm_v2/MMLU-PRO": 0.1587 + } + }, + { + "id": "JayHyeon/Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2518, + "hfopenllm_v2/BBH": 0.3214, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3169, + "hfopenllm_v2/MMLU-PRO": 0.1585 + } + }, + { + "id": "JayHyeon/Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2438, + "hfopenllm_v2/BBH": 0.3266, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3196, + "hfopenllm_v2/MMLU-PRO": 0.1554 + } + }, + { + "id": "JayHyeon/Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam", + "name": "Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2465, + "hfopenllm_v2/BBH": 0.3246, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3182, + "hfopenllm_v2/MMLU-PRO": 0.1563 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2506, + "hfopenllm_v2/BBH": 0.3261, + "hfopenllm_v2/MATH Level 5": 0.0498, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.3382, + "hfopenllm_v2/MMLU-PRO": 0.1522 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2457, + "hfopenllm_v2/BBH": 0.318, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3315, + "hfopenllm_v2/MMLU-PRO": 0.1566 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2454, + "hfopenllm_v2/BBH": 0.3216, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3382, + "hfopenllm_v2/MMLU-PRO": 0.1544 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2342, + "hfopenllm_v2/BBH": 0.3189, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3302, + "hfopenllm_v2/MMLU-PRO": 0.158 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.232, + "hfopenllm_v2/BBH": 0.3234, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3369, + "hfopenllm_v2/MMLU-PRO": 0.1543 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2418, + "hfopenllm_v2/BBH": 0.3175, + "hfopenllm_v2/MATH Level 5": 0.0423, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3288, + "hfopenllm_v2/MMLU-PRO": 0.158 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2493, + "hfopenllm_v2/BBH": 0.3197, + "hfopenllm_v2/MATH Level 5": 0.0423, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3315, + "hfopenllm_v2/MMLU-PRO": 0.1571 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.252, + "hfopenllm_v2/BBH": 0.3198, + "hfopenllm_v2/MATH Level 5": 0.0423, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1551 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.258, + "hfopenllm_v2/BBH": 0.3248, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3422, + "hfopenllm_v2/MMLU-PRO": 0.1539 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.232, + "hfopenllm_v2/BBH": 0.3265, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3395, + "hfopenllm_v2/MMLU-PRO": 0.1537 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2488, + "hfopenllm_v2/BBH": 0.3273, + "hfopenllm_v2/MATH Level 5": 0.0461, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3342, + "hfopenllm_v2/MMLU-PRO": 0.1531 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2524, + "hfopenllm_v2/BBH": 0.313, + "hfopenllm_v2/MATH Level 5": 0.0446, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3289, + "hfopenllm_v2/MMLU-PRO": 0.1564 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2514, + "hfopenllm_v2/BBH": 0.3221, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3315, + "hfopenllm_v2/MMLU-PRO": 0.1538 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2457, + "hfopenllm_v2/BBH": 0.318, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3275, + "hfopenllm_v2/MMLU-PRO": 0.1572 + } + }, + { + "id": "JayHyeon/Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2636, + "hfopenllm_v2/BBH": 0.3181, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3235, + "hfopenllm_v2/MMLU-PRO": 0.1574 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VDPO_3e-6-1ep_3vpo_const", + "name": "Qwen_0.5-VDPO_3e-6-1ep_3vpo_const", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2483, + "hfopenllm_v2/BBH": 0.3174, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3328, + "hfopenllm_v2/MMLU-PRO": 0.1558 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam", + "name": "Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2518, + "hfopenllm_v2/BBH": 0.3218, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3235, + "hfopenllm_v2/MMLU-PRO": 0.1595 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_10vpo_const", + "name": "Qwen_0.5-VDPO_5e-7-1ep_10vpo_const", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2536, + "hfopenllm_v2/BBH": 0.3234, + "hfopenllm_v2/MATH Level 5": 0.0491, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3236, + "hfopenllm_v2/MMLU-PRO": 0.1597 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_1vpo_const", + "name": "Qwen_0.5-VDPO_5e-7-1ep_1vpo_const", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2448, + "hfopenllm_v2/BBH": 0.324, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3249, + "hfopenllm_v2/MMLU-PRO": 0.1587 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_3vpo_const", + "name": "Qwen_0.5-VDPO_5e-7-1ep_3vpo_const", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2505, + "hfopenllm_v2/BBH": 0.3227, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3209, + "hfopenllm_v2/MMLU-PRO": 0.1589 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2472, + "hfopenllm_v2/BBH": 0.3255, + "hfopenllm_v2/MATH Level 5": 0.0498, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3208, + "hfopenllm_v2/MMLU-PRO": 0.1587 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_1vpo_const", + "name": "Qwen_0.5-VDPO_5e-7-3ep_1vpo_const", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2417, + "hfopenllm_v2/BBH": 0.3256, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3275, + "hfopenllm_v2/MMLU-PRO": 0.1562 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_3vpo_const", + "name": "Qwen_0.5-VDPO_5e-7-3ep_3vpo_const", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2527, + "hfopenllm_v2/BBH": 0.3235, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.3235, + "hfopenllm_v2/MMLU-PRO": 0.158 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam", + "name": "Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2669, + "hfopenllm_v2/BBH": 0.3314, + "hfopenllm_v2/MATH Level 5": 0.071, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3168, + "hfopenllm_v2/MMLU-PRO": 0.1634 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_10vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-1ep_10vpo_const", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2702, + "hfopenllm_v2/BBH": 0.33, + "hfopenllm_v2/MATH Level 5": 0.074, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3208, + "hfopenllm_v2/MMLU-PRO": 0.1635 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_1vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-1ep_1vpo_const", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.248, + "hfopenllm_v2/BBH": 0.3309, + "hfopenllm_v2/MATH Level 5": 0.068, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3208, + "hfopenllm_v2/MMLU-PRO": 0.1649 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_30vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-1ep_30vpo_const", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2622, + "hfopenllm_v2/BBH": 0.3282, + "hfopenllm_v2/MATH Level 5": 0.074, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3221, + "hfopenllm_v2/MMLU-PRO": 0.1634 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_3vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-1ep_3vpo_const", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2609, + "hfopenllm_v2/BBH": 0.3298, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3168, + "hfopenllm_v2/MMLU-PRO": 0.1651 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam", + "name": "Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.293, + "hfopenllm_v2/BBH": 0.322, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3116, + "hfopenllm_v2/MMLU-PRO": 0.1591 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_10vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-3ep_10vpo_const", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2881, + "hfopenllm_v2/BBH": 0.3255, + "hfopenllm_v2/MATH Level 5": 0.0725, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3102, + "hfopenllm_v2/MMLU-PRO": 0.1582 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_1vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-3ep_1vpo_const", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2887, + "hfopenllm_v2/BBH": 0.3237, + "hfopenllm_v2/MATH Level 5": 0.0748, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3142, + "hfopenllm_v2/MMLU-PRO": 0.1609 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_30vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-3ep_30vpo_const", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2905, + "hfopenllm_v2/BBH": 0.3254, + "hfopenllm_v2/MATH Level 5": 0.077, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3129, + "hfopenllm_v2/MMLU-PRO": 0.1574 + } + }, + { + "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_3vpo_const", + "name": "Qwen_0.5-VIPO_5e-7-3ep_3vpo_const", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2905, + "hfopenllm_v2/BBH": 0.3238, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3089, + "hfopenllm_v2/MMLU-PRO": 0.1592 + } + }, + { + "id": "JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1", + "name": "Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2393, + "hfopenllm_v2/BBH": 0.3244, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3222, + "hfopenllm_v2/MMLU-PRO": 0.1573 + } + }, + { + "id": "JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3", + "name": "Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2475, + "hfopenllm_v2/BBH": 0.3209, + "hfopenllm_v2/MATH Level 5": 0.0461, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3275, + "hfopenllm_v2/MMLU-PRO": 0.1567 + } + }, + { + "id": "JayHyeon/Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1", + "name": "Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2321, + "hfopenllm_v2/BBH": 0.3278, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3022, + "hfopenllm_v2/MMLU-PRO": 0.1496 + } + }, + { + "id": "JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1", + "name": "Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2542, + "hfopenllm_v2/BBH": 0.3253, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3181, + "hfopenllm_v2/MMLU-PRO": 0.1609 + } + }, + { + "id": "JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3", + "name": "Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3", + "developer": "JayHyeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2739, + "hfopenllm_v2/BBH": 0.3245, + "hfopenllm_v2/MATH Level 5": 0.0461, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3089, + "hfopenllm_v2/MMLU-PRO": 0.1597 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Jimmy19991222.json b/data/developers/Jimmy19991222.json new file mode 100644 index 0000000000000000000000000000000000000000..4571753e45458ab5f557c1cd9256b52d5c151a78 --- /dev/null +++ b/data/developers/Jimmy19991222.json @@ -0,0 +1,117 @@ +{ + "developer": "Jimmy19991222", + "models": [ + { + "id": "Jimmy19991222/Llama-3-Instruct-8B-SimPO-v0.2", + "name": "Llama-3-Instruct-8B-SimPO-v0.2", + "developer": "Jimmy19991222", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.654, + "hfopenllm_v2/BBH": 0.4984, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4013, + "hfopenllm_v2/MMLU-PRO": 0.3686 + } + }, + { + "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun", + "name": "llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun", + "developer": "Jimmy19991222", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6717, + "hfopenllm_v2/BBH": 0.488, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4041, + "hfopenllm_v2/MMLU-PRO": 0.3634 + } + }, + { + "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log", + "name": "llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log", + "developer": "Jimmy19991222", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6556, + "hfopenllm_v2/BBH": 0.4935, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4, + "hfopenllm_v2/MMLU-PRO": 0.3658 + } + }, + { + "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log", + "name": "llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log", + "developer": "Jimmy19991222", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6315, + "hfopenllm_v2/BBH": 0.4916, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.3935, + "hfopenllm_v2/MMLU-PRO": 0.3611 + } + }, + { + "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4", + "name": "llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4", + "developer": "Jimmy19991222", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6285, + "hfopenllm_v2/BBH": 0.4986, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4014, + "hfopenllm_v2/MMLU-PRO": 0.3545 + } + }, + { + "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun", + "name": "llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun", + "developer": "Jimmy19991222", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6678, + "hfopenllm_v2/BBH": 0.494, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.3987, + "hfopenllm_v2/MMLU-PRO": 0.3658 + } + }, + { + "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log", + "name": "llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log", + "developer": "Jimmy19991222", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6605, + "hfopenllm_v2/BBH": 0.4916, + "hfopenllm_v2/MATH Level 5": 0.0657, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4, + "hfopenllm_v2/MMLU-PRO": 0.3664 + } + }, + { + "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log", + "name": "llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log", + "developer": "Jimmy19991222", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6492, + "hfopenllm_v2/BBH": 0.4952, + "hfopenllm_v2/MATH Level 5": 0.0642, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.3961, + "hfopenllm_v2/MMLU-PRO": 0.3711 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Joseph717171.json b/data/developers/Joseph717171.json new file mode 100644 index 0000000000000000000000000000000000000000..6363ccc8ae26d06a2ddb252774113b997a3919cc --- /dev/null +++ b/data/developers/Joseph717171.json @@ -0,0 +1,33 @@ +{ + "developer": "Joseph717171", + "models": [ + { + "id": "Joseph717171/Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32", + "name": "Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32", + "developer": "Joseph717171", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6185, + "hfopenllm_v2/BBH": 0.5177, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4369, + "hfopenllm_v2/MMLU-PRO": 0.3144 + } + }, + { + "id": "Joseph717171/Llama-3.1-SuperNova-8B-Lite_TIES_with_Base", + "name": "Llama-3.1-SuperNova-8B-Lite_TIES_with_Base", + "developer": "Joseph717171", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8096, + "hfopenllm_v2/BBH": 0.5147, + "hfopenllm_v2/MATH Level 5": 0.1835, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.411, + "hfopenllm_v2/MMLU-PRO": 0.388 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Josephgflowers.json b/data/developers/Josephgflowers.json new file mode 100644 index 0000000000000000000000000000000000000000..3042f8c46bdce75e6f71b9200fc2b6cb3910ba14 --- /dev/null +++ b/data/developers/Josephgflowers.json @@ -0,0 +1,103 @@ +{ + "developer": "Josephgflowers", + "models": [ + { + "id": "Josephgflowers/Cinder-Phi-2-V1-F16-gguf", + "name": "Cinder-Phi-2-V1-F16-gguf", + "developer": "Josephgflowers", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2357, + "hfopenllm_v2/BBH": 0.4397, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.3435, + "hfopenllm_v2/MMLU-PRO": 0.2161 + } + }, + { + "id": "Josephgflowers/Differential-Attention-Liquid-Metal-Tinyllama", + "name": "Differential-Attention-Liquid-Metal-Tinyllama", + "developer": "Josephgflowers", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2227, + "hfopenllm_v2/BBH": 0.2926, + "hfopenllm_v2/MATH Level 5": 0.0325, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3356, + "hfopenllm_v2/MMLU-PRO": 0.1214 + } + }, + { + "id": "Josephgflowers/TinyLlama-Cinder-Agent-v1", + "name": "TinyLlama-Cinder-Agent-v1", + "developer": "Josephgflowers", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.267, + "hfopenllm_v2/BBH": 0.3116, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.2441, + "hfopenllm_v2/MUSR": 0.3395, + "hfopenllm_v2/MMLU-PRO": 0.1161 + } + }, + { + "id": "Josephgflowers/TinyLlama-v1.1-Cinders-World", + "name": "TinyLlama-v1.1-Cinders-World", + "developer": "Josephgflowers", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2469, + "hfopenllm_v2/BBH": 0.2998, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.2441, + "hfopenllm_v2/MUSR": 0.3356, + "hfopenllm_v2/MMLU-PRO": 0.1198 + } + }, + { + "id": "Josephgflowers/TinyLlama_v1.1_math_code-world-test-1", + "name": "TinyLlama_v1.1_math_code-world-test-1", + "developer": "Josephgflowers", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0078, + "hfopenllm_v2/BBH": 0.3146, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2341, + "hfopenllm_v2/MUSR": 0.3499, + "hfopenllm_v2/MMLU-PRO": 0.1132 + } + }, + { + "id": "Josephgflowers/Tinyllama-STEM-Cinder-Agent-v1", + "name": "Tinyllama-STEM-Cinder-Agent-v1", + "developer": "Josephgflowers", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2126, + "hfopenllm_v2/BBH": 0.3084, + "hfopenllm_v2/MATH Level 5": 0.0672, + "hfopenllm_v2/GPQA": 0.2349, + "hfopenllm_v2/MUSR": 0.3341, + "hfopenllm_v2/MMLU-PRO": 0.1086 + } + }, + { + "id": "Josephgflowers/Tinyllama-r1", + "name": "Tinyllama-r1", + "developer": "Josephgflowers", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2119, + "hfopenllm_v2/BBH": 0.3015, + "hfopenllm_v2/MATH Level 5": 0.0325, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3315, + "hfopenllm_v2/MMLU-PRO": 0.1134 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/JungZoona.json b/data/developers/JungZoona.json new file mode 100644 index 0000000000000000000000000000000000000000..ac06e3f8c40f69e4c17995c20faab8440fb2647a --- /dev/null +++ b/data/developers/JungZoona.json @@ -0,0 +1,33 @@ +{ + "developer": "JungZoona", + "models": [ + { + "id": "JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3", + "name": "T3Q-Qwen2.5-14B-Instruct-1M-e3", + "developer": "JungZoona", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7324, + "hfopenllm_v2/BBH": 0.7586, + "hfopenllm_v2/MATH Level 5": 0.2863, + "hfopenllm_v2/GPQA": 0.4169, + "hfopenllm_v2/MUSR": 0.5911, + "hfopenllm_v2/MMLU-PRO": 0.5884 + } + }, + { + "id": "JungZoona/T3Q-qwen2.5-14b-v1.0-e3", + "name": "T3Q-qwen2.5-14b-v1.0-e3", + "developer": "JungZoona", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7324, + "hfopenllm_v2/BBH": 0.7586, + "hfopenllm_v2/MATH Level 5": 0.2863, + "hfopenllm_v2/GPQA": 0.4169, + "hfopenllm_v2/MUSR": 0.5911, + "hfopenllm_v2/MMLU-PRO": 0.5884 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Junhoee.json b/data/developers/Junhoee.json new file mode 100644 index 0000000000000000000000000000000000000000..2b0c4f097bf8740d6939a8ab53509651fb64fb47 --- /dev/null +++ b/data/developers/Junhoee.json @@ -0,0 +1,19 @@ +{ + "developer": "Junhoee", + "models": [ + { + "id": "Junhoee/Qwen-Megumin", + "name": "Qwen-Megumin", + "developer": "Junhoee", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7141, + "hfopenllm_v2/BBH": 0.5285, + "hfopenllm_v2/MATH Level 5": 0.4902, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.398, + "hfopenllm_v2/MMLU-PRO": 0.4199 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/KSU-HW-SEC.json b/data/developers/KSU-HW-SEC.json new file mode 100644 index 0000000000000000000000000000000000000000..7df9b6de5475b7da7c530ee8d3cdfc4b8b99d1f6 --- /dev/null +++ b/data/developers/KSU-HW-SEC.json @@ -0,0 +1,61 @@ +{ + "developer": "KSU-HW-SEC", + "models": [ + { + "id": "KSU-HW-SEC/Llama3-70b-SVA-FT-1415", + "name": "Llama3-70b-SVA-FT-1415", + "developer": "KSU-HW-SEC", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.618, + "hfopenllm_v2/BBH": 0.665, + "hfopenllm_v2/MATH Level 5": 0.2198, + "hfopenllm_v2/GPQA": 0.375, + "hfopenllm_v2/MUSR": 0.4565, + "hfopenllm_v2/MMLU-PRO": 0.5243 + } + }, + { + "id": "KSU-HW-SEC/Llama3-70b-SVA-FT-500", + "name": "Llama3-70b-SVA-FT-500", + "developer": "KSU-HW-SEC", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6105, + "hfopenllm_v2/BBH": 0.6692, + "hfopenllm_v2/MATH Level 5": 0.2137, + "hfopenllm_v2/GPQA": 0.3809, + "hfopenllm_v2/MUSR": 0.4511, + "hfopenllm_v2/MMLU-PRO": 0.5227 + } + }, + { + "id": "KSU-HW-SEC/Llama3-70b-SVA-FT-final", + "name": "Llama3-70b-SVA-FT-final", + "developer": "KSU-HW-SEC", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6165, + "hfopenllm_v2/BBH": 0.665, + "hfopenllm_v2/MATH Level 5": 0.2198, + "hfopenllm_v2/GPQA": 0.375, + "hfopenllm_v2/MUSR": 0.4565, + "hfopenllm_v2/MMLU-PRO": 0.5243 + } + }, + { + "id": "KSU-HW-SEC/Llama3.1-70b-SVA-FT-1000step", + "name": "Llama3.1-70b-SVA-FT-1000step", + "developer": "KSU-HW-SEC", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7238, + "hfopenllm_v2/BBH": 0.6903, + "hfopenllm_v2/MATH Level 5": 0.321, + "hfopenllm_v2/GPQA": 0.396, + "hfopenllm_v2/MUSR": 0.4592, + "hfopenllm_v2/MMLU-PRO": 0.5252 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Khetterman.json b/data/developers/Khetterman.json new file mode 100644 index 0000000000000000000000000000000000000000..b0a72c659418ec6373be438380414d0ba3eea3ce --- /dev/null +++ b/data/developers/Khetterman.json @@ -0,0 +1,33 @@ +{ + "developer": "Khetterman", + "models": [ + { + "id": "Khetterman/DarkAtom-12B-v3", + "name": "DarkAtom-12B-v3", + "developer": "Khetterman", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6173, + "hfopenllm_v2/BBH": 0.5154, + "hfopenllm_v2/MATH Level 5": 0.111, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4468, + "hfopenllm_v2/MMLU-PRO": 0.3546 + } + }, + { + "id": "Khetterman/Kosmos-8B-v1", + "name": "Kosmos-8B-v1", + "developer": "Khetterman", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4129, + "hfopenllm_v2/BBH": 0.5234, + "hfopenllm_v2/MATH Level 5": 0.0989, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.3919, + "hfopenllm_v2/MMLU-PRO": 0.3669 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Kimargin.json b/data/developers/Kimargin.json new file mode 100644 index 0000000000000000000000000000000000000000..f2d25e0bd9e97f1a9b943f28a485582a78d710a4 --- /dev/null +++ b/data/developers/Kimargin.json @@ -0,0 +1,19 @@ +{ + "developer": "Kimargin", + "models": [ + { + "id": "Kimargin/GPT-NEO-1.3B-wiki", + "name": "GPT-NEO-1.3B-wiki", + "developer": "Kimargin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1921, + "hfopenllm_v2/BBH": 0.3026, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.245, + "hfopenllm_v2/MUSR": 0.3883, + "hfopenllm_v2/MMLU-PRO": 0.1099 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Kimi.json b/data/developers/Kimi.json new file mode 100644 index 0000000000000000000000000000000000000000..1a69217fe4b32f418bb2d3f94f33e9c6502c5cfe --- /dev/null +++ b/data/developers/Kimi.json @@ -0,0 +1,14 @@ +{ + "developer": "Kimi", + "models": [ + { + "id": "moonshot-ai/kimi-k2.5", + "name": "Kimi K2.5", + "developer": "Kimi", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 43.2 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/KingNish.json b/data/developers/KingNish.json new file mode 100644 index 0000000000000000000000000000000000000000..c0ab6d68267c1282cfff3f78deff091ce1a266cb --- /dev/null +++ b/data/developers/KingNish.json @@ -0,0 +1,103 @@ +{ + "developer": "KingNish", + "models": [ + { + "id": "KingNish/Qwen2.5-0.5b-Test-ft", + "name": "Qwen2.5-0.5b-Test-ft", + "developer": "KingNish", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2671, + "hfopenllm_v2/BBH": 0.3232, + "hfopenllm_v2/MATH Level 5": 0.0355, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3421, + "hfopenllm_v2/MMLU-PRO": 0.1689 + } + }, + { + "id": "KingNish/Reasoning-0.5b", + "name": "Reasoning-0.5b", + "developer": "KingNish", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2174, + "hfopenllm_v2/BBH": 0.3354, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3513, + "hfopenllm_v2/MMLU-PRO": 0.1641 + } + }, + { + "id": "KingNish/Reasoning-Llama-3b-v0.1", + "name": "Reasoning-Llama-3b-v0.1", + "developer": "KingNish", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6225, + "hfopenllm_v2/BBH": 0.4343, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3168, + "hfopenllm_v2/MMLU-PRO": 0.3029 + } + }, + { + "id": "KingNish/qwen-1b-continued", + "name": "qwen-1b-continued", + "developer": "KingNish", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1255, + "hfopenllm_v2/BBH": 0.2991, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3859, + "hfopenllm_v2/MMLU-PRO": 0.1261 + } + }, + { + "id": "KingNish/qwen-1b-continued-v2", + "name": "qwen-1b-continued-v2", + "developer": "KingNish", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1579, + "hfopenllm_v2/BBH": 0.3119, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3393, + "hfopenllm_v2/MMLU-PRO": 0.1193 + } + }, + { + "id": "KingNish/qwen-1b-continued-v2.1", + "name": "qwen-1b-continued-v2.1", + "developer": "KingNish", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1127, + "hfopenllm_v2/BBH": 0.3042, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.4154, + "hfopenllm_v2/MMLU-PRO": 0.1278 + } + }, + { + "id": "KingNish/qwen-1b-continued-v2.2", + "name": "qwen-1b-continued-v2.2", + "developer": "KingNish", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1413, + "hfopenllm_v2/BBH": 0.3059, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3513, + "hfopenllm_v2/MMLU-PRO": 0.1262 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Kquant03.json b/data/developers/Kquant03.json new file mode 100644 index 0000000000000000000000000000000000000000..580eddb7701d9eab6d1c5bcff0df7d5b7e676429 --- /dev/null +++ b/data/developers/Kquant03.json @@ -0,0 +1,33 @@ +{ + "developer": "Kquant03", + "models": [ + { + "id": "Kquant03/CognitiveFusion2-4x7B-BF16", + "name": "CognitiveFusion2-4x7B-BF16", + "developer": "Kquant03", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3567, + "hfopenllm_v2/BBH": 0.4108, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4146, + "hfopenllm_v2/MMLU-PRO": 0.2793 + } + }, + { + "id": "Kquant03/L3-Pneuma-8B", + "name": "L3-Pneuma-8B", + "developer": "Kquant03", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2374, + "hfopenllm_v2/BBH": 0.4955, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4172, + "hfopenllm_v2/MMLU-PRO": 0.3184 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Krystalan.json b/data/developers/Krystalan.json new file mode 100644 index 0000000000000000000000000000000000000000..9353bf118687519f6b00451224b1ff4eba08729e --- /dev/null +++ b/data/developers/Krystalan.json @@ -0,0 +1,33 @@ +{ + "developer": "Krystalan", + "models": [ + { + "id": "Krystalan/DRT-o1-14B", + "name": "DRT-o1-14B", + "developer": "Krystalan", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4068, + "hfopenllm_v2/BBH": 0.6379, + "hfopenllm_v2/MATH Level 5": 0.4826, + "hfopenllm_v2/GPQA": 0.3523, + "hfopenllm_v2/MUSR": 0.4795, + "hfopenllm_v2/MMLU-PRO": 0.5179 + } + }, + { + "id": "Krystalan/DRT-o1-7B", + "name": "DRT-o1-7B", + "developer": "Krystalan", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3928, + "hfopenllm_v2/BBH": 0.5468, + "hfopenllm_v2/MATH Level 5": 0.4479, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.5087, + "hfopenllm_v2/MMLU-PRO": 0.4151 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Kuaishou.json b/data/developers/Kuaishou.json new file mode 100644 index 0000000000000000000000000000000000000000..511550dd6f527bdec6ac47d201b5a3d53f3a57a9 --- /dev/null +++ b/data/developers/Kuaishou.json @@ -0,0 +1,16 @@ +{ + "developer": "Kuaishou", + "models": [ + { + "id": "kuaishou/kwaipilot-40b-0604", + "name": "kwaipilot-40b-0604", + "developer": "Kuaishou", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.07042253521126761, + "livecodebenchpro/Easy Problems": 0.056338028169014086 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Kukedlc.json b/data/developers/Kukedlc.json new file mode 100644 index 0000000000000000000000000000000000000000..9c255dac8c9fb300c36fff0fb75c54544e7edc7c --- /dev/null +++ b/data/developers/Kukedlc.json @@ -0,0 +1,103 @@ +{ + "developer": "Kukedlc", + "models": [ + { + "id": "Kukedlc/NeuralExperiment-7b-MagicCoder-v7.5", + "name": "NeuralExperiment-7b-MagicCoder-v7.5", + "developer": "Kukedlc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4553, + "hfopenllm_v2/BBH": 0.3988, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4282, + "hfopenllm_v2/MMLU-PRO": 0.2824 + } + }, + { + "id": "Kukedlc/NeuralLLaMa-3-8b-DT-v0.1", + "name": "NeuralLLaMa-3-8b-DT-v0.1", + "developer": "Kukedlc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4371, + "hfopenllm_v2/BBH": 0.4987, + "hfopenllm_v2/MATH Level 5": 0.0808, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4071, + "hfopenllm_v2/MMLU-PRO": 0.3792 + } + }, + { + "id": "Kukedlc/NeuralLLaMa-3-8b-ORPO-v0.3", + "name": "NeuralLLaMa-3-8b-ORPO-v0.3", + "developer": "Kukedlc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5276, + "hfopenllm_v2/BBH": 0.4557, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.2391, + "hfopenllm_v2/MUSR": 0.37, + "hfopenllm_v2/MMLU-PRO": 0.3057 + } + }, + { + "id": "Kukedlc/NeuralSynthesis-7B-v0.1", + "name": "NeuralSynthesis-7B-v0.1", + "developer": "Kukedlc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4185, + "hfopenllm_v2/BBH": 0.5145, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4333, + "hfopenllm_v2/MMLU-PRO": 0.3049 + } + }, + { + "id": "Kukedlc/NeuralSynthesis-7B-v0.3", + "name": "NeuralSynthesis-7B-v0.3", + "developer": "Kukedlc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4078, + "hfopenllm_v2/BBH": 0.5138, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.4346, + "hfopenllm_v2/MMLU-PRO": 0.305 + } + }, + { + "id": "Kukedlc/NeuralSynthesis-7b-v0.4-slerp", + "name": "NeuralSynthesis-7b-v0.4-slerp", + "developer": "Kukedlc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3947, + "hfopenllm_v2/BBH": 0.5143, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4332, + "hfopenllm_v2/MMLU-PRO": 0.3043 + } + }, + { + "id": "Kukedlc/Qwen-2.5-7b-Spanish-o1-CoT", + "name": "Qwen-2.5-7b-Spanish-o1-CoT", + "developer": "Kukedlc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.421, + "hfopenllm_v2/BBH": 0.5602, + "hfopenllm_v2/MATH Level 5": 0.2727, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4777, + "hfopenllm_v2/MMLU-PRO": 0.4363 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Kumar955.json b/data/developers/Kumar955.json new file mode 100644 index 0000000000000000000000000000000000000000..d7d571dddaed7e8c2351e382dd4b2e6d4c6dac3b --- /dev/null +++ b/data/developers/Kumar955.json @@ -0,0 +1,19 @@ +{ + "developer": "Kumar955", + "models": [ + { + "id": "Kumar955/Hemanth-llm", + "name": "Hemanth-llm", + "developer": "Kumar955", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5045, + "hfopenllm_v2/BBH": 0.5225, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4486, + "hfopenllm_v2/MMLU-PRO": 0.3113 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/L-RAGE.json b/data/developers/L-RAGE.json new file mode 100644 index 0000000000000000000000000000000000000000..783915f8ad1bd3861f6eccd4ac963b0f1d17fab5 --- /dev/null +++ b/data/developers/L-RAGE.json @@ -0,0 +1,19 @@ +{ + "developer": "L-RAGE", + "models": [ + { + "id": "L-RAGE/3_PRYMMAL-ECE-7B-SLERP-V1", + "name": "3_PRYMMAL-ECE-7B-SLERP-V1", + "developer": "L-RAGE", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2742, + "hfopenllm_v2/BBH": 0.4228, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.3841, + "hfopenllm_v2/MMLU-PRO": 0.2925 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/LEESM.json b/data/developers/LEESM.json new file mode 100644 index 0000000000000000000000000000000000000000..cc5f95bada8da70e396ab3b966f6b04d7f03ead1 --- /dev/null +++ b/data/developers/LEESM.json @@ -0,0 +1,61 @@ +{ + "developer": "LEESM", + "models": [ + { + "id": "LEESM/llama-2-7b-hf-lora-oki100p", + "name": "llama-2-7b-hf-lora-oki100p", + "developer": "LEESM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2513, + "hfopenllm_v2/BBH": 0.3492, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3687, + "hfopenllm_v2/MMLU-PRO": 0.1856 + } + }, + { + "id": "LEESM/llama-2-7b-hf-lora-oki10p", + "name": "llama-2-7b-hf-lora-oki10p", + "developer": "LEESM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.227, + "hfopenllm_v2/BBH": 0.3531, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3475, + "hfopenllm_v2/MMLU-PRO": 0.1679 + } + }, + { + "id": "LEESM/llama-3-8b-bnb-4b-kowiki231101", + "name": "llama-3-8b-bnb-4b-kowiki231101", + "developer": "LEESM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1685, + "hfopenllm_v2/BBH": 0.4131, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3551, + "hfopenllm_v2/MMLU-PRO": 0.2425 + } + }, + { + "id": "LEESM/llama-3-Korean-Bllossom-8B-trexlab-oki10p", + "name": "llama-3-Korean-Bllossom-8B-trexlab-oki10p", + "developer": "LEESM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2137, + "hfopenllm_v2/BBH": 0.4343, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3869, + "hfopenllm_v2/MMLU-PRO": 0.3177 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/LGAI-EXAONE.json b/data/developers/LGAI-EXAONE.json new file mode 100644 index 0000000000000000000000000000000000000000..3ad004616133013115f31cdace4658aa65a9b3fb --- /dev/null +++ b/data/developers/LGAI-EXAONE.json @@ -0,0 +1,61 @@ +{ + "developer": "LGAI-EXAONE", + "models": [ + { + "id": "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", + "name": "EXAONE-3.0-7.8B-Instruct", + "developer": "LGAI-EXAONE", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7193, + "hfopenllm_v2/BBH": 0.4174, + "hfopenllm_v2/MATH Level 5": 0.3044, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3661, + "hfopenllm_v2/MMLU-PRO": 0.3577 + } + }, + { + "id": "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct", + "name": "EXAONE-3.5-2.4B-Instruct", + "developer": "LGAI-EXAONE", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.795, + "hfopenllm_v2/BBH": 0.4092, + "hfopenllm_v2/MATH Level 5": 0.3678, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3661, + "hfopenllm_v2/MMLU-PRO": 0.328 + } + }, + { + "id": "LGAI-EXAONE/EXAONE-3.5-32B-Instruct", + "name": "EXAONE-3.5-32B-Instruct", + "developer": "LGAI-EXAONE", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8392, + "hfopenllm_v2/BBH": 0.5761, + "hfopenllm_v2/MATH Level 5": 0.5128, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3807, + "hfopenllm_v2/MMLU-PRO": 0.4637 + } + }, + { + "id": "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct", + "name": "EXAONE-3.5-7.8B-Instruct", + "developer": "LGAI-EXAONE", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8136, + "hfopenllm_v2/BBH": 0.4728, + "hfopenllm_v2/MATH Level 5": 0.4751, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3779, + "hfopenllm_v2/MMLU-PRO": 0.4133 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/LLM360.json b/data/developers/LLM360.json new file mode 100644 index 0000000000000000000000000000000000000000..43e4b7d4754cc4528b7ea939ba53182d190b2ac5 --- /dev/null +++ b/data/developers/LLM360.json @@ -0,0 +1,33 @@ +{ + "developer": "LLM360", + "models": [ + { + "id": "LLM360/K2", + "name": "K2", + "developer": "LLM360", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2252, + "hfopenllm_v2/BBH": 0.4972, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.398, + "hfopenllm_v2/MMLU-PRO": 0.3004 + } + }, + { + "id": "LLM360/K2-Chat", + "name": "K2-Chat", + "developer": "LLM360", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5152, + "hfopenllm_v2/BBH": 0.5358, + "hfopenllm_v2/MATH Level 5": 0.1035, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.457, + "hfopenllm_v2/MMLU-PRO": 0.3371 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/LLM4Binary.json b/data/developers/LLM4Binary.json new file mode 100644 index 0000000000000000000000000000000000000000..e913f674439bcd63b70d593c0666708573616d83 --- /dev/null +++ b/data/developers/LLM4Binary.json @@ -0,0 +1,19 @@ +{ + "developer": "LLM4Binary", + "models": [ + { + "id": "LLM4Binary/llm4decompile-1.3b-v2", + "name": "llm4decompile-1.3b-v2", + "developer": "LLM4Binary", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2268, + "hfopenllm_v2/BBH": 0.3272, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2357, + "hfopenllm_v2/MUSR": 0.4072, + "hfopenllm_v2/MMLU-PRO": 0.1209 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Lambent.json b/data/developers/Lambent.json new file mode 100644 index 0000000000000000000000000000000000000000..01ac84d8a6579b3f404615e46c99ca996887260b --- /dev/null +++ b/data/developers/Lambent.json @@ -0,0 +1,19 @@ +{ + "developer": "Lambent", + "models": [ + { + "id": "Lambent/qwen2.5-reinstruct-alternate-lumen-14B", + "name": "qwen2.5-reinstruct-alternate-lumen-14B", + "developer": "Lambent", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4794, + "hfopenllm_v2/BBH": 0.6459, + "hfopenllm_v2/MATH Level 5": 0.4622, + "hfopenllm_v2/GPQA": 0.3767, + "hfopenllm_v2/MUSR": 0.477, + "hfopenllm_v2/MMLU-PRO": 0.5388 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Langboat.json b/data/developers/Langboat.json new file mode 100644 index 0000000000000000000000000000000000000000..929f700b20c79cc3b08c22fb8dd90fc5d30fc929 --- /dev/null +++ b/data/developers/Langboat.json @@ -0,0 +1,19 @@ +{ + "developer": "Langboat", + "models": [ + { + "id": "Langboat/Mengzi3-8B-Chat", + "name": "Mengzi3-8B-Chat", + "developer": "Langboat", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.514, + "hfopenllm_v2/BBH": 0.4684, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.4078, + "hfopenllm_v2/MMLU-PRO": 0.3142 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Lawnakk.json b/data/developers/Lawnakk.json new file mode 100644 index 0000000000000000000000000000000000000000..846e685409048252cceb41219ec4c1e0b6c2fe7a --- /dev/null +++ b/data/developers/Lawnakk.json @@ -0,0 +1,145 @@ +{ + "developer": "Lawnakk", + "models": [ + { + "id": "Lawnakk/BBA100", + "name": "BBA100", + "developer": "Lawnakk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2076, + "hfopenllm_v2/BBH": 0.2826, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2441, + "hfopenllm_v2/MUSR": 0.402, + "hfopenllm_v2/MMLU-PRO": 0.1122 + } + }, + { + "id": "Lawnakk/BBALAW1", + "name": "BBALAW1", + "developer": "Lawnakk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1905, + "hfopenllm_v2/BBH": 0.2872, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2433, + "hfopenllm_v2/MUSR": 0.4153, + "hfopenllm_v2/MMLU-PRO": 0.1121 + } + }, + { + "id": "Lawnakk/BBALAW1.0", + "name": "BBALAW1.0", + "developer": "Lawnakk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1351, + "hfopenllm_v2/BBH": 0.2828, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3526, + "hfopenllm_v2/MMLU-PRO": 0.1128 + } + }, + { + "id": "Lawnakk/BBALAW1.2", + "name": "BBALAW1.2", + "developer": "Lawnakk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1354, + "hfopenllm_v2/BBH": 0.2811, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3579, + "hfopenllm_v2/MMLU-PRO": 0.1105 + } + }, + { + "id": "Lawnakk/BBALAW1.3", + "name": "BBALAW1.3", + "developer": "Lawnakk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1354, + "hfopenllm_v2/BBH": 0.2827, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3619, + "hfopenllm_v2/MMLU-PRO": 0.1094 + } + }, + { + "id": "Lawnakk/BBALAW1.6", + "name": "BBALAW1.6", + "developer": "Lawnakk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5245, + "hfopenllm_v2/BBH": 0.5554, + "hfopenllm_v2/MATH Level 5": 0.3603, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4368, + "hfopenllm_v2/MMLU-PRO": 0.4507 + } + }, + { + "id": "Lawnakk/BBALAW1.61", + "name": "BBALAW1.61", + "developer": "Lawnakk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5771, + "hfopenllm_v2/BBH": 0.5549, + "hfopenllm_v2/MATH Level 5": 0.3663, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4355, + "hfopenllm_v2/MMLU-PRO": 0.4471 + } + }, + { + "id": "Lawnakk/BBALAW1.62", + "name": "BBALAW1.62", + "developer": "Lawnakk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5046, + "hfopenllm_v2/BBH": 0.5581, + "hfopenllm_v2/MATH Level 5": 0.2825, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4343, + "hfopenllm_v2/MMLU-PRO": 0.4545 + } + }, + { + "id": "Lawnakk/BBALAW1.63", + "name": "BBALAW1.63", + "developer": "Lawnakk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4407, + "hfopenllm_v2/BBH": 0.5541, + "hfopenllm_v2/MATH Level 5": 0.3701, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4303, + "hfopenllm_v2/MMLU-PRO": 0.4471 + } + }, + { + "id": "Lawnakk/BBALAW1.64", + "name": "BBALAW1.64", + "developer": "Lawnakk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1395, + "hfopenllm_v2/BBH": 0.2779, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.3447, + "hfopenllm_v2/MMLU-PRO": 0.1115 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/LenguajeNaturalAI.json b/data/developers/LenguajeNaturalAI.json new file mode 100644 index 0000000000000000000000000000000000000000..aeef2cf3b559975d29a9491a81b3abb61384a4d2 --- /dev/null +++ b/data/developers/LenguajeNaturalAI.json @@ -0,0 +1,33 @@ +{ + "developer": "LenguajeNaturalAI", + "models": [ + { + "id": "LenguajeNaturalAI/leniachat-gemma-2b-v0", + "name": "leniachat-gemma-2b-v0", + "developer": "LenguajeNaturalAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.215, + "hfopenllm_v2/BBH": 0.3074, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3659, + "hfopenllm_v2/MMLU-PRO": 0.117 + } + }, + { + "id": "LenguajeNaturalAI/leniachat-qwen2-1.5B-v0", + "name": "leniachat-qwen2-1.5B-v0", + "developer": "LenguajeNaturalAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2221, + "hfopenllm_v2/BBH": 0.3684, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.375, + "hfopenllm_v2/MMLU-PRO": 0.188 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/LeroyDyer.json b/data/developers/LeroyDyer.json new file mode 100644 index 0000000000000000000000000000000000000000..8cdfc71a1f6286ff1af50e7bd60834a9276ee515 --- /dev/null +++ b/data/developers/LeroyDyer.json @@ -0,0 +1,817 @@ +{ + "developer": "LeroyDyer", + "models": [ + { + "id": "LeroyDyer/CheckPoint_A", + "name": "CheckPoint_A", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4513, + "hfopenllm_v2/BBH": 0.4748, + "hfopenllm_v2/MATH Level 5": 0.0589, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4231, + "hfopenllm_v2/MMLU-PRO": 0.288 + } + }, + { + "id": "LeroyDyer/CheckPoint_B", + "name": "CheckPoint_B", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.444, + "hfopenllm_v2/BBH": 0.478, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3898, + "hfopenllm_v2/MMLU-PRO": 0.2907 + } + }, + { + "id": "LeroyDyer/CheckPoint_C", + "name": "CheckPoint_C", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3477, + "hfopenllm_v2/BBH": 0.4586, + "hfopenllm_v2/MATH Level 5": 0.0551, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.4346, + "hfopenllm_v2/MMLU-PRO": 0.3021 + } + }, + { + "id": "LeroyDyer/CheckPoint_R1", + "name": "CheckPoint_R1", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1728, + "hfopenllm_v2/BBH": 0.4225, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.4031, + "hfopenllm_v2/MMLU-PRO": 0.2205 + } + }, + { + "id": "LeroyDyer/LCARS_AI_001", + "name": "LCARS_AI_001", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3109, + "hfopenllm_v2/BBH": 0.4258, + "hfopenllm_v2/MATH Level 5": 0.0234, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.4384, + "hfopenllm_v2/MMLU-PRO": 0.267 + } + }, + { + "id": "LeroyDyer/LCARS_AI_1x4_003_SuperAI", + "name": "LCARS_AI_1x4_003_SuperAI", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4111, + "hfopenllm_v2/BBH": 0.492, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4506, + "hfopenllm_v2/MMLU-PRO": 0.2972 + } + }, + { + "id": "LeroyDyer/LCARS_AI_StarTrek_Computer", + "name": "LCARS_AI_StarTrek_Computer", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3583, + "hfopenllm_v2/BBH": 0.4446, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.395, + "hfopenllm_v2/MMLU-PRO": 0.2458 + } + }, + { + "id": "LeroyDyer/LCARS_TOP_SCORE", + "name": "LCARS_TOP_SCORE", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4371, + "hfopenllm_v2/BBH": 0.5127, + "hfopenllm_v2/MATH Level 5": 0.0672, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4293, + "hfopenllm_v2/MMLU-PRO": 0.3031 + } + }, + { + "id": "LeroyDyer/Mixtral_AI_SwahiliTron_7b", + "name": "Mixtral_AI_SwahiliTron_7b", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1534, + "hfopenllm_v2/BBH": 0.3055, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.342, + "hfopenllm_v2/MMLU-PRO": 0.1208 + } + }, + { + "id": "LeroyDyer/SpydazWebAI_Human_AGI", + "name": "SpydazWebAI_Human_AGI", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3388, + "hfopenllm_v2/BBH": 0.3375, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3966, + "hfopenllm_v2/MMLU-PRO": 0.1479 + } + }, + { + "id": "LeroyDyer/SpydazWebAI_Human_AGI_001", + "name": "SpydazWebAI_Human_AGI_001", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3118, + "hfopenllm_v2/BBH": 0.3433, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.3994, + "hfopenllm_v2/MMLU-PRO": 0.1426 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_CyberTron_Ultra_7b", + "name": "SpydazWeb_AI_CyberTron_Ultra_7b", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1556, + "hfopenllm_v2/BBH": 0.4811, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4136, + "hfopenllm_v2/MMLU-PRO": 0.2866 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAGI_001_M2", + "name": "SpydazWeb_AI_HumanAGI_001_M2", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.394, + "hfopenllm_v2/BBH": 0.4888, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4503, + "hfopenllm_v2/MMLU-PRO": 0.3005 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAGI_002", + "name": "SpydazWeb_AI_HumanAGI_002", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4088, + "hfopenllm_v2/BBH": 0.5044, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4865, + "hfopenllm_v2/MMLU-PRO": 0.3059 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAI_001", + "name": "SpydazWeb_AI_HumanAI_001", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2252, + "hfopenllm_v2/BBH": 0.3344, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.386, + "hfopenllm_v2/MMLU-PRO": 0.1271 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAI_006", + "name": "SpydazWeb_AI_HumanAI_006", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.143, + "hfopenllm_v2/BBH": 0.3302, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3568, + "hfopenllm_v2/MMLU-PRO": 0.1135 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAI_007", + "name": "SpydazWeb_AI_HumanAI_007", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3352, + "hfopenllm_v2/BBH": 0.3416, + "hfopenllm_v2/MATH Level 5": 0.0227, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4096, + "hfopenllm_v2/MMLU-PRO": 0.1352 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAI_009_CHAT", + "name": "SpydazWeb_AI_HumanAI_009_CHAT", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2973, + "hfopenllm_v2/BBH": 0.3307, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4138, + "hfopenllm_v2/MMLU-PRO": 0.1433 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAI_010_CHAT", + "name": "SpydazWeb_AI_HumanAI_010_CHAT", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2507, + "hfopenllm_v2/BBH": 0.3336, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.4137, + "hfopenllm_v2/MMLU-PRO": 0.143 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT", + "name": "SpydazWeb_AI_HumanAI_011_INSTRUCT", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3149, + "hfopenllm_v2/BBH": 0.3523, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3831, + "hfopenllm_v2/MMLU-PRO": 0.1595 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML", + "name": "SpydazWeb_AI_HumanAI_011_INSTRUCT_ML", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3752, + "hfopenllm_v2/BBH": 0.3984, + "hfopenllm_v2/MATH Level 5": 0.0257, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4239, + "hfopenllm_v2/MMLU-PRO": 0.2019 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1", + "name": "SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.405, + "hfopenllm_v2/BBH": 0.4858, + "hfopenllm_v2/MATH Level 5": 0.0551, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3921, + "hfopenllm_v2/MMLU-PRO": 0.2956 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA", + "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_IA", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3066, + "hfopenllm_v2/BBH": 0.4577, + "hfopenllm_v2/MATH Level 5": 0.0446, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4254, + "hfopenllm_v2/MMLU-PRO": 0.2318 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_MX", + "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_MX", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3066, + "hfopenllm_v2/BBH": 0.3158, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.3444, + "hfopenllm_v2/MMLU-PRO": 0.1107 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA", + "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_XA", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3798, + "hfopenllm_v2/BBH": 0.4483, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4148, + "hfopenllm_v2/MMLU-PRO": 0.2389 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAI_RP", + "name": "SpydazWeb_AI_HumanAI_RP", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2541, + "hfopenllm_v2/BBH": 0.3323, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3883, + "hfopenllm_v2/MMLU-PRO": 0.1324 + } + }, + { + "id": "LeroyDyer/SpydazWeb_AI_HumanAI_TextVision", + "name": "SpydazWeb_AI_HumanAI_TextVision", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3063, + "hfopenllm_v2/BBH": 0.3354, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3938, + "hfopenllm_v2/MMLU-PRO": 0.1387 + } + }, + { + "id": "LeroyDyer/SpydazWeb_HumanAI_M1", + "name": "SpydazWeb_HumanAI_M1", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3582, + "hfopenllm_v2/BBH": 0.3563, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3671, + "hfopenllm_v2/MMLU-PRO": 0.1663 + } + }, + { + "id": "LeroyDyer/SpydazWeb_HumanAI_M2", + "name": "SpydazWeb_HumanAI_M2", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.375, + "hfopenllm_v2/BBH": 0.3931, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3751, + "hfopenllm_v2/MMLU-PRO": 0.201 + } + }, + { + "id": "LeroyDyer/SpydazWeb_HumanAI_M3", + "name": "SpydazWeb_HumanAI_M3", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1579, + "hfopenllm_v2/BBH": 0.3127, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3914, + "hfopenllm_v2/MMLU-PRO": 0.1149 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_12", + "name": "_Spydaz_Web_AI_12", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2765, + "hfopenllm_v2/BBH": 0.3163, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3582, + "hfopenllm_v2/MMLU-PRO": 0.1137 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_14", + "name": "_Spydaz_Web_AI_14", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1812, + "hfopenllm_v2/BBH": 0.2989, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3395, + "hfopenllm_v2/MMLU-PRO": 0.1139 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_001", + "name": "_Spydaz_Web_AI_AGI_R1_001", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4505, + "hfopenllm_v2/BBH": 0.4609, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.4256, + "hfopenllm_v2/MMLU-PRO": 0.2734 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_002", + "name": "_Spydaz_Web_AI_AGI_R1_002", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5307, + "hfopenllm_v2/BBH": 0.4683, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.4255, + "hfopenllm_v2/MMLU-PRO": 0.2894 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_MUSR", + "name": "_Spydaz_Web_AI_AGI_R1_MUSR", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4786, + "hfopenllm_v2/BBH": 0.4672, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4869, + "hfopenllm_v2/MMLU-PRO": 0.2828 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_MasterCoder", + "name": "_Spydaz_Web_AI_AGI_R1_MasterCoder", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4143, + "hfopenllm_v2/BBH": 0.4689, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.472, + "hfopenllm_v2/MMLU-PRO": 0.2719 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_001", + "name": "_Spydaz_Web_AI_AGI_R1_Math_001", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4571, + "hfopenllm_v2/BBH": 0.4818, + "hfopenllm_v2/MATH Level 5": 0.0695, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.4778, + "hfopenllm_v2/MMLU-PRO": 0.2681 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_003", + "name": "_Spydaz_Web_AI_AGI_R1_Math_003", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.62, + "hfopenllm_v2/BBH": 0.4756, + "hfopenllm_v2/MATH Level 5": 0.0695, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4202, + "hfopenllm_v2/MMLU-PRO": 0.2999 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent", + "name": "_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5951, + "hfopenllm_v2/BBH": 0.4927, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.5198, + "hfopenllm_v2/MMLU-PRO": 0.3 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Student", + "name": "_Spydaz_Web_AI_AGI_R1_Math_Student", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5736, + "hfopenllm_v2/BBH": 0.4881, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.5098, + "hfopenllm_v2/MMLU-PRO": 0.2927 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Teacher", + "name": "_Spydaz_Web_AI_AGI_R1_Math_Teacher", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5772, + "hfopenllm_v2/BBH": 0.4805, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.5222, + "hfopenllm_v2/MMLU-PRO": 0.2956 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_001", + "name": "_Spydaz_Web_AI_AGI_R1_OmG_001", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5818, + "hfopenllm_v2/BBH": 0.4908, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4486, + "hfopenllm_v2/MMLU-PRO": 0.2906 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_002", + "name": "_Spydaz_Web_AI_AGI_R1_OmG_002", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5462, + "hfopenllm_v2/BBH": 0.4655, + "hfopenllm_v2/MATH Level 5": 0.0498, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.4511, + "hfopenllm_v2/MMLU-PRO": 0.2867 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Coder", + "name": "_Spydaz_Web_AI_AGI_R1_OmG_Coder", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4924, + "hfopenllm_v2/BBH": 0.4638, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.5625, + "hfopenllm_v2/MMLU-PRO": 0.289 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Math", + "name": "_Spydaz_Web_AI_AGI_R1_OmG_Math", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5033, + "hfopenllm_v2/BBH": 0.4677, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4326, + "hfopenllm_v2/MMLU-PRO": 0.2913 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_MathMaster", + "name": "_Spydaz_Web_AI_AGI_R1_OmG_MathMaster", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5558, + "hfopenllm_v2/BBH": 0.4742, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.451, + "hfopenllm_v2/MMLU-PRO": 0.2672 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Student_Coder", + "name": "_Spydaz_Web_AI_AGI_R1_Student_Coder", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.545, + "hfopenllm_v2/BBH": 0.4651, + "hfopenllm_v2/MATH Level 5": 0.0657, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4388, + "hfopenllm_v2/MMLU-PRO": 0.2768 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Teacher_Coder", + "name": "_Spydaz_Web_AI_AGI_R1_Teacher_Coder", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5082, + "hfopenllm_v2/BBH": 0.4797, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4338, + "hfopenllm_v2/MMLU-PRO": 0.2845 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Top_Student", + "name": "_Spydaz_Web_AI_AGI_R1_Top_Student", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.604, + "hfopenllm_v2/BBH": 0.4988, + "hfopenllm_v2/MATH Level 5": 0.0725, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.5398, + "hfopenllm_v2/MMLU-PRO": 0.3024 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_X1", + "name": "_Spydaz_Web_AI_AGI_R1_X1", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4273, + "hfopenllm_v2/BBH": 0.4759, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.4232, + "hfopenllm_v2/MMLU-PRO": 0.2891 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_X2", + "name": "_Spydaz_Web_AI_AGI_R1_X2", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5434, + "hfopenllm_v2/BBH": 0.4786, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4695, + "hfopenllm_v2/MMLU-PRO": 0.2921 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_AGI_RP_R1", + "name": "_Spydaz_Web_AI_AGI_RP_R1", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5426, + "hfopenllm_v2/BBH": 0.4701, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.4201, + "hfopenllm_v2/MMLU-PRO": 0.2894 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_BIBLE_002", + "name": "_Spydaz_Web_AI_BIBLE_002", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2195, + "hfopenllm_v2/BBH": 0.3289, + "hfopenllm_v2/MATH Level 5": 0.0174, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.3407, + "hfopenllm_v2/MMLU-PRO": 0.1368 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_ChatML_002", + "name": "_Spydaz_Web_AI_ChatML_002", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2412, + "hfopenllm_v2/BBH": 0.3106, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3623, + "hfopenllm_v2/MMLU-PRO": 0.1095 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_ChatQA", + "name": "_Spydaz_Web_AI_ChatQA", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1415, + "hfopenllm_v2/BBH": 0.3236, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3447, + "hfopenllm_v2/MMLU-PRO": 0.1475 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_ChatQA_003", + "name": "_Spydaz_Web_AI_ChatQA_003", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2209, + "hfopenllm_v2/BBH": 0.3172, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3818, + "hfopenllm_v2/MMLU-PRO": 0.1133 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_TEMP_", + "name": "_Spydaz_Web_AI_TEMP_", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4795, + "hfopenllm_v2/BBH": 0.4957, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.4218, + "hfopenllm_v2/MMLU-PRO": 0.3121 + } + }, + { + "id": "LeroyDyer/_Spydaz_Web_AI_Top_Teacher_", + "name": "_Spydaz_Web_AI_Top_Teacher_", + "developer": "LeroyDyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4404, + "hfopenllm_v2/BBH": 0.4891, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4366, + "hfopenllm_v2/MMLU-PRO": 0.315 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/LightningRodLabs.json b/data/developers/LightningRodLabs.json new file mode 100644 index 0000000000000000000000000000000000000000..2cac16c12675b0e38476f17047518aa0f6b980a5 --- /dev/null +++ b/data/developers/LightningRodLabs.json @@ -0,0 +1,47 @@ +{ + "developer": "LightningRodLabs", + "models": [ + { + "id": "LightningRodLabs/Flashlight-v1.0", + "name": "Flashlight-v1.0", + "developer": "LightningRodLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6745, + "hfopenllm_v2/BBH": 0.6877, + "hfopenllm_v2/MATH Level 5": 0.497, + "hfopenllm_v2/GPQA": 0.3423, + "hfopenllm_v2/MUSR": 0.4101, + "hfopenllm_v2/MMLU-PRO": 0.5402 + } + }, + { + "id": "LightningRodLabs/Flashlight-v1.1", + "name": "Flashlight-v1.1", + "developer": "LightningRodLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6721, + "hfopenllm_v2/BBH": 0.6901, + "hfopenllm_v2/MATH Level 5": 0.5325, + "hfopenllm_v2/GPQA": 0.3398, + "hfopenllm_v2/MUSR": 0.4048, + "hfopenllm_v2/MMLU-PRO": 0.5416 + } + }, + { + "id": "LightningRodLabs/Flashlight-v1.2", + "name": "Flashlight-v1.2", + "developer": "LightningRodLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.436, + "hfopenllm_v2/BBH": 0.3265, + "hfopenllm_v2/MATH Level 5": 0.1556, + "hfopenllm_v2/GPQA": 0.2357, + "hfopenllm_v2/MUSR": 0.4554, + "hfopenllm_v2/MMLU-PRO": 0.2485 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Lil-R.json b/data/developers/Lil-R.json new file mode 100644 index 0000000000000000000000000000000000000000..1a6a47cac2ee87327c9ea0e6eecfb82aaf1293ad --- /dev/null +++ b/data/developers/Lil-R.json @@ -0,0 +1,117 @@ +{ + "developer": "Lil-R", + "models": [ + { + "id": "Lil-R/2_PRYMMAL-ECE-2B-SLERP-V1", + "name": "2_PRYMMAL-ECE-2B-SLERP-V1", + "developer": "Lil-R", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5823, + "hfopenllm_v2/BBH": 0.4287, + "hfopenllm_v2/MATH Level 5": 0.0914, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4375, + "hfopenllm_v2/MMLU-PRO": 0.2678 + } + }, + { + "id": "Lil-R/2_PRYMMAL-ECE-2B-SLERP-V2", + "name": "2_PRYMMAL-ECE-2B-SLERP-V2", + "developer": "Lil-R", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5543, + "hfopenllm_v2/BBH": 0.4376, + "hfopenllm_v2/MATH Level 5": 0.0944, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4482, + "hfopenllm_v2/MMLU-PRO": 0.2744 + } + }, + { + "id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP", + "name": "2_PRYMMAL-ECE-7B-SLERP", + "developer": "Lil-R", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5577, + "hfopenllm_v2/BBH": 0.5557, + "hfopenllm_v2/MATH Level 5": 0.3633, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4396, + "hfopenllm_v2/MMLU-PRO": 0.4507 + } + }, + { + "id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP-V1", + "name": "2_PRYMMAL-ECE-7B-SLERP-V1", + "developer": "Lil-R", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1073, + "hfopenllm_v2/BBH": 0.3053, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3911, + "hfopenllm_v2/MMLU-PRO": 0.1124 + } + }, + { + "id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP-V2", + "name": "2_PRYMMAL-ECE-7B-SLERP-V2", + "developer": "Lil-R", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1073, + "hfopenllm_v2/BBH": 0.3053, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3911, + "hfopenllm_v2/MMLU-PRO": 0.1124 + } + }, + { + "id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP-V3", + "name": "2_PRYMMAL-ECE-7B-SLERP-V3", + "developer": "Lil-R", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2235, + "hfopenllm_v2/BBH": 0.3578, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.4107, + "hfopenllm_v2/MMLU-PRO": 0.1817 + } + }, + { + "id": "Lil-R/PRYMMAL-ECE-1B-SLERP-V1", + "name": "PRYMMAL-ECE-1B-SLERP-V1", + "developer": "Lil-R", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2874, + "hfopenllm_v2/BBH": 0.419, + "hfopenllm_v2/MATH Level 5": 0.1035, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3974, + "hfopenllm_v2/MMLU-PRO": 0.2926 + } + }, + { + "id": "Lil-R/PRYMMAL-ECE-7B-SLERP-V8", + "name": "PRYMMAL-ECE-7B-SLERP-V8", + "developer": "Lil-R", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1258, + "hfopenllm_v2/BBH": 0.2955, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3631, + "hfopenllm_v2/MMLU-PRO": 0.1128 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/LilRg.json b/data/developers/LilRg.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec2c46461d7079d0a9dd53ece713a8f1174cf8d --- /dev/null +++ b/data/developers/LilRg.json @@ -0,0 +1,145 @@ +{ + "developer": "LilRg", + "models": [ + { + "id": "LilRg/10PRYMMAL-3B-slerp", + "name": "10PRYMMAL-3B-slerp", + "developer": "LilRg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1946, + "hfopenllm_v2/BBH": 0.532, + "hfopenllm_v2/MATH Level 5": 0.1495, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4529, + "hfopenllm_v2/MMLU-PRO": 0.3881 + } + }, + { + "id": "LilRg/ECE-1B-merge-PRYMMAL", + "name": "ECE-1B-merge-PRYMMAL", + "developer": "LilRg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2712, + "hfopenllm_v2/BBH": 0.4235, + "hfopenllm_v2/MATH Level 5": 0.1012, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3801, + "hfopenllm_v2/MMLU-PRO": 0.2906 + } + }, + { + "id": "LilRg/ECE_Finetunning", + "name": "ECE_Finetunning", + "developer": "LilRg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0445, + "hfopenllm_v2/BBH": 0.4732, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3839, + "hfopenllm_v2/MMLU-PRO": 0.3191 + } + }, + { + "id": "LilRg/PRYMMAL-6B-slerp", + "name": "PRYMMAL-6B-slerp", + "developer": "LilRg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1153, + "hfopenllm_v2/BBH": 0.2868, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.3698, + "hfopenllm_v2/MMLU-PRO": 0.1108 + } + }, + { + "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V3", + "name": "PRYMMAL-ECE-7B-SLERP-V3", + "developer": "LilRg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1243, + "hfopenllm_v2/BBH": 0.2957, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3671, + "hfopenllm_v2/MMLU-PRO": 0.1127 + } + }, + { + "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V4", + "name": "PRYMMAL-ECE-7B-SLERP-V4", + "developer": "LilRg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1249, + "hfopenllm_v2/BBH": 0.2957, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3671, + "hfopenllm_v2/MMLU-PRO": 0.1127 + } + }, + { + "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V5", + "name": "PRYMMAL-ECE-7B-SLERP-V5", + "developer": "LilRg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1249, + "hfopenllm_v2/BBH": 0.2957, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3671, + "hfopenllm_v2/MMLU-PRO": 0.1127 + } + }, + { + "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V6", + "name": "PRYMMAL-ECE-7B-SLERP-V6", + "developer": "LilRg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1243, + "hfopenllm_v2/BBH": 0.2957, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3671, + "hfopenllm_v2/MMLU-PRO": 0.1127 + } + }, + { + "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V7", + "name": "PRYMMAL-ECE-7B-SLERP-V7", + "developer": "LilRg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1249, + "hfopenllm_v2/BBH": 0.2957, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3671, + "hfopenllm_v2/MMLU-PRO": 0.1127 + } + }, + { + "id": "LilRg/PRYMMAL-slerp-Merge", + "name": "PRYMMAL-slerp-Merge", + "developer": "LilRg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3044, + "hfopenllm_v2/BBH": 0.5364, + "hfopenllm_v2/MATH Level 5": 0.1616, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4635, + "hfopenllm_v2/MMLU-PRO": 0.3863 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/LimYeri.json b/data/developers/LimYeri.json new file mode 100644 index 0000000000000000000000000000000000000000..48325ed0a50912b45c2e5ff82562c0d0c8ba5c21 --- /dev/null +++ b/data/developers/LimYeri.json @@ -0,0 +1,75 @@ +{ + "developer": "LimYeri", + "models": [ + { + "id": "LimYeri/CodeMind-Llama3-8B-unsloth_v2-merged", + "name": "CodeMind-Llama3-8B-unsloth_v2-merged", + "developer": "LimYeri", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6946, + "hfopenllm_v2/BBH": 0.486, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3316, + "hfopenllm_v2/MMLU-PRO": 0.3506 + } + }, + { + "id": "LimYeri/CodeMind-Llama3-8B-unsloth_v3-merged", + "name": "CodeMind-Llama3-8B-unsloth_v3-merged", + "developer": "LimYeri", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6763, + "hfopenllm_v2/BBH": 0.4908, + "hfopenllm_v2/MATH Level 5": 0.068, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3356, + "hfopenllm_v2/MMLU-PRO": 0.3496 + } + }, + { + "id": "LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged", + "name": "CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged", + "developer": "LimYeri", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6492, + "hfopenllm_v2/BBH": 0.4853, + "hfopenllm_v2/MATH Level 5": 0.068, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3608, + "hfopenllm_v2/MMLU-PRO": 0.3354 + } + }, + { + "id": "LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged", + "name": "CodeMind-Llama3-8B-unsloth_v4-one-merged", + "developer": "LimYeri", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3211, + "hfopenllm_v2/BBH": 0.4739, + "hfopenllm_v2/MATH Level 5": 0.0551, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4069, + "hfopenllm_v2/MMLU-PRO": 0.3353 + } + }, + { + "id": "LimYeri/CodeMind-Llama3.1-8B-unsloth-merged", + "name": "CodeMind-Llama3.1-8B-unsloth-merged", + "developer": "LimYeri", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.649, + "hfopenllm_v2/BBH": 0.4695, + "hfopenllm_v2/MATH Level 5": 0.1088, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3752, + "hfopenllm_v2/MMLU-PRO": 0.334 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Locutusque.json b/data/developers/Locutusque.json new file mode 100644 index 0000000000000000000000000000000000000000..79fc85cde4cbae34291a4bb6ed193ab4ab0df365 --- /dev/null +++ b/data/developers/Locutusque.json @@ -0,0 +1,89 @@ +{ + "developer": "Locutusque", + "models": [ + { + "id": "Locutusque/CollectiveLM-Falcon-3-7B", + "name": "CollectiveLM-Falcon-3-7B", + "developer": "Locutusque", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3918, + "hfopenllm_v2/BBH": 0.5105, + "hfopenllm_v2/MATH Level 5": 0.2183, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.3887, + "hfopenllm_v2/MMLU-PRO": 0.3599 + } + }, + { + "id": "Locutusque/Hercules-6.0-Llama-3.1-8B", + "name": "Hercules-6.0-Llama-3.1-8B", + "developer": "Locutusque", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.663, + "hfopenllm_v2/BBH": 0.4813, + "hfopenllm_v2/MATH Level 5": 0.1669, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3621, + "hfopenllm_v2/MMLU-PRO": 0.3615 + } + }, + { + "id": "Locutusque/Hercules-6.1-Llama-3.1-8B", + "name": "Hercules-6.1-Llama-3.1-8B", + "developer": "Locutusque", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6007, + "hfopenllm_v2/BBH": 0.4656, + "hfopenllm_v2/MATH Level 5": 0.176, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3553, + "hfopenllm_v2/MMLU-PRO": 0.3669 + } + }, + { + "id": "Locutusque/Llama-3-NeuralHercules-5.0-8B", + "name": "Llama-3-NeuralHercules-5.0-8B", + "developer": "Locutusque", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4489, + "hfopenllm_v2/BBH": 0.394, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3881, + "hfopenllm_v2/MMLU-PRO": 0.2933 + } + }, + { + "id": "Locutusque/Llama-3-Yggdrasil-2.0-8B", + "name": "Llama-3-Yggdrasil-2.0-8B", + "developer": "Locutusque", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5371, + "hfopenllm_v2/BBH": 0.4772, + "hfopenllm_v2/MATH Level 5": 0.0831, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3977, + "hfopenllm_v2/MMLU-PRO": 0.3167 + } + }, + { + "id": "Locutusque/TinyMistral-248M-v2.5", + "name": "TinyMistral-248M-v2.5", + "developer": "Locutusque", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1336, + "hfopenllm_v2/BBH": 0.3039, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3782, + "hfopenllm_v2/MMLU-PRO": 0.1135 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Luni.json b/data/developers/Luni.json new file mode 100644 index 0000000000000000000000000000000000000000..5949c4e715c588b20c3086e11472f22b6fe16386 --- /dev/null +++ b/data/developers/Luni.json @@ -0,0 +1,33 @@ +{ + "developer": "Luni", + "models": [ + { + "id": "Luni/StarDust-12b-v1", + "name": "StarDust-12b-v1", + "developer": "Luni", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5459, + "hfopenllm_v2/BBH": 0.5366, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.4324, + "hfopenllm_v2/MMLU-PRO": 0.3412 + } + }, + { + "id": "Luni/StarDust-12b-v2", + "name": "StarDust-12b-v2", + "developer": "Luni", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5629, + "hfopenllm_v2/BBH": 0.5419, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4338, + "hfopenllm_v2/MMLU-PRO": 0.3439 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Lunzima.json b/data/developers/Lunzima.json new file mode 100644 index 0000000000000000000000000000000000000000..a4fac6019f60ace84222176cca9db79aa521373e --- /dev/null +++ b/data/developers/Lunzima.json @@ -0,0 +1,257 @@ +{ + "developer": "Lunzima", + "models": [ + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v3", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v3", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7049, + "hfopenllm_v2/BBH": 0.6478, + "hfopenllm_v2/MATH Level 5": 0.4162, + "hfopenllm_v2/GPQA": 0.3817, + "hfopenllm_v2/MUSR": 0.4808, + "hfopenllm_v2/MMLU-PRO": 0.5394 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v4", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v4", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6943, + "hfopenllm_v2/BBH": 0.642, + "hfopenllm_v2/MATH Level 5": 0.3467, + "hfopenllm_v2/GPQA": 0.3716, + "hfopenllm_v2/MUSR": 0.4769, + "hfopenllm_v2/MMLU-PRO": 0.5252 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v5", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v5", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7485, + "hfopenllm_v2/BBH": 0.6467, + "hfopenllm_v2/MATH Level 5": 0.4358, + "hfopenllm_v2/GPQA": 0.3624, + "hfopenllm_v2/MUSR": 0.4473, + "hfopenllm_v2/MMLU-PRO": 0.514 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v6", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7043, + "hfopenllm_v2/BBH": 0.6458, + "hfopenllm_v2/MATH Level 5": 0.3958, + "hfopenllm_v2/GPQA": 0.3775, + "hfopenllm_v2/MUSR": 0.4768, + "hfopenllm_v2/MMLU-PRO": 0.5392 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4663, + "hfopenllm_v2/BBH": 0.6215, + "hfopenllm_v2/MATH Level 5": 0.3316, + "hfopenllm_v2/GPQA": 0.3758, + "hfopenllm_v2/MUSR": 0.4937, + "hfopenllm_v2/MMLU-PRO": 0.5204 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v7", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6794, + "hfopenllm_v2/BBH": 0.6531, + "hfopenllm_v2/MATH Level 5": 0.4101, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.4834, + "hfopenllm_v2/MMLU-PRO": 0.5376 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6931, + "hfopenllm_v2/BBH": 0.6423, + "hfopenllm_v2/MATH Level 5": 0.3406, + "hfopenllm_v2/GPQA": 0.375, + "hfopenllm_v2/MUSR": 0.4888, + "hfopenllm_v2/MMLU-PRO": 0.5277 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7875, + "hfopenllm_v2/BBH": 0.6419, + "hfopenllm_v2/MATH Level 5": 0.5559, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.4394, + "hfopenllm_v2/MMLU-PRO": 0.5206 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.5", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.5", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5929, + "hfopenllm_v2/BBH": 0.6451, + "hfopenllm_v2/MATH Level 5": 0.3656, + "hfopenllm_v2/GPQA": 0.38, + "hfopenllm_v2/MUSR": 0.477, + "hfopenllm_v2/MMLU-PRO": 0.529 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.6", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.6", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5919, + "hfopenllm_v2/BBH": 0.6457, + "hfopenllm_v2/MATH Level 5": 0.4071, + "hfopenllm_v2/GPQA": 0.3842, + "hfopenllm_v2/MUSR": 0.4953, + "hfopenllm_v2/MMLU-PRO": 0.54 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.7", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.7", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7875, + "hfopenllm_v2/BBH": 0.6483, + "hfopenllm_v2/MATH Level 5": 0.5408, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4381, + "hfopenllm_v2/MMLU-PRO": 0.5242 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.8", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.8", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7028, + "hfopenllm_v2/BBH": 0.6566, + "hfopenllm_v2/MATH Level 5": 0.4237, + "hfopenllm_v2/GPQA": 0.3758, + "hfopenllm_v2/MUSR": 0.4912, + "hfopenllm_v2/MMLU-PRO": 0.5323 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.9", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.9", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7993, + "hfopenllm_v2/BBH": 0.6483, + "hfopenllm_v2/MATH Level 5": 0.537, + "hfopenllm_v2/GPQA": 0.3297, + "hfopenllm_v2/MUSR": 0.4328, + "hfopenllm_v2/MMLU-PRO": 0.5199 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5235, + "hfopenllm_v2/BBH": 0.6546, + "hfopenllm_v2/MATH Level 5": 0.4366, + "hfopenllm_v2/GPQA": 0.3884, + "hfopenllm_v2/MUSR": 0.4806, + "hfopenllm_v2/MMLU-PRO": 0.5422 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9-stock", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9-stock", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6514, + "hfopenllm_v2/BBH": 0.6571, + "hfopenllm_v2/MATH Level 5": 0.4184, + "hfopenllm_v2/GPQA": 0.3842, + "hfopenllm_v2/MUSR": 0.482, + "hfopenllm_v2/MMLU-PRO": 0.5412 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.1", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9.1", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8003, + "hfopenllm_v2/BBH": 0.6555, + "hfopenllm_v2/MATH Level 5": 0.5468, + "hfopenllm_v2/GPQA": 0.3431, + "hfopenllm_v2/MUSR": 0.4354, + "hfopenllm_v2/MMLU-PRO": 0.5251 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.2", + "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9.2", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7862, + "hfopenllm_v2/BBH": 0.6538, + "hfopenllm_v2/MATH Level 5": 0.5332, + "hfopenllm_v2/GPQA": 0.3557, + "hfopenllm_v2/MUSR": 0.4381, + "hfopenllm_v2/MMLU-PRO": 0.5283 + } + }, + { + "id": "Lunzima/NQLSG-Qwen2.5-14B-OriginalFusion", + "name": "NQLSG-Qwen2.5-14B-OriginalFusion", + "developer": "Lunzima", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6142, + "hfopenllm_v2/BBH": 0.6592, + "hfopenllm_v2/MATH Level 5": 0.4275, + "hfopenllm_v2/GPQA": 0.3809, + "hfopenllm_v2/MUSR": 0.5122, + "hfopenllm_v2/MMLU-PRO": 0.5239 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/LxzGordon.json b/data/developers/LxzGordon.json new file mode 100644 index 0000000000000000000000000000000000000000..e4ace3cc8f7193c8c711403c980535ee73fdd6d3 --- /dev/null +++ b/data/developers/LxzGordon.json @@ -0,0 +1,36 @@ +{ + "developer": "LxzGordon", + "models": [ + { + "id": "LxzGordon/URM-LLaMa-3-8B", + "name": "LxzGordon/URM-LLaMa-3-8B", + "developer": "LxzGordon", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8991, + "reward-bench/Chat": 0.9693, + "reward-bench/Chat Hard": 0.7873, + "reward-bench/Safety": 0.8824, + "reward-bench/Reasoning": 0.9574 + } + }, + { + "id": "LxzGordon/URM-LLaMa-3.1-8B", + "name": "LxzGordon/URM-LLaMa-3.1-8B", + "developer": "LxzGordon", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9294, + "reward-bench/Factuality": 0.6884, + "reward-bench/Precise IF": 0.45, + "reward-bench/Math": 0.6393, + "reward-bench/Safety": 0.9108, + "reward-bench/Focus": 0.9758, + "reward-bench/Ties": 0.7653, + "reward-bench/Chat": 0.9553, + "reward-bench/Chat Hard": 0.8816, + "reward-bench/Reasoning": 0.9698 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Lyte.json b/data/developers/Lyte.json new file mode 100644 index 0000000000000000000000000000000000000000..e9dc729c1cc008adcb228cadfbe35ded3607f85f --- /dev/null +++ b/data/developers/Lyte.json @@ -0,0 +1,47 @@ +{ + "developer": "Lyte", + "models": [ + { + "id": "Lyte/Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3", + "name": "Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3", + "developer": "Lyte", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7098, + "hfopenllm_v2/BBH": 0.495, + "hfopenllm_v2/MATH Level 5": 0.1903, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3461, + "hfopenllm_v2/MMLU-PRO": 0.3618 + } + }, + { + "id": "Lyte/Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04", + "name": "Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04", + "developer": "Lyte", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5774, + "hfopenllm_v2/BBH": 0.3515, + "hfopenllm_v2/MATH Level 5": 0.0801, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3236, + "hfopenllm_v2/MMLU-PRO": 0.1843 + } + }, + { + "id": "Lyte/Llama-3.2-3B-Overthinker", + "name": "Llama-3.2-3B-Overthinker", + "developer": "Lyte", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6408, + "hfopenllm_v2/BBH": 0.432, + "hfopenllm_v2/MATH Level 5": 0.1563, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3419, + "hfopenllm_v2/MMLU-PRO": 0.2985 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/M4-ai.json b/data/developers/M4-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..09abbc2754581708d5cc628249887222f388d0f0 --- /dev/null +++ b/data/developers/M4-ai.json @@ -0,0 +1,19 @@ +{ + "developer": "M4-ai", + "models": [ + { + "id": "M4-ai/TinyMistral-248M-v3", + "name": "TinyMistral-248M-v3", + "developer": "M4-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1639, + "hfopenllm_v2/BBH": 0.2885, + "hfopenllm_v2/MATH Level 5": 0.0045, + "hfopenllm_v2/GPQA": 0.2408, + "hfopenllm_v2/MUSR": 0.3793, + "hfopenllm_v2/MMLU-PRO": 0.1132 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/MEscriva.json b/data/developers/MEscriva.json new file mode 100644 index 0000000000000000000000000000000000000000..c6530a91eb22542864578782dab455ed8027a891 --- /dev/null +++ b/data/developers/MEscriva.json @@ -0,0 +1,19 @@ +{ + "developer": "MEscriva", + "models": [ + { + "id": "MEscriva/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", + "name": "ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", + "developer": "MEscriva", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0866, + "hfopenllm_v2/BBH": 0.3057, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.4017, + "hfopenllm_v2/MMLU-PRO": 0.1154 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/MLP-KTLim.json b/data/developers/MLP-KTLim.json new file mode 100644 index 0000000000000000000000000000000000000000..e1744a40c6fb3456db7ed14ab1be534c66d21527 --- /dev/null +++ b/data/developers/MLP-KTLim.json @@ -0,0 +1,19 @@ +{ + "developer": "MLP-KTLim", + "models": [ + { + "id": "MLP-KTLim/llama-3-Korean-Bllossom-8B", + "name": "llama-3-Korean-Bllossom-8B", + "developer": "MLP-KTLim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5113, + "hfopenllm_v2/BBH": 0.49, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3675, + "hfopenllm_v2/MMLU-PRO": 0.3594 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/MTSAIR.json b/data/developers/MTSAIR.json new file mode 100644 index 0000000000000000000000000000000000000000..b31cfa942700007f43ba6fb3959ed442419ea4f1 --- /dev/null +++ b/data/developers/MTSAIR.json @@ -0,0 +1,33 @@ +{ + "developer": "MTSAIR", + "models": [ + { + "id": "MTSAIR/Cotype-Nano", + "name": "Cotype-Nano", + "developer": "MTSAIR", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3748, + "hfopenllm_v2/BBH": 0.3865, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3289, + "hfopenllm_v2/MMLU-PRO": 0.2477 + } + }, + { + "id": "MTSAIR/MultiVerse_70B", + "name": "MultiVerse_70B", + "developer": "MTSAIR", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5249, + "hfopenllm_v2/BBH": 0.6183, + "hfopenllm_v2/MATH Level 5": 0.1926, + "hfopenllm_v2/GPQA": 0.354, + "hfopenllm_v2/MUSR": 0.474, + "hfopenllm_v2/MMLU-PRO": 0.486 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Magpie-Align.json b/data/developers/Magpie-Align.json new file mode 100644 index 0000000000000000000000000000000000000000..155a416e715db19186e6af681f6736ea9ed101d1 --- /dev/null +++ b/data/developers/Magpie-Align.json @@ -0,0 +1,117 @@ +{ + "developer": "Magpie-Align", + "models": [ + { + "id": "Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.1", + "name": "Llama-3-8B-Magpie-Align-SFT-v0.1", + "developer": "Magpie-Align", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4361, + "hfopenllm_v2/BBH": 0.4615, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3277, + "hfopenllm_v2/MMLU-PRO": 0.2863 + } + }, + { + "id": "Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.3", + "name": "Llama-3-8B-Magpie-Align-SFT-v0.3", + "developer": "Magpie-Align", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5064, + "hfopenllm_v2/BBH": 0.4572, + "hfopenllm_v2/MATH Level 5": 0.0733, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3424, + "hfopenllm_v2/MMLU-PRO": 0.2902 + } + }, + { + "id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.1", + "name": "Llama-3-8B-Magpie-Align-v0.1", + "developer": "Magpie-Align", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4118, + "hfopenllm_v2/BBH": 0.4811, + "hfopenllm_v2/MATH Level 5": 0.034, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3047, + "hfopenllm_v2/MMLU-PRO": 0.3006 + } + }, + { + "id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.3", + "name": "Llama-3-8B-Magpie-Align-v0.3", + "developer": "Magpie-Align", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4497, + "hfopenllm_v2/BBH": 0.457, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3406, + "hfopenllm_v2/MMLU-PRO": 0.3134 + } + }, + { + "id": "Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.1", + "name": "Llama-3.1-8B-Magpie-Align-SFT-v0.1", + "developer": "Magpie-Align", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4782, + "hfopenllm_v2/BBH": 0.4764, + "hfopenllm_v2/MATH Level 5": 0.0899, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3397, + "hfopenllm_v2/MMLU-PRO": 0.2943 + } + }, + { + "id": "Magpie-Align/Llama-3.1-8B-Magpie-Align-v0.1", + "name": "Llama-3.1-8B-Magpie-Align-v0.1", + "developer": "Magpie-Align", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4458, + "hfopenllm_v2/BBH": 0.4622, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3141, + "hfopenllm_v2/MMLU-PRO": 0.3262 + } + }, + { + "id": "Magpie-Align/MagpieLM-8B-Chat-v0.1", + "name": "MagpieLM-8B-Chat-v0.1", + "developer": "Magpie-Align", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3701, + "hfopenllm_v2/BBH": 0.4172, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3501, + "hfopenllm_v2/MMLU-PRO": 0.3195 + } + }, + { + "id": "Magpie-Align/MagpieLM-8B-SFT-v0.1", + "name": "MagpieLM-8B-SFT-v0.1", + "developer": "Magpie-Align", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4721, + "hfopenllm_v2/BBH": 0.4553, + "hfopenllm_v2/MATH Level 5": 0.0755, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3649, + "hfopenllm_v2/MMLU-PRO": 0.299 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/MagusCorp.json b/data/developers/MagusCorp.json new file mode 100644 index 0000000000000000000000000000000000000000..03765be6117b10742a99027717bd71648696b76f --- /dev/null +++ b/data/developers/MagusCorp.json @@ -0,0 +1,19 @@ +{ + "developer": "MagusCorp", + "models": [ + { + "id": "MagusCorp/grpo_lora_enem_llama3_7b", + "name": "grpo_lora_enem_llama3_7b", + "developer": "MagusCorp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4724, + "hfopenllm_v2/BBH": 0.4801, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.3971, + "hfopenllm_v2/MMLU-PRO": 0.3574 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ManoloPueblo.json b/data/developers/ManoloPueblo.json new file mode 100644 index 0000000000000000000000000000000000000000..aa2d755da56eb00357d2cb595b16f96384ab5fbf --- /dev/null +++ b/data/developers/ManoloPueblo.json @@ -0,0 +1,47 @@ +{ + "developer": "ManoloPueblo", + "models": [ + { + "id": "ManoloPueblo/ContentCuisine_1-7B-slerp", + "name": "ContentCuisine_1-7B-slerp", + "developer": "ManoloPueblo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3907, + "hfopenllm_v2/BBH": 0.5188, + "hfopenllm_v2/MATH Level 5": 0.0733, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4672, + "hfopenllm_v2/MMLU-PRO": 0.3054 + } + }, + { + "id": "ManoloPueblo/LLM_MERGE_CC2", + "name": "LLM_MERGE_CC2", + "developer": "ManoloPueblo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3853, + "hfopenllm_v2/BBH": 0.5209, + "hfopenllm_v2/MATH Level 5": 0.0642, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4593, + "hfopenllm_v2/MMLU-PRO": 0.3032 + } + }, + { + "id": "ManoloPueblo/LLM_MERGE_CC3", + "name": "LLM_MERGE_CC3", + "developer": "ManoloPueblo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3959, + "hfopenllm_v2/BBH": 0.5246, + "hfopenllm_v2/MATH Level 5": 0.0793, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4672, + "hfopenllm_v2/MMLU-PRO": 0.3156 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/MarinaraSpaghetti.json b/data/developers/MarinaraSpaghetti.json new file mode 100644 index 0000000000000000000000000000000000000000..23bc15f8e7194dbec6cd2d2d0789ff12a745aa55 --- /dev/null +++ b/data/developers/MarinaraSpaghetti.json @@ -0,0 +1,33 @@ +{ + "developer": "MarinaraSpaghetti", + "models": [ + { + "id": "MarinaraSpaghetti/NemoReRemix-12B", + "name": "NemoReRemix-12B", + "developer": "MarinaraSpaghetti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3343, + "hfopenllm_v2/BBH": 0.5537, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.4501, + "hfopenllm_v2/MMLU-PRO": 0.3598 + } + }, + { + "id": "MarinaraSpaghetti/Nemomix-v4.0-12B", + "name": "Nemomix-v4.0-12B", + "developer": "MarinaraSpaghetti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5575, + "hfopenllm_v2/BBH": 0.5275, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4244, + "hfopenllm_v2/MMLU-PRO": 0.3613 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Marsouuu.json b/data/developers/Marsouuu.json new file mode 100644 index 0000000000000000000000000000000000000000..98f19cbf50325ffc603271e81f2501b3b9a82e59 --- /dev/null +++ b/data/developers/Marsouuu.json @@ -0,0 +1,117 @@ +{ + "developer": "Marsouuu", + "models": [ + { + "id": "Marsouuu/MiniMathExpert-2_61B-ECE-PRYMMAL-Martial", + "name": "MiniMathExpert-2_61B-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2548, + "hfopenllm_v2/BBH": 0.3953, + "hfopenllm_v2/MATH Level 5": 0.074, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.4083, + "hfopenllm_v2/MMLU-PRO": 0.2274 + } + }, + { + "id": "Marsouuu/MiniQwenMathExpert-ECE-PRYMMAL-Martial", + "name": "MiniQwenMathExpert-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2795, + "hfopenllm_v2/BBH": 0.423, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.3867, + "hfopenllm_v2/MMLU-PRO": 0.2922 + } + }, + { + "id": "Marsouuu/MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial", + "name": "MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1697, + "hfopenllm_v2/BBH": 0.3464, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3991, + "hfopenllm_v2/MMLU-PRO": 0.1379 + } + }, + { + "id": "Marsouuu/general3B-ECE-PRYMMAL-Martial", + "name": "general3B-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2722, + "hfopenllm_v2/BBH": 0.5394, + "hfopenllm_v2/MATH Level 5": 0.1548, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4701, + "hfopenllm_v2/MMLU-PRO": 0.3876 + } + }, + { + "id": "Marsouuu/general3Bv2-ECE-PRYMMAL-Martial", + "name": "general3Bv2-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5693, + "hfopenllm_v2/BBH": 0.5637, + "hfopenllm_v2/MATH Level 5": 0.3671, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4396, + "hfopenllm_v2/MMLU-PRO": 0.4498 + } + }, + { + "id": "Marsouuu/lareneg1_78B-ECE-PRYMMAL-Martial", + "name": "lareneg1_78B-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2795, + "hfopenllm_v2/BBH": 0.423, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.3867, + "hfopenllm_v2/MMLU-PRO": 0.2922 + } + }, + { + "id": "Marsouuu/lareneg3B-ECE-PRYMMAL-Martial", + "name": "lareneg3B-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3303, + "hfopenllm_v2/BBH": 0.5453, + "hfopenllm_v2/MATH Level 5": 0.1518, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4725, + "hfopenllm_v2/MMLU-PRO": 0.3767 + } + }, + { + "id": "Marsouuu/lareneg3Bv2-ECE-PRYMMAL-Martial", + "name": "lareneg3Bv2-ECE-PRYMMAL-Martial", + "developer": "Marsouuu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5753, + "hfopenllm_v2/BBH": 0.5623, + "hfopenllm_v2/MATH Level 5": 0.3656, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4369, + "hfopenllm_v2/MMLU-PRO": 0.4511 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/MaziyarPanahi.json b/data/developers/MaziyarPanahi.json new file mode 100644 index 0000000000000000000000000000000000000000..2093b9e747044b495c3fe2fb27fc9fea37ab0dbc --- /dev/null +++ b/data/developers/MaziyarPanahi.json @@ -0,0 +1,621 @@ +{ + "developer": "MaziyarPanahi", + "models": [ + { + "id": "MaziyarPanahi/Calme-4x7B-MoE-v0.1", + "name": "Calme-4x7B-MoE-v0.1", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4315, + "hfopenllm_v2/BBH": 0.5103, + "hfopenllm_v2/MATH Level 5": 0.0801, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4199, + "hfopenllm_v2/MMLU-PRO": 0.3057 + } + }, + { + "id": "MaziyarPanahi/Calme-4x7B-MoE-v0.2", + "name": "Calme-4x7B-MoE-v0.2", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4294, + "hfopenllm_v2/BBH": 0.5111, + "hfopenllm_v2/MATH Level 5": 0.074, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.4318, + "hfopenllm_v2/MMLU-PRO": 0.3058 + } + }, + { + "id": "MaziyarPanahi/Llama-3-70B-Instruct-v0.1", + "name": "Llama-3-70B-Instruct-v0.1", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4714, + "hfopenllm_v2/BBH": 0.5366, + "hfopenllm_v2/MATH Level 5": 0.1805, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4433, + "hfopenllm_v2/MMLU-PRO": 0.4618 + } + }, + { + "id": "MaziyarPanahi/Llama-3-8B-Instruct-v0.10", + "name": "Llama-3-8B-Instruct-v0.10", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7667, + "hfopenllm_v2/BBH": 0.4924, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4214, + "hfopenllm_v2/MMLU-PRO": 0.3862 + } + }, + { + "id": "MaziyarPanahi/Llama-3-8B-Instruct-v0.8", + "name": "Llama-3-8B-Instruct-v0.8", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7528, + "hfopenllm_v2/BBH": 0.4963, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4202, + "hfopenllm_v2/MMLU-PRO": 0.3853 + } + }, + { + "id": "MaziyarPanahi/Llama-3-8B-Instruct-v0.9", + "name": "Llama-3-8B-Instruct-v0.9", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.763, + "hfopenllm_v2/BBH": 0.4936, + "hfopenllm_v2/MATH Level 5": 0.0733, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4148, + "hfopenllm_v2/MMLU-PRO": 0.3846 + } + }, + { + "id": "MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow", + "name": "Qwen1.5-MoE-A2.7B-Wikihow", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2954, + "hfopenllm_v2/BBH": 0.392, + "hfopenllm_v2/MATH Level 5": 0.0823, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3502, + "hfopenllm_v2/MMLU-PRO": 0.238 + } + }, + { + "id": "MaziyarPanahi/Qwen2-7B-Instruct-v0.1", + "name": "Qwen2-7B-Instruct-v0.1", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3352, + "hfopenllm_v2/BBH": 0.5123, + "hfopenllm_v2/MATH Level 5": 0.2213, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4435, + "hfopenllm_v2/MMLU-PRO": 0.3857 + } + }, + { + "id": "MaziyarPanahi/Qwen2-7B-Instruct-v0.8", + "name": "Qwen2-7B-Instruct-v0.8", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2775, + "hfopenllm_v2/BBH": 0.4637, + "hfopenllm_v2/MATH Level 5": 0.1767, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4293, + "hfopenllm_v2/MMLU-PRO": 0.3566 + } + }, + { + "id": "MaziyarPanahi/calme-2.1-llama3.1-70b", + "name": "calme-2.1-llama3.1-70b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8434, + "hfopenllm_v2/BBH": 0.6448, + "hfopenllm_v2/MATH Level 5": 0.4101, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.438, + "hfopenllm_v2/MMLU-PRO": 0.5283 + } + }, + { + "id": "MaziyarPanahi/calme-2.1-phi3-4b", + "name": "calme-2.1-phi3-4b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5525, + "hfopenllm_v2/BBH": 0.5595, + "hfopenllm_v2/MATH Level 5": 0.1314, + "hfopenllm_v2/GPQA": 0.3297, + "hfopenllm_v2/MUSR": 0.4015, + "hfopenllm_v2/MMLU-PRO": 0.3746 + } + }, + { + "id": "MaziyarPanahi/calme-2.1-phi3.5-4b", + "name": "calme-2.1-phi3.5-4b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5659, + "hfopenllm_v2/BBH": 0.5484, + "hfopenllm_v2/MATH Level 5": 0.2039, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.3995, + "hfopenllm_v2/MMLU-PRO": 0.3935 + } + }, + { + "id": "MaziyarPanahi/calme-2.1-qwen2-72b", + "name": "calme-2.1-qwen2-72b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8163, + "hfopenllm_v2/BBH": 0.6966, + "hfopenllm_v2/MATH Level 5": 0.4079, + "hfopenllm_v2/GPQA": 0.3809, + "hfopenllm_v2/MUSR": 0.4732, + "hfopenllm_v2/MMLU-PRO": 0.5415 + } + }, + { + "id": "MaziyarPanahi/calme-2.1-qwen2-7b", + "name": "calme-2.1-qwen2-7b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3816, + "hfopenllm_v2/BBH": 0.5046, + "hfopenllm_v2/MATH Level 5": 0.2311, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4437, + "hfopenllm_v2/MMLU-PRO": 0.3693 + } + }, + { + "id": "MaziyarPanahi/calme-2.1-qwen2.5-72b", + "name": "calme-2.1-qwen2.5-72b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8662, + "hfopenllm_v2/BBH": 0.7262, + "hfopenllm_v2/MATH Level 5": 0.5914, + "hfopenllm_v2/GPQA": 0.3633, + "hfopenllm_v2/MUSR": 0.4298, + "hfopenllm_v2/MMLU-PRO": 0.5619 + } + }, + { + "id": "MaziyarPanahi/calme-2.1-rys-78b", + "name": "calme-2.1-rys-78b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8136, + "hfopenllm_v2/BBH": 0.7098, + "hfopenllm_v2/MATH Level 5": 0.3943, + "hfopenllm_v2/GPQA": 0.3943, + "hfopenllm_v2/MUSR": 0.4693, + "hfopenllm_v2/MMLU-PRO": 0.5444 + } + }, + { + "id": "MaziyarPanahi/calme-2.2-llama3-70b", + "name": "calme-2.2-llama3-70b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8208, + "hfopenllm_v2/BBH": 0.6435, + "hfopenllm_v2/MATH Level 5": 0.2394, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.4446, + "hfopenllm_v2/MMLU-PRO": 0.5207 + } + }, + { + "id": "MaziyarPanahi/calme-2.2-llama3.1-70b", + "name": "calme-2.2-llama3.1-70b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8593, + "hfopenllm_v2/BBH": 0.6793, + "hfopenllm_v2/MATH Level 5": 0.4366, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4542, + "hfopenllm_v2/MMLU-PRO": 0.5415 + } + }, + { + "id": "MaziyarPanahi/calme-2.2-phi3-4b", + "name": "calme-2.2-phi3-4b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5069, + "hfopenllm_v2/BBH": 0.553, + "hfopenllm_v2/MATH Level 5": 0.145, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.3976, + "hfopenllm_v2/MMLU-PRO": 0.3814 + } + }, + { + "id": "MaziyarPanahi/calme-2.2-qwen2-72b", + "name": "calme-2.2-qwen2-72b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8008, + "hfopenllm_v2/BBH": 0.694, + "hfopenllm_v2/MATH Level 5": 0.4532, + "hfopenllm_v2/GPQA": 0.3742, + "hfopenllm_v2/MUSR": 0.4508, + "hfopenllm_v2/MMLU-PRO": 0.5435 + } + }, + { + "id": "MaziyarPanahi/calme-2.2-qwen2-7b", + "name": "calme-2.2-qwen2-7b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3597, + "hfopenllm_v2/BBH": 0.5215, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4358, + "hfopenllm_v2/MMLU-PRO": 0.3899 + } + }, + { + "id": "MaziyarPanahi/calme-2.2-qwen2.5-72b", + "name": "calme-2.2-qwen2.5-72b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8477, + "hfopenllm_v2/BBH": 0.7276, + "hfopenllm_v2/MATH Level 5": 0.5891, + "hfopenllm_v2/GPQA": 0.3591, + "hfopenllm_v2/MUSR": 0.4207, + "hfopenllm_v2/MMLU-PRO": 0.5618 + } + }, + { + "id": "MaziyarPanahi/calme-2.2-rys-78b", + "name": "calme-2.2-rys-78b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7986, + "hfopenllm_v2/BBH": 0.7081, + "hfopenllm_v2/MATH Level 5": 0.4071, + "hfopenllm_v2/GPQA": 0.4069, + "hfopenllm_v2/MUSR": 0.4536, + "hfopenllm_v2/MMLU-PRO": 0.5386 + } + }, + { + "id": "MaziyarPanahi/calme-2.3-llama3-70b", + "name": "calme-2.3-llama3-70b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.801, + "hfopenllm_v2/BBH": 0.6399, + "hfopenllm_v2/MATH Level 5": 0.2326, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4261, + "hfopenllm_v2/MMLU-PRO": 0.5204 + } + }, + { + "id": "MaziyarPanahi/calme-2.3-llama3.1-70b", + "name": "calme-2.3-llama3.1-70b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8605, + "hfopenllm_v2/BBH": 0.6872, + "hfopenllm_v2/MATH Level 5": 0.3927, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.4568, + "hfopenllm_v2/MMLU-PRO": 0.5363 + } + }, + { + "id": "MaziyarPanahi/calme-2.3-phi3-4b", + "name": "calme-2.3-phi3-4b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4926, + "hfopenllm_v2/BBH": 0.5538, + "hfopenllm_v2/MATH Level 5": 0.1473, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.3988, + "hfopenllm_v2/MMLU-PRO": 0.3828 + } + }, + { + "id": "MaziyarPanahi/calme-2.3-qwen2-72b", + "name": "calme-2.3-qwen2-72b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.385, + "hfopenllm_v2/BBH": 0.6576, + "hfopenllm_v2/MATH Level 5": 0.3172, + "hfopenllm_v2/GPQA": 0.3716, + "hfopenllm_v2/MUSR": 0.4112, + "hfopenllm_v2/MMLU-PRO": 0.5419 + } + }, + { + "id": "MaziyarPanahi/calme-2.3-qwen2-7b", + "name": "calme-2.3-qwen2-7b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3825, + "hfopenllm_v2/BBH": 0.5064, + "hfopenllm_v2/MATH Level 5": 0.2069, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4422, + "hfopenllm_v2/MMLU-PRO": 0.3611 + } + }, + { + "id": "MaziyarPanahi/calme-2.3-rys-78b", + "name": "calme-2.3-rys-78b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8066, + "hfopenllm_v2/BBH": 0.7108, + "hfopenllm_v2/MATH Level 5": 0.398, + "hfopenllm_v2/GPQA": 0.4044, + "hfopenllm_v2/MUSR": 0.4549, + "hfopenllm_v2/MMLU-PRO": 0.5475 + } + }, + { + "id": "MaziyarPanahi/calme-2.4-llama3-70b", + "name": "calme-2.4-llama3-70b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5027, + "hfopenllm_v2/BBH": 0.6418, + "hfopenllm_v2/MATH Level 5": 0.2447, + "hfopenllm_v2/GPQA": 0.3398, + "hfopenllm_v2/MUSR": 0.4288, + "hfopenllm_v2/MMLU-PRO": 0.5204 + } + }, + { + "id": "MaziyarPanahi/calme-2.4-qwen2-7b", + "name": "calme-2.4-qwen2-7b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.33, + "hfopenllm_v2/BBH": 0.5101, + "hfopenllm_v2/MATH Level 5": 0.2032, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4453, + "hfopenllm_v2/MMLU-PRO": 0.3977 + } + }, + { + "id": "MaziyarPanahi/calme-2.4-rys-78b", + "name": "calme-2.4-rys-78b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8011, + "hfopenllm_v2/BBH": 0.728, + "hfopenllm_v2/MATH Level 5": 0.4071, + "hfopenllm_v2/GPQA": 0.4027, + "hfopenllm_v2/MUSR": 0.5771, + "hfopenllm_v2/MMLU-PRO": 0.7002 + } + }, + { + "id": "MaziyarPanahi/calme-2.5-qwen2-7b", + "name": "calme-2.5-qwen2-7b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3145, + "hfopenllm_v2/BBH": 0.4887, + "hfopenllm_v2/MATH Level 5": 0.2258, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4565, + "hfopenllm_v2/MMLU-PRO": 0.3682 + } + }, + { + "id": "MaziyarPanahi/calme-2.6-qwen2-7b", + "name": "calme-2.6-qwen2-7b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3443, + "hfopenllm_v2/BBH": 0.493, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4586, + "hfopenllm_v2/MMLU-PRO": 0.3732 + } + }, + { + "id": "MaziyarPanahi/calme-2.7-qwen2-7b", + "name": "calme-2.7-qwen2-7b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3592, + "hfopenllm_v2/BBH": 0.4883, + "hfopenllm_v2/MATH Level 5": 0.1382, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4824, + "hfopenllm_v2/MMLU-PRO": 0.3705 + } + }, + { + "id": "MaziyarPanahi/calme-3.1-baguette-3b", + "name": "calme-3.1-baguette-3b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6234, + "hfopenllm_v2/BBH": 0.4683, + "hfopenllm_v2/MATH Level 5": 0.256, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4008, + "hfopenllm_v2/MMLU-PRO": 0.3399 + } + }, + { + "id": "MaziyarPanahi/calme-3.1-instruct-3b", + "name": "calme-3.1-instruct-3b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4336, + "hfopenllm_v2/BBH": 0.4813, + "hfopenllm_v2/MATH Level 5": 0.1775, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.3952, + "hfopenllm_v2/MMLU-PRO": 0.3557 + } + }, + { + "id": "MaziyarPanahi/calme-3.1-instruct-78b", + "name": "calme-3.1-instruct-78b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8136, + "hfopenllm_v2/BBH": 0.7305, + "hfopenllm_v2/MATH Level 5": 0.3927, + "hfopenllm_v2/GPQA": 0.396, + "hfopenllm_v2/MUSR": 0.5891, + "hfopenllm_v2/MMLU-PRO": 0.7185 + } + }, + { + "id": "MaziyarPanahi/calme-3.1-llamaloi-3b", + "name": "calme-3.1-llamaloi-3b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7375, + "hfopenllm_v2/BBH": 0.4587, + "hfopenllm_v2/MATH Level 5": 0.173, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3515, + "hfopenllm_v2/MMLU-PRO": 0.3205 + } + }, + { + "id": "MaziyarPanahi/calme-3.2-baguette-3b", + "name": "calme-3.2-baguette-3b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6338, + "hfopenllm_v2/BBH": 0.4709, + "hfopenllm_v2/MATH Level 5": 0.2825, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4021, + "hfopenllm_v2/MMLU-PRO": 0.3338 + } + }, + { + "id": "MaziyarPanahi/calme-3.2-instruct-3b", + "name": "calme-3.2-instruct-3b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5533, + "hfopenllm_v2/BBH": 0.4866, + "hfopenllm_v2/MATH Level 5": 0.2168, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4047, + "hfopenllm_v2/MMLU-PRO": 0.3653 + } + }, + { + "id": "MaziyarPanahi/calme-3.2-instruct-78b", + "name": "calme-3.2-instruct-78b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8063, + "hfopenllm_v2/BBH": 0.7319, + "hfopenllm_v2/MATH Level 5": 0.4033, + "hfopenllm_v2/GPQA": 0.4027, + "hfopenllm_v2/MUSR": 0.6024, + "hfopenllm_v2/MMLU-PRO": 0.7303 + } + }, + { + "id": "MaziyarPanahi/calme-3.3-baguette-3b", + "name": "calme-3.3-baguette-3b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.636, + "hfopenllm_v2/BBH": 0.4678, + "hfopenllm_v2/MATH Level 5": 0.3807, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3928, + "hfopenllm_v2/MMLU-PRO": 0.3342 + } + }, + { + "id": "MaziyarPanahi/calme-3.3-instruct-3b", + "name": "calme-3.3-instruct-3b", + "developer": "MaziyarPanahi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6423, + "hfopenllm_v2/BBH": 0.4693, + "hfopenllm_v2/MATH Level 5": 0.3739, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4074, + "hfopenllm_v2/MMLU-PRO": 0.3305 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Meta.json b/data/developers/Meta.json new file mode 100644 index 0000000000000000000000000000000000000000..22227f5c93a37eaf8487a1078b002421f17c3dec --- /dev/null +++ b/data/developers/Meta.json @@ -0,0 +1,16 @@ +{ + "developer": "Meta", + "models": [ + { + "id": "meta/llama-4-maverick", + "name": "meta/llama-4-maverick", + "developer": "Meta", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.0, + "livecodebenchpro/Easy Problems": 0.09859154929577464 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Minami-su.json b/data/developers/Minami-su.json new file mode 100644 index 0000000000000000000000000000000000000000..ab1f6d2f21891005d8d34b394b96e4ef5daa1330 --- /dev/null +++ b/data/developers/Minami-su.json @@ -0,0 +1,75 @@ +{ + "developer": "Minami-su", + "models": [ + { + "id": "Minami-su/Amara-o1-7B-Qwen", + "name": "Amara-o1-7B-Qwen", + "developer": "Minami-su", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.739, + "hfopenllm_v2/BBH": 0.5199, + "hfopenllm_v2/MATH Level 5": 0.5181, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4007, + "hfopenllm_v2/MMLU-PRO": 0.4083 + } + }, + { + "id": "Minami-su/Amara-o2-7B-Qwen", + "name": "Amara-o2-7B-Qwen", + "developer": "Minami-su", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7147, + "hfopenllm_v2/BBH": 0.5173, + "hfopenllm_v2/MATH Level 5": 0.4086, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3781, + "hfopenllm_v2/MMLU-PRO": 0.4165 + } + }, + { + "id": "Minami-su/test-7B-00", + "name": "test-7B-00", + "developer": "Minami-su", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.669, + "hfopenllm_v2/BBH": 0.4466, + "hfopenllm_v2/MATH Level 5": 0.4517, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4126, + "hfopenllm_v2/MMLU-PRO": 0.3588 + } + }, + { + "id": "Minami-su/test-7B-01", + "name": "test-7B-01", + "developer": "Minami-su", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6736, + "hfopenllm_v2/BBH": 0.4422, + "hfopenllm_v2/MATH Level 5": 0.4554, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4153, + "hfopenllm_v2/MMLU-PRO": 0.3536 + } + }, + { + "id": "Minami-su/test-v2-7B-00", + "name": "test-v2-7B-00", + "developer": "Minami-su", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6747, + "hfopenllm_v2/BBH": 0.4416, + "hfopenllm_v2/MATH Level 5": 0.4418, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4154, + "hfopenllm_v2/MMLU-PRO": 0.3472 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/MiniMax.json b/data/developers/MiniMax.json new file mode 100644 index 0000000000000000000000000000000000000000..bf1e95583288fed674acdea5cb3109f4a1eaa232 --- /dev/null +++ b/data/developers/MiniMax.json @@ -0,0 +1,23 @@ +{ + "developer": "MiniMax", + "models": [ + { + "id": "minimax/minimax-m2", + "name": "MiniMax M2", + "developer": "MiniMax", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 30.0 + } + }, + { + "id": "minimax/minimax-m2.1", + "name": "MiniMax M2.1", + "developer": "MiniMax", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 29.2 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Minimax.json b/data/developers/Minimax.json new file mode 100644 index 0000000000000000000000000000000000000000..a47e6e8c5b7af8d07d5ab11c54cde104c2cb3327 --- /dev/null +++ b/data/developers/Minimax.json @@ -0,0 +1,14 @@ +{ + "developer": "Minimax", + "models": [ + { + "id": "minimax/minimax-m2.5", + "name": "Minimax m2.5", + "developer": "Minimax", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 42.2 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ModelCloud.json b/data/developers/ModelCloud.json new file mode 100644 index 0000000000000000000000000000000000000000..5f7a1ed1accb4f1de07451c248bdcb2c6e36288c --- /dev/null +++ b/data/developers/ModelCloud.json @@ -0,0 +1,19 @@ +{ + "developer": "ModelCloud", + "models": [ + { + "id": "ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1", + "name": "Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1", + "developer": "ModelCloud", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5269, + "hfopenllm_v2/BBH": 0.3253, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3249, + "hfopenllm_v2/MMLU-PRO": 0.1764 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ModelSpace.json b/data/developers/ModelSpace.json new file mode 100644 index 0000000000000000000000000000000000000000..e2d1333815369d20f17555f9d4482294426e5fec --- /dev/null +++ b/data/developers/ModelSpace.json @@ -0,0 +1,19 @@ +{ + "developer": "ModelSpace", + "models": [ + { + "id": "ModelSpace/GemmaX2-28-9B-v0.1", + "name": "GemmaX2-28-9B-v0.1", + "developer": "ModelSpace", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0039, + "hfopenllm_v2/BBH": 0.3687, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.3537, + "hfopenllm_v2/MMLU-PRO": 0.2231 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/MoonRide.json b/data/developers/MoonRide.json new file mode 100644 index 0000000000000000000000000000000000000000..4fbd1f8f52394253d5382b982b906769dfc07443 --- /dev/null +++ b/data/developers/MoonRide.json @@ -0,0 +1,19 @@ +{ + "developer": "MoonRide", + "models": [ + { + "id": "MoonRide/Llama-3.2-3B-Khelavaster", + "name": "Llama-3.2-3B-Khelavaster", + "developer": "MoonRide", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4925, + "hfopenllm_v2/BBH": 0.4516, + "hfopenllm_v2/MATH Level 5": 0.1616, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3699, + "hfopenllm_v2/MMLU-PRO": 0.3122 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Moonshot_AI.json b/data/developers/Moonshot_AI.json new file mode 100644 index 0000000000000000000000000000000000000000..d83f11402fcca0c39c33d09eed495f2aefd69384 --- /dev/null +++ b/data/developers/Moonshot_AI.json @@ -0,0 +1,23 @@ +{ + "developer": "Moonshot AI", + "models": [ + { + "id": "moonshot-ai/kimi-k2-instruct", + "name": "Kimi K2 Instruct", + "developer": "Moonshot AI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 26.7 + } + }, + { + "id": "moonshot-ai/kimi-k2-thinking", + "name": "Kimi K2 Thinking", + "developer": "Moonshot AI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 35.7 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Mostafa8Mehrabi.json b/data/developers/Mostafa8Mehrabi.json new file mode 100644 index 0000000000000000000000000000000000000000..26762a1f3e8dd9a339a35475797600f1bbbf801d --- /dev/null +++ b/data/developers/Mostafa8Mehrabi.json @@ -0,0 +1,19 @@ +{ + "developer": "Mostafa8Mehrabi", + "models": [ + { + "id": "Mostafa8Mehrabi/llama-3.2-1b-Insomnia-ChatBot-merged", + "name": "llama-3.2-1b-Insomnia-ChatBot-merged", + "developer": "Mostafa8Mehrabi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1321, + "hfopenllm_v2/BBH": 0.3004, + "hfopenllm_v2/MATH Level 5": 0.0076, + "hfopenllm_v2/GPQA": 0.2366, + "hfopenllm_v2/MUSR": 0.3382, + "hfopenllm_v2/MMLU-PRO": 0.1131 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/MrRobotoAI.json b/data/developers/MrRobotoAI.json new file mode 100644 index 0000000000000000000000000000000000000000..b9fdc0a7cabd982ae8ff8de14a4fc625e30515d7 --- /dev/null +++ b/data/developers/MrRobotoAI.json @@ -0,0 +1,33 @@ +{ + "developer": "MrRobotoAI", + "models": [ + { + "id": "MrRobotoAI/MrRoboto-ProLong-8b-v4i", + "name": "MrRoboto-ProLong-8b-v4i", + "developer": "MrRobotoAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3835, + "hfopenllm_v2/BBH": 0.4585, + "hfopenllm_v2/MATH Level 5": 0.0551, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4014, + "hfopenllm_v2/MMLU-PRO": 0.3068 + } + }, + { + "id": "MrRobotoAI/MrRoboto-ProLongBASE-pt8-unaligned-8b", + "name": "MrRoboto-ProLongBASE-pt8-unaligned-8b", + "developer": "MrRobotoAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3475, + "hfopenllm_v2/BBH": 0.4515, + "hfopenllm_v2/MATH Level 5": 0.0423, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4279, + "hfopenllm_v2/MMLU-PRO": 0.2566 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Multiple.json b/data/developers/Multiple.json new file mode 100644 index 0000000000000000000000000000000000000000..b464ffc342f6679111ee9b59dfbf7cef7cf4cfc7 --- /dev/null +++ b/data/developers/Multiple.json @@ -0,0 +1,14 @@ +{ + "developer": "Multiple", + "models": [ + { + "id": "multiple/multiple", + "name": "Multiple", + "developer": "Multiple", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 50.1 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/MultivexAI.json b/data/developers/MultivexAI.json new file mode 100644 index 0000000000000000000000000000000000000000..15d8d06c573f154d06df47c58778fb9bd81fb4fe --- /dev/null +++ b/data/developers/MultivexAI.json @@ -0,0 +1,75 @@ +{ + "developer": "MultivexAI", + "models": [ + { + "id": "MultivexAI/Gladiator-Mini-Exp-1211-3B", + "name": "Gladiator-Mini-Exp-1211-3B", + "developer": "MultivexAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6876, + "hfopenllm_v2/BBH": 0.4484, + "hfopenllm_v2/MATH Level 5": 0.1375, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.326, + "hfopenllm_v2/MMLU-PRO": 0.3152 + } + }, + { + "id": "MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct", + "name": "Gladiator-Mini-Exp-1221-3B-Instruct", + "developer": "MultivexAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6079, + "hfopenllm_v2/BBH": 0.437, + "hfopenllm_v2/MATH Level 5": 0.1352, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3115, + "hfopenllm_v2/MMLU-PRO": 0.3049 + } + }, + { + "id": "MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct-V2", + "name": "Gladiator-Mini-Exp-1221-3B-Instruct-V2", + "developer": "MultivexAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6215, + "hfopenllm_v2/BBH": 0.4389, + "hfopenllm_v2/MATH Level 5": 0.1412, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3008, + "hfopenllm_v2/MMLU-PRO": 0.3025 + } + }, + { + "id": "MultivexAI/Gladiator-Mini-Exp-1222-3B-Instruct", + "name": "Gladiator-Mini-Exp-1222-3B-Instruct", + "developer": "MultivexAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6163, + "hfopenllm_v2/BBH": 0.4373, + "hfopenllm_v2/MATH Level 5": 0.1412, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3128, + "hfopenllm_v2/MMLU-PRO": 0.3017 + } + }, + { + "id": "MultivexAI/Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF", + "name": "Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF", + "developer": "MultivexAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.144, + "hfopenllm_v2/BBH": 0.2908, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.255, + "hfopenllm_v2/MUSR": 0.3642, + "hfopenllm_v2/MMLU-PRO": 0.1109 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Mxode.json b/data/developers/Mxode.json new file mode 100644 index 0000000000000000000000000000000000000000..779e6ca24b641c50f0ea6e66402d308745ee5144 --- /dev/null +++ b/data/developers/Mxode.json @@ -0,0 +1,75 @@ +{ + "developer": "Mxode", + "models": [ + { + "id": "Mxode/NanoLM-0.3B-Instruct-v1", + "name": "NanoLM-0.3B-Instruct-v1", + "developer": "Mxode", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1537, + "hfopenllm_v2/BBH": 0.3028, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.4155, + "hfopenllm_v2/MMLU-PRO": 0.1105 + } + }, + { + "id": "Mxode/NanoLM-0.3B-Instruct-v1.1", + "name": "NanoLM-0.3B-Instruct-v1.1", + "developer": "Mxode", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1783, + "hfopenllm_v2/BBH": 0.3014, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.4273, + "hfopenllm_v2/MMLU-PRO": 0.1121 + } + }, + { + "id": "Mxode/NanoLM-0.3B-Instruct-v2", + "name": "NanoLM-0.3B-Instruct-v2", + "developer": "Mxode", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1668, + "hfopenllm_v2/BBH": 0.2921, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3955, + "hfopenllm_v2/MMLU-PRO": 0.1134 + } + }, + { + "id": "Mxode/NanoLM-1B-Instruct-v1.1", + "name": "NanoLM-1B-Instruct-v1.1", + "developer": "Mxode", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2395, + "hfopenllm_v2/BBH": 0.3184, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3433, + "hfopenllm_v2/MMLU-PRO": 0.1215 + } + }, + { + "id": "Mxode/NanoLM-1B-Instruct-v2", + "name": "NanoLM-1B-Instruct-v2", + "developer": "Mxode", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.263, + "hfopenllm_v2/BBH": 0.3123, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3552, + "hfopenllm_v2/MMLU-PRO": 0.1238 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/NAPS-ai.json b/data/developers/NAPS-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..60aa8bbe4d0698b9b04228c9b0a0cc45135553e4 --- /dev/null +++ b/data/developers/NAPS-ai.json @@ -0,0 +1,103 @@ +{ + "developer": "NAPS-ai", + "models": [ + { + "id": "NAPS-ai/naps-gemma-2-27b-v-0.1.0", + "name": "naps-gemma-2-27b-v-0.1.0", + "developer": "NAPS-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0, + "hfopenllm_v2/BBH": 0.2912, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3575, + "hfopenllm_v2/MMLU-PRO": 0.1168 + } + }, + { + "id": "NAPS-ai/naps-gemma-2-27b-v0.1.0", + "name": "naps-gemma-2-27b-v0.1.0", + "developer": "NAPS-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0, + "hfopenllm_v2/BBH": 0.2912, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3575, + "hfopenllm_v2/MMLU-PRO": 0.1168 + } + }, + { + "id": "NAPS-ai/naps-llama-3_1-8b-instruct-v0.3", + "name": "naps-llama-3_1-8b-instruct-v0.3", + "developer": "NAPS-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5391, + "hfopenllm_v2/BBH": 0.4901, + "hfopenllm_v2/MATH Level 5": 0.1903, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.3787, + "hfopenllm_v2/MMLU-PRO": 0.3398 + } + }, + { + "id": "NAPS-ai/naps-llama-3_1-8b-instruct-v0.4", + "name": "naps-llama-3_1-8b-instruct-v0.4", + "developer": "NAPS-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7344, + "hfopenllm_v2/BBH": 0.4862, + "hfopenllm_v2/MATH Level 5": 0.1964, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.4421, + "hfopenllm_v2/MMLU-PRO": 0.3475 + } + }, + { + "id": "NAPS-ai/naps-llama-3_1-instruct-v0.5.0", + "name": "naps-llama-3_1-instruct-v0.5.0", + "developer": "NAPS-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.502, + "hfopenllm_v2/BBH": 0.4148, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3713, + "hfopenllm_v2/MMLU-PRO": 0.2614 + } + }, + { + "id": "NAPS-ai/naps-llama-3_1_instruct-v0.6.0", + "name": "naps-llama-3_1_instruct-v0.6.0", + "developer": "NAPS-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.328, + "hfopenllm_v2/BBH": 0.4528, + "hfopenllm_v2/MATH Level 5": 0.0642, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.3739, + "hfopenllm_v2/MMLU-PRO": 0.3241 + } + }, + { + "id": "NAPS-ai/naps-llama3.1-70B-v0.2-fp16", + "name": "naps-llama3.1-70B-v0.2-fp16", + "developer": "NAPS-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1845, + "hfopenllm_v2/BBH": 0.3041, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2391, + "hfopenllm_v2/MUSR": 0.3486, + "hfopenllm_v2/MMLU-PRO": 0.1099 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/NCSOFT.json b/data/developers/NCSOFT.json new file mode 100644 index 0000000000000000000000000000000000000000..5cc19c52d2bf4d85f74e73a3056f8da048c42869 --- /dev/null +++ b/data/developers/NCSOFT.json @@ -0,0 +1,50 @@ +{ + "developer": "NCSOFT", + "models": [ + { + "id": "NCSOFT/Llama-3-OffsetBias-8B", + "name": "NCSOFT/Llama-3-OffsetBias-8B", + "developer": "NCSOFT", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8397, + "reward-bench/Chat": 0.9246, + "reward-bench/Chat Hard": 0.8026, + "reward-bench/Safety": 0.8676, + "reward-bench/Reasoning": 0.7639 + } + }, + { + "id": "NCSOFT/Llama-3-OffsetBias-RM-8B", + "name": "NCSOFT/Llama-3-OffsetBias-RM-8B", + "developer": "NCSOFT", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.648, + "reward-bench/Chat": 0.9721, + "reward-bench/Chat Hard": 0.818, + "reward-bench/Safety": 0.7222, + "reward-bench/Reasoning": 0.9192, + "reward-bench/Factuality": 0.6084, + "reward-bench/Precise IF": 0.4, + "reward-bench/Math": 0.5191, + "reward-bench/Focus": 0.9596, + "reward-bench/Ties": 0.6786 + } + }, + { + "id": "NCSOFT/Llama-VARCO-8B-Instruct", + "name": "Llama-VARCO-8B-Instruct", + "developer": "NCSOFT", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.447, + "hfopenllm_v2/BBH": 0.5023, + "hfopenllm_v2/MATH Level 5": 0.1065, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.3841, + "hfopenllm_v2/MMLU-PRO": 0.319 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/NJS26.json b/data/developers/NJS26.json new file mode 100644 index 0000000000000000000000000000000000000000..c8c02a590067afd0a7615bfcb7a0d012a7ca1ad9 --- /dev/null +++ b/data/developers/NJS26.json @@ -0,0 +1,19 @@ +{ + "developer": "NJS26", + "models": [ + { + "id": "NJS26/NJS_777", + "name": "NJS_777", + "developer": "NJS26", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1881, + "hfopenllm_v2/BBH": 0.2178, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2064, + "hfopenllm_v2/MUSR": 0.3538, + "hfopenllm_v2/MMLU-PRO": 0.1163 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/NLPark.json b/data/developers/NLPark.json new file mode 100644 index 0000000000000000000000000000000000000000..75347595685858a623f9524c45e52afaf614b395 --- /dev/null +++ b/data/developers/NLPark.json @@ -0,0 +1,47 @@ +{ + "developer": "NLPark", + "models": [ + { + "id": "NLPark/AnFeng_v3.1-Avocet", + "name": "AnFeng_v3.1-Avocet", + "developer": "NLPark", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5096, + "hfopenllm_v2/BBH": 0.5829, + "hfopenllm_v2/MATH Level 5": 0.1594, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4476, + "hfopenllm_v2/MMLU-PRO": 0.4438 + } + }, + { + "id": "NLPark/B-and-W_Flycatcher-3AD1E", + "name": "B-and-W_Flycatcher-3AD1E", + "developer": "NLPark", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4908, + "hfopenllm_v2/BBH": 0.6065, + "hfopenllm_v2/MATH Level 5": 0.2379, + "hfopenllm_v2/GPQA": 0.3305, + "hfopenllm_v2/MUSR": 0.4423, + "hfopenllm_v2/MMLU-PRO": 0.4741 + } + }, + { + "id": "NLPark/Shi-Ci-Robin-Test_3AD80", + "name": "Shi-Ci-Robin-Test_3AD80", + "developer": "NLPark", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7227, + "hfopenllm_v2/BBH": 0.6705, + "hfopenllm_v2/MATH Level 5": 0.3157, + "hfopenllm_v2/GPQA": 0.3599, + "hfopenllm_v2/MUSR": 0.4696, + "hfopenllm_v2/MMLU-PRO": 0.5121 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/NTQAI.json b/data/developers/NTQAI.json new file mode 100644 index 0000000000000000000000000000000000000000..cd7f75e55ae2a1e6514129a58d36855b53f5dc30 --- /dev/null +++ b/data/developers/NTQAI.json @@ -0,0 +1,33 @@ +{ + "developer": "NTQAI", + "models": [ + { + "id": "NTQAI/NxMobileLM-1.5B-SFT", + "name": "NxMobileLM-1.5B-SFT", + "developer": "NTQAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6392, + "hfopenllm_v2/BBH": 0.3957, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3555, + "hfopenllm_v2/MMLU-PRO": 0.2817 + } + }, + { + "id": "NTQAI/Nxcode-CQ-7B-orpo", + "name": "Nxcode-CQ-7B-orpo", + "developer": "NTQAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4007, + "hfopenllm_v2/BBH": 0.4143, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.394, + "hfopenllm_v2/MMLU-PRO": 0.1612 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/NYTK.json b/data/developers/NYTK.json new file mode 100644 index 0000000000000000000000000000000000000000..c8a4ec05353b2f4afd23852adae2b3d98bfecb55 --- /dev/null +++ b/data/developers/NYTK.json @@ -0,0 +1,33 @@ +{ + "developer": "NYTK", + "models": [ + { + "id": "NYTK/PULI-GPTrio", + "name": "PULI-GPTrio", + "developer": "NYTK", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.218, + "hfopenllm_v2/BBH": 0.306, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3819, + "hfopenllm_v2/MMLU-PRO": 0.1137 + } + }, + { + "id": "NYTK/PULI-LlumiX-32K", + "name": "PULI-LlumiX-32K", + "developer": "NYTK", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.17, + "hfopenllm_v2/BBH": 0.3189, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3964, + "hfopenllm_v2/MMLU-PRO": 0.1681 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Naveenpoliasetty.json b/data/developers/Naveenpoliasetty.json new file mode 100644 index 0000000000000000000000000000000000000000..262b795646cd38997b32729c79e40e74b7a63971 --- /dev/null +++ b/data/developers/Naveenpoliasetty.json @@ -0,0 +1,19 @@ +{ + "developer": "Naveenpoliasetty", + "models": [ + { + "id": "Naveenpoliasetty/llama3-8B-V2", + "name": "llama3-8B-V2", + "developer": "Naveenpoliasetty", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4123, + "hfopenllm_v2/BBH": 0.5189, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4081, + "hfopenllm_v2/MMLU-PRO": 0.3738 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/NbAiLab.json b/data/developers/NbAiLab.json new file mode 100644 index 0000000000000000000000000000000000000000..8989f7d6b04d7c1110a46b27b27e21fb7b728650 --- /dev/null +++ b/data/developers/NbAiLab.json @@ -0,0 +1,33 @@ +{ + "developer": "NbAiLab", + "models": [ + { + "id": "NbAiLab/nb-llama-3.1-8B-Instruct", + "name": "nb-llama-3.1-8B-Instruct", + "developer": "NbAiLab", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3625, + "hfopenllm_v2/BBH": 0.3247, + "hfopenllm_v2/MATH Level 5": 0.0227, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3208, + "hfopenllm_v2/MMLU-PRO": 0.1197 + } + }, + { + "id": "NbAiLab/nb-llama-3.1-8B-sft", + "name": "nb-llama-3.1-8B-sft", + "developer": "NbAiLab", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3616, + "hfopenllm_v2/BBH": 0.3282, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3287, + "hfopenllm_v2/MMLU-PRO": 0.1222 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Nekochu.json b/data/developers/Nekochu.json new file mode 100644 index 0000000000000000000000000000000000000000..1e65c9938b3c11c108beda6837e1a35b296ff35d --- /dev/null +++ b/data/developers/Nekochu.json @@ -0,0 +1,61 @@ +{ + "developer": "Nekochu", + "models": [ + { + "id": "Nekochu/Llama-3.1-8B-German-ORPO", + "name": "Llama-3.1-8B-German-ORPO", + "developer": "Nekochu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4611, + "hfopenllm_v2/BBH": 0.4983, + "hfopenllm_v2/MATH Level 5": 0.1171, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4647, + "hfopenllm_v2/MMLU-PRO": 0.3393 + } + }, + { + "id": "Nekochu/Llama-3.1-8B-french-DPO", + "name": "Llama-3.1-8B-french-DPO", + "developer": "Nekochu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4656, + "hfopenllm_v2/BBH": 0.5111, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4216, + "hfopenllm_v2/MMLU-PRO": 0.3414 + } + }, + { + "id": "Nekochu/Luminia-13B-v3", + "name": "Luminia-13B-v3", + "developer": "Nekochu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2523, + "hfopenllm_v2/BBH": 0.4112, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3983, + "hfopenllm_v2/MMLU-PRO": 0.2215 + } + }, + { + "id": "Nekochu/Luminia-8B-RP", + "name": "Luminia-8B-RP", + "developer": "Nekochu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5574, + "hfopenllm_v2/BBH": 0.5218, + "hfopenllm_v2/MATH Level 5": 0.136, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.3998, + "hfopenllm_v2/MMLU-PRO": 0.3631 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/NeverSleep.json b/data/developers/NeverSleep.json new file mode 100644 index 0000000000000000000000000000000000000000..42fe13428db7943afea84088f1b314bfebca83f1 --- /dev/null +++ b/data/developers/NeverSleep.json @@ -0,0 +1,33 @@ +{ + "developer": "NeverSleep", + "models": [ + { + "id": "NeverSleep/Lumimaid-v0.2-12B", + "name": "Lumimaid-v0.2-12B", + "developer": "NeverSleep", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1099, + "hfopenllm_v2/BBH": 0.5396, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4821, + "hfopenllm_v2/MMLU-PRO": 0.3511 + } + }, + { + "id": "NeverSleep/Lumimaid-v0.2-8B", + "name": "Lumimaid-v0.2-8B", + "developer": "NeverSleep", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5038, + "hfopenllm_v2/BBH": 0.5238, + "hfopenllm_v2/MATH Level 5": 0.1435, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4303, + "hfopenllm_v2/MMLU-PRO": 0.3636 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Nexesenex.json b/data/developers/Nexesenex.json new file mode 100644 index 0000000000000000000000000000000000000000..4174762106605b5f049547c900f6c44fdfde43ef --- /dev/null +++ b/data/developers/Nexesenex.json @@ -0,0 +1,635 @@ +{ + "developer": "Nexesenex", + "models": [ + { + "id": "Nexesenex/Dolphin3.0-Llama3.1-1B-abliterated", + "name": "Dolphin3.0-Llama3.1-1B-abliterated", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5312, + "hfopenllm_v2/BBH": 0.3241, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2408, + "hfopenllm_v2/MUSR": 0.3237, + "hfopenllm_v2/MMLU-PRO": 0.1373 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_DeepDive_3_Prev_v1.0", + "name": "Llama_3.1_8b_DeepDive_3_Prev_v1.0", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6809, + "hfopenllm_v2/BBH": 0.5155, + "hfopenllm_v2/MATH Level 5": 0.1866, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.3666, + "hfopenllm_v2/MMLU-PRO": 0.3438 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0", + "name": "Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7101, + "hfopenllm_v2/BBH": 0.512, + "hfopenllm_v2/MATH Level 5": 0.1926, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.3758, + "hfopenllm_v2/MMLU-PRO": 0.3441 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_DobHerWild_R1_v1.1R", + "name": "Llama_3.1_8b_DobHerWild_R1_v1.1R", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.76, + "hfopenllm_v2/BBH": 0.5257, + "hfopenllm_v2/MATH Level 5": 0.2319, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.3852, + "hfopenllm_v2/MMLU-PRO": 0.3688 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_DoberWild_v2.01", + "name": "Llama_3.1_8b_DoberWild_v2.01", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7996, + "hfopenllm_v2/BBH": 0.5251, + "hfopenllm_v2/MATH Level 5": 0.2002, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4012, + "hfopenllm_v2/MMLU-PRO": 0.3791 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_DoberWild_v2.02", + "name": "Llama_3.1_8b_DoberWild_v2.02", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7746, + "hfopenllm_v2/BBH": 0.5313, + "hfopenllm_v2/MATH Level 5": 0.1994, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3946, + "hfopenllm_v2/MMLU-PRO": 0.3764 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_DoberWild_v2.03", + "name": "Llama_3.1_8b_DoberWild_v2.03", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7764, + "hfopenllm_v2/BBH": 0.5294, + "hfopenllm_v2/MATH Level 5": 0.2077, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.3906, + "hfopenllm_v2/MMLU-PRO": 0.3722 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.01", + "name": "Llama_3.1_8b_DodoWild_v2.01", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7978, + "hfopenllm_v2/BBH": 0.5253, + "hfopenllm_v2/MATH Level 5": 0.1986, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.409, + "hfopenllm_v2/MMLU-PRO": 0.3738 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.02", + "name": "Llama_3.1_8b_DodoWild_v2.02", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8017, + "hfopenllm_v2/BBH": 0.5262, + "hfopenllm_v2/MATH Level 5": 0.2273, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.3971, + "hfopenllm_v2/MMLU-PRO": 0.3761 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.03", + "name": "Llama_3.1_8b_DodoWild_v2.03", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7941, + "hfopenllm_v2/BBH": 0.5308, + "hfopenllm_v2/MATH Level 5": 0.2221, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.3959, + "hfopenllm_v2/MMLU-PRO": 0.3786 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.10", + "name": "Llama_3.1_8b_DodoWild_v2.10", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8054, + "hfopenllm_v2/BBH": 0.5278, + "hfopenllm_v2/MATH Level 5": 0.1971, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4157, + "hfopenllm_v2/MMLU-PRO": 0.3855 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.01", + "name": "Llama_3.1_8b_Dolermed_R1_V1.01", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7534, + "hfopenllm_v2/BBH": 0.5312, + "hfopenllm_v2/MATH Level 5": 0.2017, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.3747, + "hfopenllm_v2/MMLU-PRO": 0.3733 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.03", + "name": "Llama_3.1_8b_Dolermed_R1_V1.03", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7564, + "hfopenllm_v2/BBH": 0.5316, + "hfopenllm_v2/MATH Level 5": 0.2092, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.38, + "hfopenllm_v2/MMLU-PRO": 0.372 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_Dolermed_V1.01", + "name": "Llama_3.1_8b_Dolermed_V1.01", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5087, + "hfopenllm_v2/BBH": 0.5194, + "hfopenllm_v2/MATH Level 5": 0.1344, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3945, + "hfopenllm_v2/MMLU-PRO": 0.357 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_Dolerstormed_V1.04", + "name": "Llama_3.1_8b_Dolerstormed_V1.04", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7889, + "hfopenllm_v2/BBH": 0.5195, + "hfopenllm_v2/MATH Level 5": 0.1926, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.403, + "hfopenllm_v2/MMLU-PRO": 0.3889 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_Hermedash_R1_V1.04", + "name": "Llama_3.1_8b_Hermedash_R1_V1.04", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7872, + "hfopenllm_v2/BBH": 0.5192, + "hfopenllm_v2/MATH Level 5": 0.1866, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4111, + "hfopenllm_v2/MMLU-PRO": 0.3882 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.01", + "name": "Llama_3.1_8b_Hermedive_R1_V1.01", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5001, + "hfopenllm_v2/BBH": 0.5171, + "hfopenllm_v2/MATH Level 5": 0.1775, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4008, + "hfopenllm_v2/MMLU-PRO": 0.3427 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.03", + "name": "Llama_3.1_8b_Hermedive_R1_V1.03", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6648, + "hfopenllm_v2/BBH": 0.5141, + "hfopenllm_v2/MATH Level 5": 0.1858, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.3613, + "hfopenllm_v2/MMLU-PRO": 0.3488 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_Hermedive_V1.01", + "name": "Llama_3.1_8b_Hermedive_V1.01", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5062, + "hfopenllm_v2/BBH": 0.4918, + "hfopenllm_v2/MATH Level 5": 0.1647, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3697, + "hfopenllm_v2/MMLU-PRO": 0.3551 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_Mediver_V1.01", + "name": "Llama_3.1_8b_Mediver_V1.01", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1885, + "hfopenllm_v2/BBH": 0.4415, + "hfopenllm_v2/MATH Level 5": 0.0015, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3898, + "hfopenllm_v2/MMLU-PRO": 0.2994 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_Medusa_v1.01", + "name": "Llama_3.1_8b_Medusa_v1.01", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7685, + "hfopenllm_v2/BBH": 0.5018, + "hfopenllm_v2/MATH Level 5": 0.1465, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4067, + "hfopenllm_v2/MMLU-PRO": 0.3531 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_Smarteaz_0.2_R1", + "name": "Llama_3.1_8b_Smarteaz_0.2_R1", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6346, + "hfopenllm_v2/BBH": 0.5113, + "hfopenllm_v2/MATH Level 5": 0.2606, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4188, + "hfopenllm_v2/MMLU-PRO": 0.3645 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_Smarteaz_V1.01", + "name": "Llama_3.1_8b_Smarteaz_V1.01", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8151, + "hfopenllm_v2/BBH": 0.5241, + "hfopenllm_v2/MATH Level 5": 0.2341, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.3789, + "hfopenllm_v2/MMLU-PRO": 0.3736 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_Stormeder_v1.04", + "name": "Llama_3.1_8b_Stormeder_v1.04", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7853, + "hfopenllm_v2/BBH": 0.5207, + "hfopenllm_v2/MATH Level 5": 0.185, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.3949, + "hfopenllm_v2/MMLU-PRO": 0.3852 + } + }, + { + "id": "Nexesenex/Llama_3.1_8b_Typhoon_v1.03", + "name": "Llama_3.1_8b_Typhoon_v1.03", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8078, + "hfopenllm_v2/BBH": 0.5314, + "hfopenllm_v2/MATH Level 5": 0.2273, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.3815, + "hfopenllm_v2/MMLU-PRO": 0.3842 + } + }, + { + "id": "Nexesenex/Llama_3.2_1b_AquaSyn_0.1", + "name": "Llama_3.2_1b_AquaSyn_0.1", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2741, + "hfopenllm_v2/BBH": 0.3284, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.346, + "hfopenllm_v2/MMLU-PRO": 0.1378 + } + }, + { + "id": "Nexesenex/Llama_3.2_1b_AquaSyn_0.11", + "name": "Llama_3.2_1b_AquaSyn_0.11", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2431, + "hfopenllm_v2/BBH": 0.3112, + "hfopenllm_v2/MATH Level 5": 0.0234, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3368, + "hfopenllm_v2/MMLU-PRO": 0.1116 + } + }, + { + "id": "Nexesenex/Llama_3.2_1b_Dolto_0.1", + "name": "Llama_3.2_1b_Dolto_0.1", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5434, + "hfopenllm_v2/BBH": 0.335, + "hfopenllm_v2/MATH Level 5": 0.037, + "hfopenllm_v2/GPQA": 0.2374, + "hfopenllm_v2/MUSR": 0.3421, + "hfopenllm_v2/MMLU-PRO": 0.1364 + } + }, + { + "id": "Nexesenex/Llama_3.2_1b_Odyssea_V1", + "name": "Llama_3.2_1b_Odyssea_V1", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2553, + "hfopenllm_v2/BBH": 0.301, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3394, + "hfopenllm_v2/MMLU-PRO": 0.1153 + } + }, + { + "id": "Nexesenex/Llama_3.2_1b_Odyssea_V1.01", + "name": "Llama_3.2_1b_Odyssea_V1.01", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2495, + "hfopenllm_v2/BBH": 0.3045, + "hfopenllm_v2/MATH Level 5": 0.0174, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.342, + "hfopenllm_v2/MMLU-PRO": 0.1152 + } + }, + { + "id": "Nexesenex/Llama_3.2_1b_OpenTree_R1_0.1", + "name": "Llama_3.2_1b_OpenTree_R1_0.1", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5366, + "hfopenllm_v2/BBH": 0.328, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3131, + "hfopenllm_v2/MMLU-PRO": 0.1675 + } + }, + { + "id": "Nexesenex/Llama_3.2_1b_OrcaSun_V1", + "name": "Llama_3.2_1b_OrcaSun_V1", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5949, + "hfopenllm_v2/BBH": 0.355, + "hfopenllm_v2/MATH Level 5": 0.0597, + "hfopenllm_v2/GPQA": 0.2366, + "hfopenllm_v2/MUSR": 0.338, + "hfopenllm_v2/MMLU-PRO": 0.1904 + } + }, + { + "id": "Nexesenex/Llama_3.2_1b_RandomLego_RP_R1_0.1", + "name": "Llama_3.2_1b_RandomLego_RP_R1_0.1", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5543, + "hfopenllm_v2/BBH": 0.3428, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3249, + "hfopenllm_v2/MMLU-PRO": 0.1563 + } + }, + { + "id": "Nexesenex/Llama_3.2_1b_SunOrca_V1", + "name": "Llama_3.2_1b_SunOrca_V1", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.543, + "hfopenllm_v2/BBH": 0.3431, + "hfopenllm_v2/MATH Level 5": 0.0672, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1884 + } + }, + { + "id": "Nexesenex/Llama_3.2_1b_Sydonia_0.1", + "name": "Llama_3.2_1b_Sydonia_0.1", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2197, + "hfopenllm_v2/BBH": 0.3121, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2282, + "hfopenllm_v2/MUSR": 0.3382, + "hfopenllm_v2/MMLU-PRO": 0.1224 + } + }, + { + "id": "Nexesenex/Llama_3.2_1b_Syneridol_0.2", + "name": "Llama_3.2_1b_Syneridol_0.2", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2157, + "hfopenllm_v2/BBH": 0.3139, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2349, + "hfopenllm_v2/MUSR": 0.3343, + "hfopenllm_v2/MMLU-PRO": 0.1227 + } + }, + { + "id": "Nexesenex/Llama_3.2_1b_Synopsys_0.1", + "name": "Llama_3.2_1b_Synopsys_0.1", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1764, + "hfopenllm_v2/BBH": 0.3162, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2391, + "hfopenllm_v2/MUSR": 0.3461, + "hfopenllm_v2/MMLU-PRO": 0.1231 + } + }, + { + "id": "Nexesenex/Llama_3.2_1b_Synopsys_0.11", + "name": "Llama_3.2_1b_Synopsys_0.11", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2842, + "hfopenllm_v2/BBH": 0.3102, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3513, + "hfopenllm_v2/MMLU-PRO": 0.1123 + } + }, + { + "id": "Nexesenex/Llama_3.2_3b_Kermes_v1", + "name": "Llama_3.2_3b_Kermes_v1", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4852, + "hfopenllm_v2/BBH": 0.441, + "hfopenllm_v2/MATH Level 5": 0.031, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.407, + "hfopenllm_v2/MMLU-PRO": 0.2547 + } + }, + { + "id": "Nexesenex/Llama_3.2_3b_Kermes_v2", + "name": "Llama_3.2_3b_Kermes_v2", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5754, + "hfopenllm_v2/BBH": 0.4455, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3778, + "hfopenllm_v2/MMLU-PRO": 0.2734 + } + }, + { + "id": "Nexesenex/Llama_3.2_3b_Kermes_v2.1", + "name": "Llama_3.2_3b_Kermes_v2.1", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5584, + "hfopenllm_v2/BBH": 0.4464, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3964, + "hfopenllm_v2/MMLU-PRO": 0.2692 + } + }, + { + "id": "Nexesenex/Nemotron_W_4b_Halo_0.1", + "name": "Nemotron_W_4b_Halo_0.1", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3627, + "hfopenllm_v2/BBH": 0.4135, + "hfopenllm_v2/MATH Level 5": 0.0423, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.4165, + "hfopenllm_v2/MMLU-PRO": 0.2505 + } + }, + { + "id": "Nexesenex/Nemotron_W_4b_MagLight_0.1", + "name": "Nemotron_W_4b_MagLight_0.1", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.423, + "hfopenllm_v2/BBH": 0.4231, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4112, + "hfopenllm_v2/MMLU-PRO": 0.2545 + } + }, + { + "id": "Nexesenex/Qwen_2.5_3b_Smarteaz_0.01a", + "name": "Qwen_2.5_3b_Smarteaz_0.01a", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4012, + "hfopenllm_v2/BBH": 0.4637, + "hfopenllm_v2/MATH Level 5": 0.1805, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.432, + "hfopenllm_v2/MMLU-PRO": 0.286 + } + }, + { + "id": "Nexesenex/pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL", + "name": "pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL", + "developer": "Nexesenex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.589, + "hfopenllm_v2/BBH": 0.3562, + "hfopenllm_v2/MATH Level 5": 0.0748, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3396, + "hfopenllm_v2/MMLU-PRO": 0.1803 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Nexusflow.json b/data/developers/Nexusflow.json new file mode 100644 index 0000000000000000000000000000000000000000..2f78cab8780e463261ce35ef251c295eb3f3fd0f --- /dev/null +++ b/data/developers/Nexusflow.json @@ -0,0 +1,38 @@ +{ + "developer": "Nexusflow", + "models": [ + { + "id": "Nexusflow/NexusRaven-V2-13B", + "name": "NexusRaven-V2-13B", + "developer": "Nexusflow", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1791, + "hfopenllm_v2/BBH": 0.3949, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3737, + "hfopenllm_v2/MMLU-PRO": 0.1872 + } + }, + { + "id": "Nexusflow/Starling-RM-34B", + "name": "Nexusflow/Starling-RM-34B", + "developer": "Nexusflow", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8133, + "reward-bench/Factuality": 0.4589, + "reward-bench/Precise IF": 0.3187, + "reward-bench/Math": 0.6175, + "reward-bench/Safety": 0.877, + "reward-bench/Focus": 0.4808, + "reward-bench/Ties": 0.1004, + "reward-bench/Chat": 0.9693, + "reward-bench/Chat Hard": 0.5724, + "reward-bench/Reasoning": 0.8845, + "reward-bench/Prior Sets (0.5 weight)": 0.7137 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/NikolaSigmoid.json b/data/developers/NikolaSigmoid.json new file mode 100644 index 0000000000000000000000000000000000000000..6140d28ba58f1da5c6d116238c2479fcd94e6be4 --- /dev/null +++ b/data/developers/NikolaSigmoid.json @@ -0,0 +1,103 @@ +{ + "developer": "NikolaSigmoid", + "models": [ + { + "id": "NikolaSigmoid/AceMath-1.5B-Instruct-1epoch", + "name": "AceMath-1.5B-Instruct-1epoch", + "developer": "NikolaSigmoid", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2849, + "hfopenllm_v2/BBH": 0.4263, + "hfopenllm_v2/MATH Level 5": 0.3051, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3925, + "hfopenllm_v2/MMLU-PRO": 0.2376 + } + }, + { + "id": "NikolaSigmoid/AceMath-1.5B-Instruct-dolphin-r1-200", + "name": "AceMath-1.5B-Instruct-dolphin-r1-200", + "developer": "NikolaSigmoid", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1808, + "hfopenllm_v2/BBH": 0.2815, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.375, + "hfopenllm_v2/MMLU-PRO": 0.1143 + } + }, + { + "id": "NikolaSigmoid/DeepSeek-R1-Distill-Qwen-1.5B-500", + "name": "DeepSeek-R1-Distill-Qwen-1.5B-500", + "developer": "NikolaSigmoid", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1749, + "hfopenllm_v2/BBH": 0.2602, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.338, + "hfopenllm_v2/MMLU-PRO": 0.1125 + } + }, + { + "id": "NikolaSigmoid/acemath-200", + "name": "acemath-200", + "developer": "NikolaSigmoid", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2849, + "hfopenllm_v2/BBH": 0.4263, + "hfopenllm_v2/MATH Level 5": 0.3051, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3925, + "hfopenllm_v2/MMLU-PRO": 0.2376 + } + }, + { + "id": "NikolaSigmoid/phi-4-14b", + "name": "phi-4-14b", + "developer": "NikolaSigmoid", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0561, + "hfopenllm_v2/BBH": 0.6695, + "hfopenllm_v2/MATH Level 5": 0.2938, + "hfopenllm_v2/GPQA": 0.4035, + "hfopenllm_v2/MUSR": 0.5047, + "hfopenllm_v2/MMLU-PRO": 0.5278 + } + }, + { + "id": "NikolaSigmoid/phi-4-1steps", + "name": "phi-4-1steps", + "developer": "NikolaSigmoid", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0528, + "hfopenllm_v2/BBH": 0.6707, + "hfopenllm_v2/MATH Level 5": 0.2983, + "hfopenllm_v2/GPQA": 0.4018, + "hfopenllm_v2/MUSR": 0.5021, + "hfopenllm_v2/MMLU-PRO": 0.5273 + } + }, + { + "id": "NikolaSigmoid/phi-4-300steps", + "name": "phi-4-300steps", + "developer": "NikolaSigmoid", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0561, + "hfopenllm_v2/BBH": 0.6701, + "hfopenllm_v2/MATH Level 5": 0.2946, + "hfopenllm_v2/GPQA": 0.4052, + "hfopenllm_v2/MUSR": 0.5034, + "hfopenllm_v2/MMLU-PRO": 0.5288 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Nitral-AI.json b/data/developers/Nitral-AI.json new file mode 100644 index 0000000000000000000000000000000000000000..c059df98d4988906f6fc0b24fb49a31144b6a067 --- /dev/null +++ b/data/developers/Nitral-AI.json @@ -0,0 +1,117 @@ +{ + "developer": "Nitral-AI", + "models": [ + { + "id": "Nitral-AI/Captain-Eris-BMO_Violent-GRPO-v0.420", + "name": "Captain-Eris-BMO_Violent-GRPO-v0.420", + "developer": "Nitral-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6313, + "hfopenllm_v2/BBH": 0.5079, + "hfopenllm_v2/MATH Level 5": 0.1314, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4228, + "hfopenllm_v2/MMLU-PRO": 0.3596 + } + }, + { + "id": "Nitral-AI/Captain-Eris_BMO-Violent-12B", + "name": "Captain-Eris_BMO-Violent-12B", + "developer": "Nitral-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6152, + "hfopenllm_v2/BBH": 0.5104, + "hfopenllm_v2/MATH Level 5": 0.1367, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4255, + "hfopenllm_v2/MMLU-PRO": 0.3571 + } + }, + { + "id": "Nitral-AI/Captain-Eris_Violet-GRPO-v0.420", + "name": "Captain-Eris_Violet-GRPO-v0.420", + "developer": "Nitral-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6262, + "hfopenllm_v2/BBH": 0.5159, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4279, + "hfopenllm_v2/MMLU-PRO": 0.3535 + } + }, + { + "id": "Nitral-AI/Captain-Eris_Violet-V0.420-12B", + "name": "Captain-Eris_Violet-V0.420-12B", + "developer": "Nitral-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4339, + "hfopenllm_v2/BBH": 0.5478, + "hfopenllm_v2/MATH Level 5": 0.1073, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4331, + "hfopenllm_v2/MMLU-PRO": 0.3723 + } + }, + { + "id": "Nitral-AI/Captain_BMO-12B", + "name": "Captain_BMO-12B", + "developer": "Nitral-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4751, + "hfopenllm_v2/BBH": 0.5286, + "hfopenllm_v2/MATH Level 5": 0.1397, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.3748, + "hfopenllm_v2/MMLU-PRO": 0.3569 + } + }, + { + "id": "Nitral-AI/Hathor_Stable-v0.2-L3-8B", + "name": "Hathor_Stable-v0.2-L3-8B", + "developer": "Nitral-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7175, + "hfopenllm_v2/BBH": 0.5286, + "hfopenllm_v2/MATH Level 5": 0.105, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.3781, + "hfopenllm_v2/MMLU-PRO": 0.3696 + } + }, + { + "id": "Nitral-AI/Hathor_Tahsin-L3-8B-v0.85", + "name": "Hathor_Tahsin-L3-8B-v0.85", + "developer": "Nitral-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.711, + "hfopenllm_v2/BBH": 0.5279, + "hfopenllm_v2/MATH Level 5": 0.1005, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.3647, + "hfopenllm_v2/MMLU-PRO": 0.372 + } + }, + { + "id": "Nitral-AI/Nera_Noctis-12B", + "name": "Nera_Noctis-12B", + "developer": "Nitral-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4562, + "hfopenllm_v2/BBH": 0.5194, + "hfopenllm_v2/MATH Level 5": 0.0876, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3979, + "hfopenllm_v2/MMLU-PRO": 0.3468 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Nohobby.json b/data/developers/Nohobby.json new file mode 100644 index 0000000000000000000000000000000000000000..1f97b72706347a4d3531ecbf4b11cc2c3082e4d6 --- /dev/null +++ b/data/developers/Nohobby.json @@ -0,0 +1,33 @@ +{ + "developer": "Nohobby", + "models": [ + { + "id": "Nohobby/MS-Schisandra-22B-v0.1", + "name": "MS-Schisandra-22B-v0.1", + "developer": "Nohobby", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6331, + "hfopenllm_v2/BBH": 0.579, + "hfopenllm_v2/MATH Level 5": 0.2228, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.3928, + "hfopenllm_v2/MMLU-PRO": 0.4096 + } + }, + { + "id": "Nohobby/MS-Schisandra-22B-v0.2", + "name": "MS-Schisandra-22B-v0.2", + "developer": "Nohobby", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6383, + "hfopenllm_v2/BBH": 0.5841, + "hfopenllm_v2/MATH Level 5": 0.2032, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.4075, + "hfopenllm_v2/MMLU-PRO": 0.4136 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Norquinal.json b/data/developers/Norquinal.json new file mode 100644 index 0000000000000000000000000000000000000000..cce1ebee228114767c2a6061ae2d77857f5702c6 --- /dev/null +++ b/data/developers/Norquinal.json @@ -0,0 +1,117 @@ +{ + "developer": "Norquinal", + "models": [ + { + "id": "Norquinal/Alpha", + "name": "Alpha", + "developer": "Norquinal", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2803, + "hfopenllm_v2/BBH": 0.3374, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3631, + "hfopenllm_v2/MMLU-PRO": 0.3003 + } + }, + { + "id": "Norquinal/Bravo", + "name": "Bravo", + "developer": "Norquinal", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3025, + "hfopenllm_v2/BBH": 0.3558, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.3869, + "hfopenllm_v2/MMLU-PRO": 0.3127 + } + }, + { + "id": "Norquinal/Charlie", + "name": "Charlie", + "developer": "Norquinal", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3061, + "hfopenllm_v2/BBH": 0.3515, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3737, + "hfopenllm_v2/MMLU-PRO": 0.3093 + } + }, + { + "id": "Norquinal/Delta", + "name": "Delta", + "developer": "Norquinal", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2538, + "hfopenllm_v2/BBH": 0.3435, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3777, + "hfopenllm_v2/MMLU-PRO": 0.2959 + } + }, + { + "id": "Norquinal/Echo", + "name": "Echo", + "developer": "Norquinal", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3158, + "hfopenllm_v2/BBH": 0.353, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3804, + "hfopenllm_v2/MMLU-PRO": 0.3095 + } + }, + { + "id": "Norquinal/Foxtrot", + "name": "Foxtrot", + "developer": "Norquinal", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3012, + "hfopenllm_v2/BBH": 0.3558, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.3804, + "hfopenllm_v2/MMLU-PRO": 0.305 + } + }, + { + "id": "Norquinal/Golf", + "name": "Golf", + "developer": "Norquinal", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3534, + "hfopenllm_v2/BBH": 0.3533, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.338, + "hfopenllm_v2/MMLU-PRO": 0.3056 + } + }, + { + "id": "Norquinal/Hotel", + "name": "Hotel", + "developer": "Norquinal", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3215, + "hfopenllm_v2/BBH": 0.3679, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3288, + "hfopenllm_v2/MMLU-PRO": 0.3157 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/NotASI.json b/data/developers/NotASI.json new file mode 100644 index 0000000000000000000000000000000000000000..42c466260d274c745a84140a667b67e6cccdfc01 --- /dev/null +++ b/data/developers/NotASI.json @@ -0,0 +1,61 @@ +{ + "developer": "NotASI", + "models": [ + { + "id": "NotASI/FineTome-Llama3.2-1B-0929", + "name": "FineTome-Llama3.2-1B-0929", + "developer": "NotASI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3991, + "hfopenllm_v2/BBH": 0.3246, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3488, + "hfopenllm_v2/MMLU-PRO": 0.1429 + } + }, + { + "id": "NotASI/FineTome-Llama3.2-3B-1002", + "name": "FineTome-Llama3.2-3B-1002", + "developer": "NotASI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5474, + "hfopenllm_v2/BBH": 0.4319, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3685, + "hfopenllm_v2/MMLU-PRO": 0.2437 + } + }, + { + "id": "NotASI/FineTome-v1.5-Llama3.2-1B-1007", + "name": "FineTome-v1.5-Llama3.2-1B-1007", + "developer": "NotASI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3924, + "hfopenllm_v2/BBH": 0.3241, + "hfopenllm_v2/MATH Level 5": 0.0317, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3475, + "hfopenllm_v2/MMLU-PRO": 0.1427 + } + }, + { + "id": "NotASI/FineTome-v1.5-Llama3.2-3B-1007", + "name": "FineTome-v1.5-Llama3.2-3B-1007", + "developer": "NotASI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5508, + "hfopenllm_v2/BBH": 0.4312, + "hfopenllm_v2/MATH Level 5": 0.0642, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3645, + "hfopenllm_v2/MMLU-PRO": 0.2448 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/NousResearch.json b/data/developers/NousResearch.json new file mode 100644 index 0000000000000000000000000000000000000000..39234619c0fc2a9ab38092f6171e4cfd2fcdb763 --- /dev/null +++ b/data/developers/NousResearch.json @@ -0,0 +1,288 @@ +{ + "developer": "NousResearch", + "models": [ + { + "id": "NousResearch/DeepHermes-3-Mistral-24B-Preview", + "name": "DeepHermes-3-Mistral-24B-Preview", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4536, + "hfopenllm_v2/BBH": 0.6488, + "hfopenllm_v2/MATH Level 5": 0.2576, + "hfopenllm_v2/GPQA": 0.37, + "hfopenllm_v2/MUSR": 0.4503, + "hfopenllm_v2/MMLU-PRO": 0.459 + } + }, + { + "id": "NousResearch/Hermes-2-Pro-Llama-3-8B", + "name": "Hermes-2-Pro-Llama-3-8B", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5362, + "hfopenllm_v2/BBH": 0.5071, + "hfopenllm_v2/MATH Level 5": 0.0838, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4262, + "hfopenllm_v2/MMLU-PRO": 0.3052 + } + }, + { + "id": "NousResearch/Hermes-2-Pro-Mistral-7B", + "name": "Hermes-2-Pro-Mistral-7B", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5668, + "hfopenllm_v2/BBH": 0.4995, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.4376, + "hfopenllm_v2/MMLU-PRO": 0.2946 + } + }, + { + "id": "NousResearch/Hermes-2-Theta-Llama-3-8B", + "name": "Hermes-2-Theta-Llama-3-8B", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6518, + "hfopenllm_v2/BBH": 0.5207, + "hfopenllm_v2/MATH Level 5": 0.0967, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.3949, + "hfopenllm_v2/MMLU-PRO": 0.3369 + } + }, + { + "id": "NousResearch/Hermes-3-Llama-3.1-70B", + "name": "Hermes-3-Llama-3.1-70B", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7661, + "hfopenllm_v2/BBH": 0.6756, + "hfopenllm_v2/MATH Level 5": 0.21, + "hfopenllm_v2/GPQA": 0.3616, + "hfopenllm_v2/MUSR": 0.4949, + "hfopenllm_v2/MMLU-PRO": 0.4727, + "reward-bench/Score": 0.7847, + "reward-bench/Chat": 0.9623, + "reward-bench/Chat Hard": 0.5669, + "reward-bench/Safety": 0.823, + "reward-bench/Reasoning": 0.7867 + } + }, + { + "id": "NousResearch/Hermes-3-Llama-3.1-8B", + "name": "Hermes-3-Llama-3.1-8B", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.617, + "hfopenllm_v2/BBH": 0.5177, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4369, + "hfopenllm_v2/MMLU-PRO": 0.3139 + } + }, + { + "id": "NousResearch/Hermes-3-Llama-3.2-3B", + "name": "Hermes-3-Llama-3.2-3B", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3825, + "hfopenllm_v2/BBH": 0.4352, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.403, + "hfopenllm_v2/MMLU-PRO": 0.2544 + } + }, + { + "id": "NousResearch/Nous-Hermes-2-Mistral-7B-DPO", + "name": "Nous-Hermes-2-Mistral-7B-DPO", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5763, + "hfopenllm_v2/BBH": 0.4853, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4, + "hfopenllm_v2/MMLU-PRO": 0.3015, + "reward-bench/Score": 0.7481, + "reward-bench/Chat": 0.9218, + "reward-bench/Chat Hard": 0.6053, + "reward-bench/Safety": 0.8243, + "reward-bench/Reasoning": 0.7375, + "reward-bench/Prior Sets (0.5 weight)": 0.555 + } + }, + { + "id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO", + "name": "Nous-Hermes-2-Mixtral-8x7B-DPO", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5897, + "hfopenllm_v2/BBH": 0.5539, + "hfopenllm_v2/MATH Level 5": 0.1224, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4595, + "hfopenllm_v2/MMLU-PRO": 0.3666, + "reward-bench/Score": 0.7138, + "reward-bench/Chat": 0.9162, + "reward-bench/Chat Hard": 0.6053, + "reward-bench/Safety": 0.8149, + "reward-bench/Reasoning": 0.6126, + "reward-bench/Prior Sets (0.5 weight)": 0.5266 + } + }, + { + "id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT", + "name": "Nous-Hermes-2-Mixtral-8x7B-SFT", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5731, + "hfopenllm_v2/BBH": 0.5058, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4214, + "hfopenllm_v2/MMLU-PRO": 0.3066 + } + }, + { + "id": "NousResearch/Nous-Hermes-2-SOLAR-10.7B", + "name": "Nous-Hermes-2-SOLAR-10.7B", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5279, + "hfopenllm_v2/BBH": 0.5414, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4373, + "hfopenllm_v2/MMLU-PRO": 0.3458 + } + }, + { + "id": "NousResearch/Nous-Hermes-llama-2-7b", + "name": "Nous-Hermes-llama-2-7b", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1729, + "hfopenllm_v2/BBH": 0.3824, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.4257, + "hfopenllm_v2/MMLU-PRO": 0.194 + } + }, + { + "id": "NousResearch/Yarn-Llama-2-13b-128k", + "name": "Yarn-Llama-2-13b-128k", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1655, + "hfopenllm_v2/BBH": 0.3827, + "hfopenllm_v2/MATH Level 5": 0.0174, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3458, + "hfopenllm_v2/MMLU-PRO": 0.232 + } + }, + { + "id": "NousResearch/Yarn-Llama-2-7b-128k", + "name": "Yarn-Llama-2-7b-128k", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1485, + "hfopenllm_v2/BBH": 0.3248, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3967, + "hfopenllm_v2/MMLU-PRO": 0.1791 + } + }, + { + "id": "NousResearch/Yarn-Llama-2-7b-64k", + "name": "Yarn-Llama-2-7b-64k", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.17, + "hfopenllm_v2/BBH": 0.3326, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3939, + "hfopenllm_v2/MMLU-PRO": 0.1799 + } + }, + { + "id": "NousResearch/Yarn-Mistral-7b-128k", + "name": "Yarn-Mistral-7b-128k", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1934, + "hfopenllm_v2/BBH": 0.4314, + "hfopenllm_v2/MATH Level 5": 0.0317, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4071, + "hfopenllm_v2/MMLU-PRO": 0.2893 + } + }, + { + "id": "NousResearch/Yarn-Mistral-7b-64k", + "name": "Yarn-Mistral-7b-64k", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.208, + "hfopenllm_v2/BBH": 0.4293, + "hfopenllm_v2/MATH Level 5": 0.037, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4124, + "hfopenllm_v2/MMLU-PRO": 0.2914 + } + }, + { + "id": "NousResearch/Yarn-Solar-10b-32k", + "name": "Yarn-Solar-10b-32k", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1942, + "hfopenllm_v2/BBH": 0.4987, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4146, + "hfopenllm_v2/MMLU-PRO": 0.3272 + } + }, + { + "id": "NousResearch/Yarn-Solar-10b-64k", + "name": "Yarn-Solar-10b-64k", + "developer": "NousResearch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1989, + "hfopenllm_v2/BBH": 0.4922, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4014, + "hfopenllm_v2/MMLU-PRO": 0.3148 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Novaciano.json b/data/developers/Novaciano.json new file mode 100644 index 0000000000000000000000000000000000000000..df9c040c307dc2d5a01ff24b2f103902fcc322c0 --- /dev/null +++ b/data/developers/Novaciano.json @@ -0,0 +1,159 @@ +{ + "developer": "Novaciano", + "models": [ + { + "id": "Novaciano/ASTAROTH-3.2-1B", + "name": "ASTAROTH-3.2-1B", + "developer": "Novaciano", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5613, + "hfopenllm_v2/BBH": 0.3543, + "hfopenllm_v2/MATH Level 5": 0.0733, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3142, + "hfopenllm_v2/MMLU-PRO": 0.1909 + } + }, + { + "id": "Novaciano/BLAST_PROCESSING-3.2-1B", + "name": "BLAST_PROCESSING-3.2-1B", + "developer": "Novaciano", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3922, + "hfopenllm_v2/BBH": 0.346, + "hfopenllm_v2/MATH Level 5": 0.0748, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3351, + "hfopenllm_v2/MMLU-PRO": 0.1941 + } + }, + { + "id": "Novaciano/Cerberus-3.2-1B", + "name": "Cerberus-3.2-1B", + "developer": "Novaciano", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5017, + "hfopenllm_v2/BBH": 0.4165, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3289, + "hfopenllm_v2/MMLU-PRO": 0.1663 + } + }, + { + "id": "Novaciano/Cultist-3.2-1B", + "name": "Cultist-3.2-1B", + "developer": "Novaciano", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5295, + "hfopenllm_v2/BBH": 0.3399, + "hfopenllm_v2/MATH Level 5": 0.0589, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.333, + "hfopenllm_v2/MMLU-PRO": 0.1714 + } + }, + { + "id": "Novaciano/FuseChat-3.2-1B-GRPO_Creative_RP", + "name": "FuseChat-3.2-1B-GRPO_Creative_RP", + "developer": "Novaciano", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5598, + "hfopenllm_v2/BBH": 0.3488, + "hfopenllm_v2/MATH Level 5": 0.0801, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3329, + "hfopenllm_v2/MMLU-PRO": 0.1735 + } + }, + { + "id": "Novaciano/Fusetrix-3.2-1B-GRPO_RP_Creative", + "name": "Fusetrix-3.2-1B-GRPO_RP_Creative", + "developer": "Novaciano", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5366, + "hfopenllm_v2/BBH": 0.3435, + "hfopenllm_v2/MATH Level 5": 0.1148, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3209, + "hfopenllm_v2/MMLU-PRO": 0.1758 + } + }, + { + "id": "Novaciano/Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP", + "name": "Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP", + "developer": "Novaciano", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5343, + "hfopenllm_v2/BBH": 0.3502, + "hfopenllm_v2/MATH Level 5": 0.105, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3183, + "hfopenllm_v2/MMLU-PRO": 0.1823 + } + }, + { + "id": "Novaciano/HarmfulProject-3.2-1B", + "name": "HarmfulProject-3.2-1B", + "developer": "Novaciano", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3874, + "hfopenllm_v2/BBH": 0.3274, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3419, + "hfopenllm_v2/MMLU-PRO": 0.1823 + } + }, + { + "id": "Novaciano/LEWD-Mental-Cultist-3.2-1B", + "name": "LEWD-Mental-Cultist-3.2-1B", + "developer": "Novaciano", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5309, + "hfopenllm_v2/BBH": 0.3513, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3223, + "hfopenllm_v2/MMLU-PRO": 0.1769 + } + }, + { + "id": "Novaciano/La_Mejor_Mezcla-3.2-1B", + "name": "La_Mejor_Mezcla-3.2-1B", + "developer": "Novaciano", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.551, + "hfopenllm_v2/BBH": 0.3488, + "hfopenllm_v2/MATH Level 5": 0.0899, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3196, + "hfopenllm_v2/MMLU-PRO": 0.1829 + } + }, + { + "id": "Novaciano/Sigil-Of-Satan-3.2-1B", + "name": "Sigil-Of-Satan-3.2-1B", + "developer": "Novaciano", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5494, + "hfopenllm_v2/BBH": 0.3546, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3276, + "hfopenllm_v2/MMLU-PRO": 0.1855 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/NucleusAI.json b/data/developers/NucleusAI.json new file mode 100644 index 0000000000000000000000000000000000000000..22a66fda36652e355f4ef17c5f1441f472e63eb9 --- /dev/null +++ b/data/developers/NucleusAI.json @@ -0,0 +1,19 @@ +{ + "developer": "NucleusAI", + "models": [ + { + "id": "NucleusAI/nucleus-22B-token-500B", + "name": "nucleus-22B-token-500B", + "developer": "NucleusAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0257, + "hfopenllm_v2/BBH": 0.292, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3511, + "hfopenllm_v2/MMLU-PRO": 0.1162 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/NyxKrage.json b/data/developers/NyxKrage.json new file mode 100644 index 0000000000000000000000000000000000000000..465a044661f12dfb86b4e326cff543e179ac289b --- /dev/null +++ b/data/developers/NyxKrage.json @@ -0,0 +1,19 @@ +{ + "developer": "NyxKrage", + "models": [ + { + "id": "NyxKrage/Microsoft_Phi-4", + "name": "Microsoft_Phi-4", + "developer": "NyxKrage", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0585, + "hfopenllm_v2/BBH": 0.6691, + "hfopenllm_v2/MATH Level 5": 0.2991, + "hfopenllm_v2/GPQA": 0.406, + "hfopenllm_v2/MUSR": 0.5034, + "hfopenllm_v2/MMLU-PRO": 0.5287 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/OEvortex.json b/data/developers/OEvortex.json new file mode 100644 index 0000000000000000000000000000000000000000..20e72c884066d591a9eeec3ef8d2e75f250ce485 --- /dev/null +++ b/data/developers/OEvortex.json @@ -0,0 +1,75 @@ +{ + "developer": "OEvortex", + "models": [ + { + "id": "OEvortex/Emotional-llama-8B", + "name": "Emotional-llama-8B", + "developer": "OEvortex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3516, + "hfopenllm_v2/BBH": 0.4839, + "hfopenllm_v2/MATH Level 5": 0.0816, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3659, + "hfopenllm_v2/MMLU-PRO": 0.3535 + } + }, + { + "id": "OEvortex/HelpingAI-15B", + "name": "HelpingAI-15B", + "developer": "OEvortex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.203, + "hfopenllm_v2/BBH": 0.2936, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3619, + "hfopenllm_v2/MMLU-PRO": 0.1111 + } + }, + { + "id": "OEvortex/HelpingAI-3B-reloaded", + "name": "HelpingAI-3B-reloaded", + "developer": "OEvortex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4647, + "hfopenllm_v2/BBH": 0.4129, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3524, + "hfopenllm_v2/MMLU-PRO": 0.2595 + } + }, + { + "id": "OEvortex/HelpingAI2-9B", + "name": "HelpingAI2-9B", + "developer": "OEvortex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4413, + "hfopenllm_v2/BBH": 0.4845, + "hfopenllm_v2/MATH Level 5": 0.0589, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3711, + "hfopenllm_v2/MMLU-PRO": 0.29 + } + }, + { + "id": "OEvortex/HelpingAI2.5-10B", + "name": "HelpingAI2.5-10B", + "developer": "OEvortex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3277, + "hfopenllm_v2/BBH": 0.4496, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3738, + "hfopenllm_v2/MMLU-PRO": 0.2575 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/OliveiraJLT.json b/data/developers/OliveiraJLT.json new file mode 100644 index 0000000000000000000000000000000000000000..96425d9532f2e884859ea2f3d4edc2512f13356f --- /dev/null +++ b/data/developers/OliveiraJLT.json @@ -0,0 +1,19 @@ +{ + "developer": "OliveiraJLT", + "models": [ + { + "id": "OliveiraJLT/Sagui-7B-Instruct-v0.1", + "name": "Sagui-7B-Instruct-v0.1", + "developer": "OliveiraJLT", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2892, + "hfopenllm_v2/BBH": 0.3111, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2424, + "hfopenllm_v2/MUSR": 0.4191, + "hfopenllm_v2/MMLU-PRO": 0.1485 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Omkar1102.json b/data/developers/Omkar1102.json new file mode 100644 index 0000000000000000000000000000000000000000..1d044781189744d770af993cfdb651c1e02eee6f --- /dev/null +++ b/data/developers/Omkar1102.json @@ -0,0 +1,19 @@ +{ + "developer": "Omkar1102", + "models": [ + { + "id": "Omkar1102/code-yi", + "name": "code-yi", + "developer": "Omkar1102", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2254, + "hfopenllm_v2/BBH": 0.275, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3762, + "hfopenllm_v2/MMLU-PRO": 0.1123 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/OmnicromsBrain.json b/data/developers/OmnicromsBrain.json new file mode 100644 index 0000000000000000000000000000000000000000..5e8901580b7e0479fc5b0e2b31b10bf904608a89 --- /dev/null +++ b/data/developers/OmnicromsBrain.json @@ -0,0 +1,19 @@ +{ + "developer": "OmnicromsBrain", + "models": [ + { + "id": "OmnicromsBrain/NeuralStar_FusionWriter_4x7b", + "name": "NeuralStar_FusionWriter_4x7b", + "developer": "OmnicromsBrain", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5964, + "hfopenllm_v2/BBH": 0.4776, + "hfopenllm_v2/MATH Level 5": 0.0491, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.4019, + "hfopenllm_v2/MMLU-PRO": 0.2606 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/OnlyCheeini.json b/data/developers/OnlyCheeini.json new file mode 100644 index 0000000000000000000000000000000000000000..fb95652078e138cf9868b98c03d0f01121e777bc --- /dev/null +++ b/data/developers/OnlyCheeini.json @@ -0,0 +1,19 @@ +{ + "developer": "OnlyCheeini", + "models": [ + { + "id": "OnlyCheeini/greesychat-turbo", + "name": "greesychat-turbo", + "developer": "OnlyCheeini", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0233, + "hfopenllm_v2/BBH": 0.3092, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3314, + "hfopenllm_v2/MMLU-PRO": 0.1138 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Open-Orca.json b/data/developers/Open-Orca.json new file mode 100644 index 0000000000000000000000000000000000000000..8f976ab95ea911176fa76bd1983b2b9e37e8dd39 --- /dev/null +++ b/data/developers/Open-Orca.json @@ -0,0 +1,19 @@ +{ + "developer": "Open-Orca", + "models": [ + { + "id": "Open-Orca/Mistral-7B-OpenOrca", + "name": "Mistral-7B-OpenOrca", + "developer": "Open-Orca", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4978, + "hfopenllm_v2/BBH": 0.4768, + "hfopenllm_v2/MATH Level 5": 0.0355, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3858, + "hfopenllm_v2/MMLU-PRO": 0.2653 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/OpenAI.json b/data/developers/OpenAI.json new file mode 100644 index 0000000000000000000000000000000000000000..4f5fe50e33b6537a7fa64519c50297e6a6d53b51 --- /dev/null +++ b/data/developers/OpenAI.json @@ -0,0 +1,132 @@ +{ + "developer": "OpenAI", + "models": [ + { + "id": "openai/gpt-4.1", + "name": "openai/gpt-4.1", + "developer": "OpenAI", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.0, + "livecodebenchpro/Easy Problems": 0.19718309859154928 + } + }, + { + "id": "openai/gpt-5", + "name": "GPT-5", + "developer": "OpenAI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 35.2 + } + }, + { + "id": "openai/gpt-5-codex", + "name": "GPT-5-Codex", + "developer": "OpenAI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 41.3 + } + }, + { + "id": "openai/gpt-5-mini", + "name": "GPT-5-Mini", + "developer": "OpenAI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 31.9 + } + }, + { + "id": "openai/gpt-5-nano", + "name": "GPT-5-Nano", + "developer": "OpenAI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 7.0 + } + }, + { + "id": "openai/gpt-5.1", + "name": "GPT-5.1", + "developer": "OpenAI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 47.6 + } + }, + { + "id": "openai/gpt-5.1-codex", + "name": "GPT-5.1-Codex", + "developer": "OpenAI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 36.9 + } + }, + { + "id": "openai/gpt-5.1-codex-max", + "name": "GPT-5.1-Codex-Max", + "developer": "OpenAI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 60.4 + } + }, + { + "id": "openai/gpt-5.1-codex-mini", + "name": "GPT-5.1-Codex-Mini", + "developer": "OpenAI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 43.1 + } + }, + { + "id": "openai/gpt-5.2", + "name": "GPT-5.2", + "developer": "OpenAI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 60.7 + } + }, + { + "id": "openai/gpt-5.2-2025-12-11", + "name": "gpt-5.2-2025-12-11", + "developer": "OpenAI", + "evaluator_relationship": null, + "benchmark_scores": { + "appworld_test_normal/appworld/test_normal": 0.071, + "browsecompplus/browsecompplus": 0.46, + "livecodebenchpro/Hard Problems": 0.1594, + "livecodebenchpro/Medium Problems": 0.5211, + "livecodebenchpro/Easy Problems": 0.9014, + "swe-bench/swe-bench": 0.5253, + "tau-bench-2_airline/tau-bench-2/airline": 0.54, + "tau-bench-2_retail/tau-bench-2/retail": 0.73, + "tau-bench-2_telecom/tau-bench-2/telecom": 0.5354 + } + }, + { + "id": "openai/gpt-5.2-codex", + "name": "GPT-5.2-Codex", + "developer": "OpenAI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 66.5 + } + }, + { + "id": "openai/gpt-5.3-codex", + "name": "GPT-5.3-Codex", + "developer": "OpenAI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 77.3 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/OpenAssistant.json b/data/developers/OpenAssistant.json new file mode 100644 index 0000000000000000000000000000000000000000..01c75aa86e6780446770cc9407ec4e8d8baaeaf2 --- /dev/null +++ b/data/developers/OpenAssistant.json @@ -0,0 +1,76 @@ +{ + "developer": "OpenAssistant", + "models": [ + { + "id": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1", + "name": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1", + "developer": "OpenAssistant", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.615, + "reward-bench/Factuality": 0.3979, + "reward-bench/Precise IF": 0.2875, + "reward-bench/Math": 0.377, + "reward-bench/Safety": 0.5446, + "reward-bench/Focus": 0.1535, + "reward-bench/Ties": 0.047, + "reward-bench/Chat": 0.9246, + "reward-bench/Chat Hard": 0.3728, + "reward-bench/Reasoning": 0.5855, + "reward-bench/Prior Sets (0.5 weight)": 0.6801 + } + }, + { + "id": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5", + "name": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5", + "developer": "OpenAssistant", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.2648, + "reward-bench/Chat": 0.8855, + "reward-bench/Chat Hard": 0.4868, + "reward-bench/Safety": 0.3244, + "reward-bench/Reasoning": 0.7752, + "reward-bench/Prior Sets (0.5 weight)": 0.6533, + "reward-bench/Factuality": 0.3179, + "reward-bench/Precise IF": 0.2625, + "reward-bench/Math": 0.3934, + "reward-bench/Focus": 0.2707, + "reward-bench/Ties": 0.0198 + } + }, + { + "id": "OpenAssistant/oasst-sft-1-pythia-12b", + "name": "oasst-sft-1-pythia-12b", + "developer": "OpenAssistant", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1055, + "hfopenllm_v2/BBH": 0.3147, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3327, + "hfopenllm_v2/MMLU-PRO": 0.1113 + } + }, + { + "id": "OpenAssistant/reward-model-deberta-v3-large-v2", + "name": "OpenAssistant/reward-model-deberta-v3-large-v2", + "developer": "OpenAssistant", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.32, + "reward-bench/Chat": 0.8939, + "reward-bench/Chat Hard": 0.4518, + "reward-bench/Safety": 0.3667, + "reward-bench/Reasoning": 0.3855, + "reward-bench/Prior Sets (0.5 weight)": 0.5836, + "reward-bench/Factuality": 0.3853, + "reward-bench/Precise IF": 0.2687, + "reward-bench/Math": 0.5027, + "reward-bench/Focus": 0.2768, + "reward-bench/Ties": 0.12 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/OpenBuddy.json b/data/developers/OpenBuddy.json new file mode 100644 index 0000000000000000000000000000000000000000..a98e6fe11e595e2b8e16769938fa62415b797e60 --- /dev/null +++ b/data/developers/OpenBuddy.json @@ -0,0 +1,313 @@ +{ + "developer": "OpenBuddy", + "models": [ + { + "id": "OpenBuddy/openbuddy-falcon3-10b-v24.2-131k", + "name": "openbuddy-falcon3-10b-v24.2-131k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5086, + "hfopenllm_v2/BBH": 0.6004, + "hfopenllm_v2/MATH Level 5": 0.213, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.3834 + } + }, + { + "id": "OpenBuddy/openbuddy-llama3-70b-v21.2-32k", + "name": "openbuddy-llama3-70b-v21.2-32k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.701, + "hfopenllm_v2/BBH": 0.6507, + "hfopenllm_v2/MATH Level 5": 0.2032, + "hfopenllm_v2/GPQA": 0.3423, + "hfopenllm_v2/MUSR": 0.458, + "hfopenllm_v2/MMLU-PRO": 0.4832 + } + }, + { + "id": "OpenBuddy/openbuddy-llama3-8b-v21.1-8k", + "name": "openbuddy-llama3-8b-v21.1-8k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.557, + "hfopenllm_v2/BBH": 0.4788, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3988, + "hfopenllm_v2/MMLU-PRO": 0.2955 + } + }, + { + "id": "OpenBuddy/openbuddy-llama3-8b-v21.2-32k", + "name": "openbuddy-llama3-8b-v21.2-32k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6192, + "hfopenllm_v2/BBH": 0.4856, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3779, + "hfopenllm_v2/MMLU-PRO": 0.3299 + } + }, + { + "id": "OpenBuddy/openbuddy-llama3.1-70b-v22.1-131k", + "name": "openbuddy-llama3.1-70b-v22.1-131k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7333, + "hfopenllm_v2/BBH": 0.6698, + "hfopenllm_v2/MATH Level 5": 0.395, + "hfopenllm_v2/GPQA": 0.375, + "hfopenllm_v2/MUSR": 0.463, + "hfopenllm_v2/MMLU-PRO": 0.5304 + } + }, + { + "id": "OpenBuddy/openbuddy-llama3.1-8b-v22.2-131k", + "name": "openbuddy-llama3.1-8b-v22.2-131k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6657, + "hfopenllm_v2/BBH": 0.5007, + "hfopenllm_v2/MATH Level 5": 0.1148, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.4081, + "hfopenllm_v2/MMLU-PRO": 0.331 + } + }, + { + "id": "OpenBuddy/openbuddy-llama3.1-8b-v22.3-131k", + "name": "openbuddy-llama3.1-8b-v22.3-131k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5997, + "hfopenllm_v2/BBH": 0.5066, + "hfopenllm_v2/MATH Level 5": 0.1208, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.4015, + "hfopenllm_v2/MMLU-PRO": 0.3277 + } + }, + { + "id": "OpenBuddy/openbuddy-llama3.2-1b-v23.1-131k", + "name": "openbuddy-llama3.2-1b-v23.1-131k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.359, + "hfopenllm_v2/BBH": 0.3267, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3342, + "hfopenllm_v2/MMLU-PRO": 0.184 + } + }, + { + "id": "OpenBuddy/openbuddy-llama3.2-3b-v23.2-131k", + "name": "openbuddy-llama3.2-3b-v23.2-131k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4319, + "hfopenllm_v2/BBH": 0.4073, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3263, + "hfopenllm_v2/MMLU-PRO": 0.2479 + } + }, + { + "id": "OpenBuddy/openbuddy-llama3.3-70b-v24.1-131k", + "name": "openbuddy-llama3.3-70b-v24.1-131k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8121, + "hfopenllm_v2/BBH": 0.6858, + "hfopenllm_v2/MATH Level 5": 0.4411, + "hfopenllm_v2/GPQA": 0.4346, + "hfopenllm_v2/MUSR": 0.4869, + "hfopenllm_v2/MMLU-PRO": 0.5327 + } + }, + { + "id": "OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k", + "name": "openbuddy-mixtral-7bx8-v18.1-32k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5493, + "hfopenllm_v2/BBH": 0.4656, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.3831, + "hfopenllm_v2/MMLU-PRO": 0.3804 + } + }, + { + "id": "OpenBuddy/openbuddy-nemotron-70b-v23.1-131k", + "name": "openbuddy-nemotron-70b-v23.1-131k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7555, + "hfopenllm_v2/BBH": 0.6749, + "hfopenllm_v2/MATH Level 5": 0.321, + "hfopenllm_v2/GPQA": 0.3633, + "hfopenllm_v2/MUSR": 0.4538, + "hfopenllm_v2/MMLU-PRO": 0.5175 + } + }, + { + "id": "OpenBuddy/openbuddy-nemotron-70b-v23.2-131k", + "name": "openbuddy-nemotron-70b-v23.2-131k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7227, + "hfopenllm_v2/BBH": 0.6705, + "hfopenllm_v2/MATH Level 5": 0.3157, + "hfopenllm_v2/GPQA": 0.3599, + "hfopenllm_v2/MUSR": 0.4696, + "hfopenllm_v2/MMLU-PRO": 0.5121 + } + }, + { + "id": "OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.1-200k", + "name": "openbuddy-qwen2.5llamaify-14b-v23.1-200k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6309, + "hfopenllm_v2/BBH": 0.6013, + "hfopenllm_v2/MATH Level 5": 0.2538, + "hfopenllm_v2/GPQA": 0.3331, + "hfopenllm_v2/MUSR": 0.424, + "hfopenllm_v2/MMLU-PRO": 0.4673 + } + }, + { + "id": "OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.3-200k", + "name": "openbuddy-qwen2.5llamaify-14b-v23.3-200k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6131, + "hfopenllm_v2/BBH": 0.6081, + "hfopenllm_v2/MATH Level 5": 0.2311, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.4346, + "hfopenllm_v2/MMLU-PRO": 0.4795 + } + }, + { + "id": "OpenBuddy/openbuddy-qwen2.5llamaify-7b-v23.1-200k", + "name": "openbuddy-qwen2.5llamaify-7b-v23.1-200k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5673, + "hfopenllm_v2/BBH": 0.5509, + "hfopenllm_v2/MATH Level 5": 0.1888, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4363, + "hfopenllm_v2/MMLU-PRO": 0.3948 + } + }, + { + "id": "OpenBuddy/openbuddy-qwq-32b-v24.1-200k", + "name": "openbuddy-qwq-32b-v24.1-200k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5937, + "hfopenllm_v2/BBH": 0.6798, + "hfopenllm_v2/MATH Level 5": 0.3739, + "hfopenllm_v2/GPQA": 0.3809, + "hfopenllm_v2/MUSR": 0.4849, + "hfopenllm_v2/MMLU-PRO": 0.549 + } + }, + { + "id": "OpenBuddy/openbuddy-qwq-32b-v24.2-200k", + "name": "openbuddy-qwq-32b-v24.2-200k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.597, + "hfopenllm_v2/BBH": 0.6772, + "hfopenllm_v2/MATH Level 5": 0.3776, + "hfopenllm_v2/GPQA": 0.3767, + "hfopenllm_v2/MUSR": 0.4718, + "hfopenllm_v2/MMLU-PRO": 0.5446 + } + }, + { + "id": "OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k", + "name": "openbuddy-yi1.5-34b-v21.3-32k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.542, + "hfopenllm_v2/BBH": 0.6163, + "hfopenllm_v2/MATH Level 5": 0.1782, + "hfopenllm_v2/GPQA": 0.349, + "hfopenllm_v2/MUSR": 0.4439, + "hfopenllm_v2/MMLU-PRO": 0.4599 + } + }, + { + "id": "OpenBuddy/openbuddy-zero-14b-v22.3-32k", + "name": "openbuddy-zero-14b-v22.3-32k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3753, + "hfopenllm_v2/BBH": 0.486, + "hfopenllm_v2/MATH Level 5": 0.0937, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4166, + "hfopenllm_v2/MMLU-PRO": 0.3187 + } + }, + { + "id": "OpenBuddy/openbuddy-zero-3b-v21.2-32k", + "name": "openbuddy-zero-3b-v21.2-32k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3802, + "hfopenllm_v2/BBH": 0.3935, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3566, + "hfopenllm_v2/MMLU-PRO": 0.2034 + } + }, + { + "id": "OpenBuddy/openbuddy-zero-56b-v21.2-32k", + "name": "openbuddy-zero-56b-v21.2-32k", + "developer": "OpenBuddy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5057, + "hfopenllm_v2/BBH": 0.6128, + "hfopenllm_v2/MATH Level 5": 0.1624, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.4305, + "hfopenllm_v2/MMLU-PRO": 0.4399 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/OpenGenerativeAI.json b/data/developers/OpenGenerativeAI.json new file mode 100644 index 0000000000000000000000000000000000000000..e4348d42b5ebbf4a91a2f194731b6091cf2362c6 --- /dev/null +++ b/data/developers/OpenGenerativeAI.json @@ -0,0 +1,33 @@ +{ + "developer": "OpenGenerativeAI", + "models": [ + { + "id": "OpenGenerativeAI/Bifrost", + "name": "Bifrost", + "developer": "OpenGenerativeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6348, + "hfopenllm_v2/BBH": 0.6849, + "hfopenllm_v2/MATH Level 5": 0.2545, + "hfopenllm_v2/GPQA": 0.3683, + "hfopenllm_v2/MUSR": 0.4598, + "hfopenllm_v2/MMLU-PRO": 0.516 + } + }, + { + "id": "OpenGenerativeAI/Bifrost-14B", + "name": "Bifrost-14B", + "developer": "OpenGenerativeAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6615, + "hfopenllm_v2/BBH": 0.6845, + "hfopenllm_v2/MATH Level 5": 0.2356, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.4624, + "hfopenllm_v2/MMLU-PRO": 0.5074 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/OpenLLM-France.json b/data/developers/OpenLLM-France.json new file mode 100644 index 0000000000000000000000000000000000000000..6ebba764bd5c8af0db5e7c6efeec343fba0ad06e --- /dev/null +++ b/data/developers/OpenLLM-France.json @@ -0,0 +1,61 @@ +{ + "developer": "OpenLLM-France", + "models": [ + { + "id": "OpenLLM-France/Lucie-7B", + "name": "Lucie-7B", + "developer": "OpenLLM-France", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2496, + "hfopenllm_v2/BBH": 0.3492, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3923, + "hfopenllm_v2/MMLU-PRO": 0.1498 + } + }, + { + "id": "OpenLLM-France/Lucie-7B-Instruct", + "name": "Lucie-7B-Instruct", + "developer": "OpenLLM-France", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2796, + "hfopenllm_v2/BBH": 0.3254, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3662, + "hfopenllm_v2/MMLU-PRO": 0.1556 + } + }, + { + "id": "OpenLLM-France/Lucie-7B-Instruct-human-data", + "name": "Lucie-7B-Instruct-human-data", + "developer": "OpenLLM-France", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2946, + "hfopenllm_v2/BBH": 0.3284, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3729, + "hfopenllm_v2/MMLU-PRO": 0.143 + } + }, + { + "id": "OpenLLM-France/Lucie-7B-Instruct-v1.1", + "name": "Lucie-7B-Instruct-v1.1", + "developer": "OpenLLM-France", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3039, + "hfopenllm_v2/BBH": 0.3816, + "hfopenllm_v2/MATH Level 5": 0.0317, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.375, + "hfopenllm_v2/MMLU-PRO": 0.1864 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/OpenLeecher.json b/data/developers/OpenLeecher.json new file mode 100644 index 0000000000000000000000000000000000000000..049d103cc2aafb7dad98cc0f49c1a8d98d36c1a7 --- /dev/null +++ b/data/developers/OpenLeecher.json @@ -0,0 +1,19 @@ +{ + "developer": "OpenLeecher", + "models": [ + { + "id": "OpenLeecher/llama3-8b-lima", + "name": "llama3-8b-lima", + "developer": "OpenLeecher", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4371, + "hfopenllm_v2/BBH": 0.4296, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2383, + "hfopenllm_v2/MUSR": 0.3713, + "hfopenllm_v2/MMLU-PRO": 0.2626 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/OpenScholar.json b/data/developers/OpenScholar.json new file mode 100644 index 0000000000000000000000000000000000000000..72e39b0c055a9a51c7cb863cbd21bb77d6332d8f --- /dev/null +++ b/data/developers/OpenScholar.json @@ -0,0 +1,19 @@ +{ + "developer": "OpenScholar", + "models": [ + { + "id": "OpenScholar/Llama-3.1_OpenScholar-8B", + "name": "Llama-3.1_OpenScholar-8B", + "developer": "OpenScholar", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6064, + "hfopenllm_v2/BBH": 0.5208, + "hfopenllm_v2/MATH Level 5": 0.1654, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4275, + "hfopenllm_v2/MMLU-PRO": 0.3708 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Orenguteng.json b/data/developers/Orenguteng.json new file mode 100644 index 0000000000000000000000000000000000000000..9a10ade8d5678f6c0ab0a9dd6f8f24025ba5bec1 --- /dev/null +++ b/data/developers/Orenguteng.json @@ -0,0 +1,33 @@ +{ + "developer": "Orenguteng", + "models": [ + { + "id": "Orenguteng/Llama-3.1-8B-Lexi-Uncensored", + "name": "Llama-3.1-8B-Lexi-Uncensored", + "developer": "Orenguteng", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7777, + "hfopenllm_v2/BBH": 0.5057, + "hfopenllm_v2/MATH Level 5": 0.1571, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3871, + "hfopenllm_v2/MMLU-PRO": 0.379 + } + }, + { + "id": "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2", + "name": "Llama-3.1-8B-Lexi-Uncensored-V2", + "developer": "Orenguteng", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7792, + "hfopenllm_v2/BBH": 0.5084, + "hfopenllm_v2/MATH Level 5": 0.1971, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3843, + "hfopenllm_v2/MMLU-PRO": 0.3781 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Orion-zhen.json b/data/developers/Orion-zhen.json new file mode 100644 index 0000000000000000000000000000000000000000..aabb46207c5258c83b3620297614572e6cfaa67a --- /dev/null +++ b/data/developers/Orion-zhen.json @@ -0,0 +1,33 @@ +{ + "developer": "Orion-zhen", + "models": [ + { + "id": "Orion-zhen/Qwen2.5-7B-Instruct-Uncensored", + "name": "Qwen2.5-7B-Instruct-Uncensored", + "developer": "Orion-zhen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7204, + "hfopenllm_v2/BBH": 0.5474, + "hfopenllm_v2/MATH Level 5": 0.4773, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4361, + "hfopenllm_v2/MMLU-PRO": 0.4427 + } + }, + { + "id": "Orion-zhen/phi-4-abliterated", + "name": "phi-4-abliterated", + "developer": "Orion-zhen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0576, + "hfopenllm_v2/BBH": 0.6698, + "hfopenllm_v2/MATH Level 5": 0.3021, + "hfopenllm_v2/GPQA": 0.4044, + "hfopenllm_v2/MUSR": 0.5006, + "hfopenllm_v2/MMLU-PRO": 0.5292 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/P0x0.json b/data/developers/P0x0.json new file mode 100644 index 0000000000000000000000000000000000000000..9eb75a9f1302a8c7d9b6f9eebe590f0b56dd2857 --- /dev/null +++ b/data/developers/P0x0.json @@ -0,0 +1,19 @@ +{ + "developer": "P0x0", + "models": [ + { + "id": "P0x0/Astra-v1-12B", + "name": "Astra-v1-12B", + "developer": "P0x0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2806, + "hfopenllm_v2/BBH": 0.5215, + "hfopenllm_v2/MATH Level 5": 0.1133, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4052, + "hfopenllm_v2/MMLU-PRO": 0.3461 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/PJMixers-Dev.json b/data/developers/PJMixers-Dev.json new file mode 100644 index 0000000000000000000000000000000000000000..0b7d3b30cc96b19fa424505465769a9547729ed5 --- /dev/null +++ b/data/developers/PJMixers-Dev.json @@ -0,0 +1,131 @@ +{ + "developer": "PJMixers-Dev", + "models": [ + { + "id": "PJMixers-Dev/L3.2-Instruct-Thinking-v0.1-1B", + "name": "L3.2-Instruct-Thinking-v0.1-1B", + "developer": "PJMixers-Dev", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4628, + "hfopenllm_v2/BBH": 0.3302, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1483 + } + }, + { + "id": "PJMixers-Dev/LLaMa-3.1-Instruct-Interleaved-Zeroed-13B", + "name": "LLaMa-3.1-Instruct-Interleaved-Zeroed-13B", + "developer": "PJMixers-Dev", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7871, + "hfopenllm_v2/BBH": 0.5073, + "hfopenllm_v2/MATH Level 5": 0.2002, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.387, + "hfopenllm_v2/MMLU-PRO": 0.3767 + } + }, + { + "id": "PJMixers-Dev/LLaMa-3.1-RomboTiesTest-8B", + "name": "LLaMa-3.1-RomboTiesTest-8B", + "developer": "PJMixers-Dev", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7825, + "hfopenllm_v2/BBH": 0.5073, + "hfopenllm_v2/MATH Level 5": 0.2002, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.387, + "hfopenllm_v2/MMLU-PRO": 0.3767 + } + }, + { + "id": "PJMixers-Dev/LLaMa-3.1-RomboTiesTest2-8B", + "name": "LLaMa-3.1-RomboTiesTest2-8B", + "developer": "PJMixers-Dev", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7825, + "hfopenllm_v2/BBH": 0.5073, + "hfopenllm_v2/MATH Level 5": 0.2002, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.387, + "hfopenllm_v2/MMLU-PRO": 0.3767 + } + }, + { + "id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B", + "name": "LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B", + "developer": "PJMixers-Dev", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6931, + "hfopenllm_v2/BBH": 0.4556, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.37, + "hfopenllm_v2/MMLU-PRO": 0.3127 + } + }, + { + "id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B", + "name": "LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B", + "developer": "PJMixers-Dev", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6292, + "hfopenllm_v2/BBH": 0.4581, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3659, + "hfopenllm_v2/MMLU-PRO": 0.3115 + } + }, + { + "id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B", + "name": "LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B", + "developer": "PJMixers-Dev", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6504, + "hfopenllm_v2/BBH": 0.4511, + "hfopenllm_v2/MATH Level 5": 0.1261, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3687, + "hfopenllm_v2/MMLU-PRO": 0.3108 + } + }, + { + "id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMixBread-v0.1-3B", + "name": "LLaMa-3.2-Instruct-JankMixBread-v0.1-3B", + "developer": "PJMixers-Dev", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5041, + "hfopenllm_v2/BBH": 0.4483, + "hfopenllm_v2/MATH Level 5": 0.1307, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3516, + "hfopenllm_v2/MMLU-PRO": 0.3083 + } + }, + { + "id": "PJMixers-Dev/Qwen2.5-RomboTiesTest-7B", + "name": "Qwen2.5-RomboTiesTest-7B", + "developer": "PJMixers-Dev", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7558, + "hfopenllm_v2/BBH": 0.5399, + "hfopenllm_v2/MATH Level 5": 0.4962, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4034, + "hfopenllm_v2/MMLU-PRO": 0.4285 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/PJMixers.json b/data/developers/PJMixers.json new file mode 100644 index 0000000000000000000000000000000000000000..114530932a47032429f611ae892cb01ef93e092f --- /dev/null +++ b/data/developers/PJMixers.json @@ -0,0 +1,19 @@ +{ + "developer": "PJMixers", + "models": [ + { + "id": "PJMixers/LLaMa-3-CursedStock-v2.0-8B", + "name": "LLaMa-3-CursedStock-v2.0-8B", + "developer": "PJMixers", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6331, + "hfopenllm_v2/BBH": 0.5271, + "hfopenllm_v2/MATH Level 5": 0.0944, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3856, + "hfopenllm_v2/MMLU-PRO": 0.3556 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/PKU-Alignment.json b/data/developers/PKU-Alignment.json new file mode 100644 index 0000000000000000000000000000000000000000..c49cf2d2c5ee4c838ff715dbd555f36543ad5f7c --- /dev/null +++ b/data/developers/PKU-Alignment.json @@ -0,0 +1,81 @@ +{ + "developer": "PKU-Alignment", + "models": [ + { + "id": "PKU-Alignment/beaver-7b-v1.0-cost", + "name": "PKU-Alignment/beaver-7b-v1.0-cost", + "developer": "PKU-Alignment", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.3332, + "reward-bench/Chat": 0.6173, + "reward-bench/Chat Hard": 0.4232, + "reward-bench/Safety": 0.7589, + "reward-bench/Reasoning": 0.5482, + "reward-bench/Prior Sets (0.5 weight)": 0.57, + "reward-bench/Factuality": 0.3263, + "reward-bench/Precise IF": 0.2313, + "reward-bench/Math": 0.3989, + "reward-bench/Focus": 0.2939, + "reward-bench/Ties": -0.01 + } + }, + { + "id": "PKU-Alignment/beaver-7b-v1.0-reward", + "name": "PKU-Alignment/beaver-7b-v1.0-reward", + "developer": "PKU-Alignment", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.4727, + "reward-bench/Factuality": 0.2105, + "reward-bench/Precise IF": 0.2938, + "reward-bench/Math": 0.2623, + "reward-bench/Safety": 0.3757, + "reward-bench/Focus": 0.0646, + "reward-bench/Ties": -0.01, + "reward-bench/Chat": 0.8184, + "reward-bench/Chat Hard": 0.2873, + "reward-bench/Reasoning": 0.346, + "reward-bench/Prior Sets (0.5 weight)": 0.5993 + } + }, + { + "id": "PKU-Alignment/beaver-7b-v2.0-cost", + "name": "PKU-Alignment/beaver-7b-v2.0-cost", + "developer": "PKU-Alignment", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.3326, + "reward-bench/Chat": 0.5726, + "reward-bench/Chat Hard": 0.4561, + "reward-bench/Safety": 0.7356, + "reward-bench/Reasoning": 0.6211, + "reward-bench/Prior Sets (0.5 weight)": 0.5397, + "reward-bench/Factuality": 0.3789, + "reward-bench/Precise IF": 0.275, + "reward-bench/Math": 0.3333, + "reward-bench/Focus": 0.2828, + "reward-bench/Ties": -0.01 + } + }, + { + "id": "PKU-Alignment/beaver-7b-v2.0-reward", + "name": "PKU-Alignment/beaver-7b-v2.0-reward", + "developer": "PKU-Alignment", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.2544, + "reward-bench/Chat": 0.8994, + "reward-bench/Chat Hard": 0.364, + "reward-bench/Safety": 0.3156, + "reward-bench/Reasoning": 0.6887, + "reward-bench/Prior Sets (0.5 weight)": 0.6171, + "reward-bench/Factuality": 0.2168, + "reward-bench/Precise IF": 0.2562, + "reward-bench/Math": 0.3825, + "reward-bench/Focus": 0.2606, + "reward-bench/Ties": 0.0944 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Parissa3.json b/data/developers/Parissa3.json new file mode 100644 index 0000000000000000000000000000000000000000..83eccf6cdb08cd71d0852659ed3f4e1c5dc29824 --- /dev/null +++ b/data/developers/Parissa3.json @@ -0,0 +1,19 @@ +{ + "developer": "Parissa3", + "models": [ + { + "id": "Parissa3/test-model", + "name": "test-model", + "developer": "Parissa3", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3883, + "hfopenllm_v2/BBH": 0.5194, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4685, + "hfopenllm_v2/MMLU-PRO": 0.3057 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Pinkstack.json b/data/developers/Pinkstack.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a4d32c248f64869858ac2d7497f1b5f19ed120 --- /dev/null +++ b/data/developers/Pinkstack.json @@ -0,0 +1,61 @@ +{ + "developer": "Pinkstack", + "models": [ + { + "id": "Pinkstack/PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B", + "name": "PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B", + "developer": "Pinkstack", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5085, + "hfopenllm_v2/BBH": 0.4711, + "hfopenllm_v2/MATH Level 5": 0.1692, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4479, + "hfopenllm_v2/MMLU-PRO": 0.3511 + } + }, + { + "id": "Pinkstack/SuperThoughts-CoT-14B-16k-o1-QwQ", + "name": "SuperThoughts-CoT-14B-16k-o1-QwQ", + "developer": "Pinkstack", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0515, + "hfopenllm_v2/BBH": 0.672, + "hfopenllm_v2/MATH Level 5": 0.4199, + "hfopenllm_v2/GPQA": 0.3926, + "hfopenllm_v2/MUSR": 0.4914, + "hfopenllm_v2/MMLU-PRO": 0.5268 + } + }, + { + "id": "Pinkstack/Superthoughts-lite-1.8B-experimental-o1", + "name": "Superthoughts-lite-1.8B-experimental-o1", + "developer": "Pinkstack", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0375, + "hfopenllm_v2/BBH": 0.3435, + "hfopenllm_v2/MATH Level 5": 0.0317, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3354, + "hfopenllm_v2/MMLU-PRO": 0.1851 + } + }, + { + "id": "Pinkstack/Superthoughts-lite-v1", + "name": "Superthoughts-lite-v1", + "developer": "Pinkstack", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1659, + "hfopenllm_v2/BBH": 0.3466, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3672, + "hfopenllm_v2/MMLU-PRO": 0.1755 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/PoLL.json b/data/developers/PoLL.json new file mode 100644 index 0000000000000000000000000000000000000000..654e212b1aaaf403fa50a809752fa41a54bc85fd --- /dev/null +++ b/data/developers/PoLL.json @@ -0,0 +1,18 @@ +{ + "developer": "PoLL", + "models": [ + { + "id": "PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022...", + "name": "PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022...", + "developer": "PoLL", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7578, + "reward-bench/Chat": 0.9525, + "reward-bench/Chat Hard": 0.5406, + "reward-bench/Safety": 0.8034, + "reward-bench/Reasoning": 0.7346 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/PocketDoc.json b/data/developers/PocketDoc.json new file mode 100644 index 0000000000000000000000000000000000000000..8751064d6df577680d6921f745eec4c56dad0ae5 --- /dev/null +++ b/data/developers/PocketDoc.json @@ -0,0 +1,75 @@ +{ + "developer": "PocketDoc", + "models": [ + { + "id": "PocketDoc/Dans-Instruct-CoreCurriculum-12b", + "name": "Dans-Instruct-CoreCurriculum-12b", + "developer": "PocketDoc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2191, + "hfopenllm_v2/BBH": 0.3789, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4096, + "hfopenllm_v2/MMLU-PRO": 0.1219 + } + }, + { + "id": "PocketDoc/Dans-PersonalityEngine-V1.1.0-12b", + "name": "Dans-PersonalityEngine-V1.1.0-12b", + "developer": "PocketDoc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7075, + "hfopenllm_v2/BBH": 0.5361, + "hfopenllm_v2/MATH Level 5": 0.105, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4587, + "hfopenllm_v2/MMLU-PRO": 0.3262 + } + }, + { + "id": "PocketDoc/Dans-PersonalityEngine-V1.2.0-24b", + "name": "Dans-PersonalityEngine-V1.2.0-24b", + "developer": "PocketDoc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7886, + "hfopenllm_v2/BBH": 0.6421, + "hfopenllm_v2/MATH Level 5": 0.2455, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.43, + "hfopenllm_v2/MMLU-PRO": 0.5026 + } + }, + { + "id": "PocketDoc/Dans-PersonalityEngine-v1.0.0-8b", + "name": "Dans-PersonalityEngine-v1.0.0-8b", + "developer": "PocketDoc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4982, + "hfopenllm_v2/BBH": 0.4733, + "hfopenllm_v2/MATH Level 5": 0.0816, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.3542, + "hfopenllm_v2/MMLU-PRO": 0.3065 + } + }, + { + "id": "PocketDoc/Dans-SakuraKaze-V1.0.0-12b", + "name": "Dans-SakuraKaze-V1.0.0-12b", + "developer": "PocketDoc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.652, + "hfopenllm_v2/BBH": 0.5405, + "hfopenllm_v2/MATH Level 5": 0.0929, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4745, + "hfopenllm_v2/MMLU-PRO": 0.356 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/PowerInfer.json b/data/developers/PowerInfer.json new file mode 100644 index 0000000000000000000000000000000000000000..9191dec6772330f9ad0b1eeccf587f30deb4e088 --- /dev/null +++ b/data/developers/PowerInfer.json @@ -0,0 +1,19 @@ +{ + "developer": "PowerInfer", + "models": [ + { + "id": "PowerInfer/SmallThinker-3B-Preview", + "name": "SmallThinker-3B-Preview", + "developer": "PowerInfer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.62, + "hfopenllm_v2/BBH": 0.4495, + "hfopenllm_v2/MATH Level 5": 0.2779, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3525, + "hfopenllm_v2/MMLU-PRO": 0.3018 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/PranavHarshan.json b/data/developers/PranavHarshan.json new file mode 100644 index 0000000000000000000000000000000000000000..309a69bba59fe1c32060e0d57bf412cecc18d376 --- /dev/null +++ b/data/developers/PranavHarshan.json @@ -0,0 +1,33 @@ +{ + "developer": "PranavHarshan", + "models": [ + { + "id": "PranavHarshan/LaMistral-V4", + "name": "LaMistral-V4", + "developer": "PranavHarshan", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6239, + "hfopenllm_v2/BBH": 0.5184, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.3643, + "hfopenllm_v2/MMLU-PRO": 0.3599 + } + }, + { + "id": "PranavHarshan/MedNarra-X1", + "name": "MedNarra-X1", + "developer": "PranavHarshan", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4338, + "hfopenllm_v2/BBH": 0.4637, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.354, + "hfopenllm_v2/MMLU-PRO": 0.3431 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Pretergeek.json b/data/developers/Pretergeek.json new file mode 100644 index 0000000000000000000000000000000000000000..78652e4ddb223f51b181d3459e21a2725b73cd6b --- /dev/null +++ b/data/developers/Pretergeek.json @@ -0,0 +1,131 @@ +{ + "developer": "Pretergeek", + "models": [ + { + "id": "Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Appended", + "name": "OpenChat-3.5-0106_10.7B_48Layers-Appended", + "developer": "Pretergeek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5961, + "hfopenllm_v2/BBH": 0.462, + "hfopenllm_v2/MATH Level 5": 0.0793, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4254, + "hfopenllm_v2/MMLU-PRO": 0.329 + } + }, + { + "id": "Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Interleaved", + "name": "OpenChat-3.5-0106_10.7B_48Layers-Interleaved", + "developer": "Pretergeek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5961, + "hfopenllm_v2/BBH": 0.462, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4254, + "hfopenllm_v2/MMLU-PRO": 0.3299 + } + }, + { + "id": "Pretergeek/OpenChat-3.5-0106_32K-PoSE", + "name": "OpenChat-3.5-0106_32K-PoSE", + "developer": "Pretergeek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3969, + "hfopenllm_v2/BBH": 0.3471, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.4205, + "hfopenllm_v2/MMLU-PRO": 0.2031 + } + }, + { + "id": "Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Appended", + "name": "OpenChat-3.5-0106_8.11B_36Layers-Appended", + "developer": "Pretergeek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5976, + "hfopenllm_v2/BBH": 0.462, + "hfopenllm_v2/MATH Level 5": 0.0793, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4254, + "hfopenllm_v2/MMLU-PRO": 0.329 + } + }, + { + "id": "Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Interleaved", + "name": "OpenChat-3.5-0106_8.11B_36Layers-Interleaved", + "developer": "Pretergeek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5961, + "hfopenllm_v2/BBH": 0.4621, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4241, + "hfopenllm_v2/MMLU-PRO": 0.3299 + } + }, + { + "id": "Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Appended", + "name": "OpenChat-3.5-0106_8.99B_40Layers-Appended", + "developer": "Pretergeek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5961, + "hfopenllm_v2/BBH": 0.462, + "hfopenllm_v2/MATH Level 5": 0.0793, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4254, + "hfopenllm_v2/MMLU-PRO": 0.329 + } + }, + { + "id": "Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Interleaved", + "name": "OpenChat-3.5-0106_8.99B_40Layers-Interleaved", + "developer": "Pretergeek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5976, + "hfopenllm_v2/BBH": 0.4621, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4241, + "hfopenllm_v2/MMLU-PRO": 0.3299 + } + }, + { + "id": "Pretergeek/OpenChat-3.5-0106_9.86B_44Layers-Appended", + "name": "OpenChat-3.5-0106_9.86B_44Layers-Appended", + "developer": "Pretergeek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5961, + "hfopenllm_v2/BBH": 0.462, + "hfopenllm_v2/MATH Level 5": 0.0793, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4254, + "hfopenllm_v2/MMLU-PRO": 0.329 + } + }, + { + "id": "Pretergeek/openchat-3.5-0106_Rebased_Mistral-7B-v0.2", + "name": "openchat-3.5-0106_Rebased_Mistral-7B-v0.2", + "developer": "Pretergeek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3706, + "hfopenllm_v2/BBH": 0.3627, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.484, + "hfopenllm_v2/MMLU-PRO": 0.283 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/PrimeIntellect.json b/data/developers/PrimeIntellect.json new file mode 100644 index 0000000000000000000000000000000000000000..674a0e3b141480d7e0d33d0a2fe9b205b710216f --- /dev/null +++ b/data/developers/PrimeIntellect.json @@ -0,0 +1,33 @@ +{ + "developer": "PrimeIntellect", + "models": [ + { + "id": "PrimeIntellect/INTELLECT-1", + "name": "INTELLECT-1", + "developer": "PrimeIntellect", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1757, + "hfopenllm_v2/BBH": 0.276, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3339, + "hfopenllm_v2/MMLU-PRO": 0.1123 + } + }, + { + "id": "PrimeIntellect/INTELLECT-1-Instruct", + "name": "INTELLECT-1-Instruct", + "developer": "PrimeIntellect", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0, + "hfopenllm_v2/BBH": 0.287, + "hfopenllm_v2/MATH Level 5": 0.0227, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.3577, + "hfopenllm_v2/MMLU-PRO": 0.1064 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/PuxAI.json b/data/developers/PuxAI.json new file mode 100644 index 0000000000000000000000000000000000000000..bfa39503dcfc20f324f1e8e3114d60da6d1bbd13 --- /dev/null +++ b/data/developers/PuxAI.json @@ -0,0 +1,19 @@ +{ + "developer": "PuxAI", + "models": [ + { + "id": "PuxAI/LUA_model", + "name": "LUA_model", + "developer": "PuxAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2282, + "hfopenllm_v2/BBH": 0.2877, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3484, + "hfopenllm_v2/MMLU-PRO": 0.1123 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/PygmalionAI.json b/data/developers/PygmalionAI.json new file mode 100644 index 0000000000000000000000000000000000000000..6a5917e144555e5d99f516ca381f3ee4f0a8f434 --- /dev/null +++ b/data/developers/PygmalionAI.json @@ -0,0 +1,19 @@ +{ + "developer": "PygmalionAI", + "models": [ + { + "id": "PygmalionAI/pygmalion-6b", + "name": "pygmalion-6b", + "developer": "PygmalionAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2091, + "hfopenllm_v2/BBH": 0.3199, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.3684, + "hfopenllm_v2/MMLU-PRO": 0.1184 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Q-bert.json b/data/developers/Q-bert.json new file mode 100644 index 0000000000000000000000000000000000000000..e42745fdd53475572843e194019b98f3f63a3074 --- /dev/null +++ b/data/developers/Q-bert.json @@ -0,0 +1,19 @@ +{ + "developer": "Q-bert", + "models": [ + { + "id": "Q-bert/MetaMath-1B", + "name": "MetaMath-1B", + "developer": "Q-bert", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.53, + "hfopenllm_v2/BBH": 0.3451, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3289, + "hfopenllm_v2/MMLU-PRO": 0.1495 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Quazim0t0.json b/data/developers/Quazim0t0.json new file mode 100644 index 0000000000000000000000000000000000000000..2d35a4f5e966e8ec3da190137c155e9ea4458c4d --- /dev/null +++ b/data/developers/Quazim0t0.json @@ -0,0 +1,985 @@ +{ + "developer": "Quazim0t0", + "models": [ + { + "id": "Quazim0t0/1up-14b", + "name": "1up-14b", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6888, + "hfopenllm_v2/BBH": 0.6921, + "hfopenllm_v2/MATH Level 5": 0.4162, + "hfopenllm_v2/GPQA": 0.3624, + "hfopenllm_v2/MUSR": 0.4583, + "hfopenllm_v2/MMLU-PRO": 0.5406 + } + }, + { + "id": "Quazim0t0/Adamant-14B-sce", + "name": "Adamant-14B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6858, + "hfopenllm_v2/BBH": 0.6859, + "hfopenllm_v2/MATH Level 5": 0.3988, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4558, + "hfopenllm_v2/MMLU-PRO": 0.5372 + } + }, + { + "id": "Quazim0t0/Alice-14B", + "name": "Alice-14B", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6836, + "hfopenllm_v2/BBH": 0.6938, + "hfopenllm_v2/MATH Level 5": 0.4569, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4479, + "hfopenllm_v2/MMLU-PRO": 0.5419 + } + }, + { + "id": "Quazim0t0/Alien-CoT-14B-sce", + "name": "Alien-CoT-14B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0749, + "hfopenllm_v2/BBH": 0.6395, + "hfopenllm_v2/MATH Level 5": 0.5204, + "hfopenllm_v2/GPQA": 0.3918, + "hfopenllm_v2/MUSR": 0.4785, + "hfopenllm_v2/MMLU-PRO": 0.517 + } + }, + { + "id": "Quazim0t0/Aura-8B-Linear", + "name": "Aura-8B-Linear", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7948, + "hfopenllm_v2/BBH": 0.5074, + "hfopenllm_v2/MATH Level 5": 0.1805, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3687, + "hfopenllm_v2/MMLU-PRO": 0.3801 + } + }, + { + "id": "Quazim0t0/Casa-14b-sce", + "name": "Casa-14b-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6718, + "hfopenllm_v2/BBH": 0.6891, + "hfopenllm_v2/MATH Level 5": 0.4985, + "hfopenllm_v2/GPQA": 0.3339, + "hfopenllm_v2/MUSR": 0.4323, + "hfopenllm_v2/MMLU-PRO": 0.5408 + } + }, + { + "id": "Quazim0t0/Charlie-8B-Linear", + "name": "Charlie-8B-Linear", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7381, + "hfopenllm_v2/BBH": 0.5141, + "hfopenllm_v2/MATH Level 5": 0.2651, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3485, + "hfopenllm_v2/MMLU-PRO": 0.3573 + } + }, + { + "id": "Quazim0t0/Chromatic-8b-sce", + "name": "Chromatic-8b-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5085, + "hfopenllm_v2/BBH": 0.5063, + "hfopenllm_v2/MATH Level 5": 0.1556, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4051, + "hfopenllm_v2/MMLU-PRO": 0.3755 + } + }, + { + "id": "Quazim0t0/CoT_Phi", + "name": "CoT_Phi", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6159, + "hfopenllm_v2/BBH": 0.6751, + "hfopenllm_v2/MATH Level 5": 0.3308, + "hfopenllm_v2/GPQA": 0.3582, + "hfopenllm_v2/MUSR": 0.4244, + "hfopenllm_v2/MMLU-PRO": 0.4901 + } + }, + { + "id": "Quazim0t0/Dyson-14b", + "name": "Dyson-14b", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5857, + "hfopenllm_v2/BBH": 0.6863, + "hfopenllm_v2/MATH Level 5": 0.5393, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4259, + "hfopenllm_v2/MMLU-PRO": 0.5399 + } + }, + { + "id": "Quazim0t0/Edu-14B-Linear", + "name": "Edu-14B-Linear", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6158, + "hfopenllm_v2/BBH": 0.6758, + "hfopenllm_v2/MATH Level 5": 0.2447, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4378, + "hfopenllm_v2/MMLU-PRO": 0.5086 + } + }, + { + "id": "Quazim0t0/Fugazi14b", + "name": "Fugazi14b", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6998, + "hfopenllm_v2/BBH": 0.6941, + "hfopenllm_v2/MATH Level 5": 0.4653, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4546, + "hfopenllm_v2/MMLU-PRO": 0.5417 + } + }, + { + "id": "Quazim0t0/GZA-14B-sce", + "name": "GZA-14B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6274, + "hfopenllm_v2/BBH": 0.6687, + "hfopenllm_v2/MATH Level 5": 0.4721, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4285, + "hfopenllm_v2/MMLU-PRO": 0.5232 + } + }, + { + "id": "Quazim0t0/Geedorah-14B", + "name": "Geedorah-14B", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6873, + "hfopenllm_v2/BBH": 0.6964, + "hfopenllm_v2/MATH Level 5": 0.4449, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4547, + "hfopenllm_v2/MMLU-PRO": 0.5421 + } + }, + { + "id": "Quazim0t0/GivingTree-8b-sce", + "name": "GivingTree-8b-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5006, + "hfopenllm_v2/BBH": 0.504, + "hfopenllm_v2/MATH Level 5": 0.1526, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4051, + "hfopenllm_v2/MMLU-PRO": 0.3761 + } + }, + { + "id": "Quazim0t0/GuiltySpark-14B-ties", + "name": "GuiltySpark-14B-ties", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6854, + "hfopenllm_v2/BBH": 0.6914, + "hfopenllm_v2/MATH Level 5": 0.3837, + "hfopenllm_v2/GPQA": 0.3649, + "hfopenllm_v2/MUSR": 0.4557, + "hfopenllm_v2/MMLU-PRO": 0.54 + } + }, + { + "id": "Quazim0t0/Halo-14B-sce", + "name": "Halo-14B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6754, + "hfopenllm_v2/BBH": 0.6876, + "hfopenllm_v2/MATH Level 5": 0.429, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4401, + "hfopenllm_v2/MMLU-PRO": 0.5376 + } + }, + { + "id": "Quazim0t0/Heretic1.5b", + "name": "Heretic1.5b", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2062, + "hfopenllm_v2/BBH": 0.3529, + "hfopenllm_v2/MATH Level 5": 0.244, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3511, + "hfopenllm_v2/MMLU-PRO": 0.1728 + } + }, + { + "id": "Quazim0t0/Hyde-14b-sce", + "name": "Hyde-14b-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6715, + "hfopenllm_v2/BBH": 0.6885, + "hfopenllm_v2/MATH Level 5": 0.2734, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.4141, + "hfopenllm_v2/MMLU-PRO": 0.53 + } + }, + { + "id": "Quazim0t0/Imagine-v0.5-16bit", + "name": "Imagine-v0.5-16bit", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2759, + "hfopenllm_v2/BBH": 0.6769, + "hfopenllm_v2/MATH Level 5": 0.1397, + "hfopenllm_v2/GPQA": 0.3649, + "hfopenllm_v2/MUSR": 0.4349, + "hfopenllm_v2/MMLU-PRO": 0.5354 + } + }, + { + "id": "Quazim0t0/Imbue-14b", + "name": "Imbue-14b", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.52, + "hfopenllm_v2/BBH": 0.6845, + "hfopenllm_v2/MATH Level 5": 0.5317, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4167, + "hfopenllm_v2/MMLU-PRO": 0.5402 + } + }, + { + "id": "Quazim0t0/Insom", + "name": "Insom", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6818, + "hfopenllm_v2/BBH": 0.6881, + "hfopenllm_v2/MATH Level 5": 0.3852, + "hfopenllm_v2/GPQA": 0.3498, + "hfopenllm_v2/MUSR": 0.4311, + "hfopenllm_v2/MMLU-PRO": 0.5352 + } + }, + { + "id": "Quazim0t0/InspectorDeck-14B-sce", + "name": "InspectorDeck-14B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3241, + "hfopenllm_v2/BBH": 0.6668, + "hfopenllm_v2/MATH Level 5": 0.3165, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.3982, + "hfopenllm_v2/MMLU-PRO": 0.5261 + } + }, + { + "id": "Quazim0t0/Jekyl-8b-sce", + "name": "Jekyl-8b-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4697, + "hfopenllm_v2/BBH": 0.4994, + "hfopenllm_v2/MATH Level 5": 0.1616, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4197, + "hfopenllm_v2/MMLU-PRO": 0.3686 + } + }, + { + "id": "Quazim0t0/Jigsaw-14B-Linear", + "name": "Jigsaw-14B-Linear", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.648, + "hfopenllm_v2/BBH": 0.6865, + "hfopenllm_v2/MATH Level 5": 0.2651, + "hfopenllm_v2/GPQA": 0.3406, + "hfopenllm_v2/MUSR": 0.4483, + "hfopenllm_v2/MMLU-PRO": 0.5234 + } + }, + { + "id": "Quazim0t0/Katana-8b-sce", + "name": "Katana-8b-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5107, + "hfopenllm_v2/BBH": 0.5075, + "hfopenllm_v2/MATH Level 5": 0.1511, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4038, + "hfopenllm_v2/MMLU-PRO": 0.3771 + } + }, + { + "id": "Quazim0t0/Knot-CoT-14B-sce", + "name": "Knot-CoT-14B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4832, + "hfopenllm_v2/BBH": 0.6616, + "hfopenllm_v2/MATH Level 5": 0.3995, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.414, + "hfopenllm_v2/MMLU-PRO": 0.5154 + } + }, + { + "id": "Quazim0t0/Lineage-14B", + "name": "Lineage-14B", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.707, + "hfopenllm_v2/BBH": 0.6934, + "hfopenllm_v2/MATH Level 5": 0.4245, + "hfopenllm_v2/GPQA": 0.3599, + "hfopenllm_v2/MUSR": 0.4597, + "hfopenllm_v2/MMLU-PRO": 0.5411 + } + }, + { + "id": "Quazim0t0/Lo-Phi-14b", + "name": "Lo-Phi-14b", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4941, + "hfopenllm_v2/BBH": 0.6852, + "hfopenllm_v2/MATH Level 5": 0.5196, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4232, + "hfopenllm_v2/MMLU-PRO": 0.5369 + } + }, + { + "id": "Quazim0t0/Loke-14B-sce", + "name": "Loke-14B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6848, + "hfopenllm_v2/BBH": 0.6924, + "hfopenllm_v2/MATH Level 5": 0.3905, + "hfopenllm_v2/GPQA": 0.3649, + "hfopenllm_v2/MUSR": 0.4637, + "hfopenllm_v2/MMLU-PRO": 0.5401 + } + }, + { + "id": "Quazim0t0/MFDOOM-14B", + "name": "MFDOOM-14B", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6736, + "hfopenllm_v2/BBH": 0.6916, + "hfopenllm_v2/MATH Level 5": 0.5264, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4377, + "hfopenllm_v2/MMLU-PRO": 0.5426 + } + }, + { + "id": "Quazim0t0/MFGRIMM-14B", + "name": "MFGRIMM-14B", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6894, + "hfopenllm_v2/BBH": 0.6909, + "hfopenllm_v2/MATH Level 5": 0.506, + "hfopenllm_v2/GPQA": 0.3339, + "hfopenllm_v2/MUSR": 0.4361, + "hfopenllm_v2/MMLU-PRO": 0.5416 + } + }, + { + "id": "Quazim0t0/Math_Phi4_Reason", + "name": "Math_Phi4_Reason", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.322, + "hfopenllm_v2/BBH": 0.624, + "hfopenllm_v2/MATH Level 5": 0.3278, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4034, + "hfopenllm_v2/MMLU-PRO": 0.503 + } + }, + { + "id": "Quazim0t0/Mithril-14B-sce", + "name": "Mithril-14B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6958, + "hfopenllm_v2/BBH": 0.6926, + "hfopenllm_v2/MATH Level 5": 0.3822, + "hfopenllm_v2/GPQA": 0.3691, + "hfopenllm_v2/MUSR": 0.4611, + "hfopenllm_v2/MMLU-PRO": 0.5403 + } + }, + { + "id": "Quazim0t0/Mononoke-14B-sce", + "name": "Mononoke-14B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3502, + "hfopenllm_v2/BBH": 0.6744, + "hfopenllm_v2/MATH Level 5": 0.4698, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4155, + "hfopenllm_v2/MMLU-PRO": 0.5298 + } + }, + { + "id": "Quazim0t0/Motion-8B-Linear", + "name": "Motion-8B-Linear", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7686, + "hfopenllm_v2/BBH": 0.5084, + "hfopenllm_v2/MATH Level 5": 0.1888, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3606, + "hfopenllm_v2/MMLU-PRO": 0.3785 + } + }, + { + "id": "Quazim0t0/Mouse-9B", + "name": "Mouse-9B", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1325, + "hfopenllm_v2/BBH": 0.2979, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.347, + "hfopenllm_v2/MMLU-PRO": 0.1139 + } + }, + { + "id": "Quazim0t0/Nova-14b-sce", + "name": "Nova-14b-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7022, + "hfopenllm_v2/BBH": 0.6935, + "hfopenllm_v2/MATH Level 5": 0.4162, + "hfopenllm_v2/GPQA": 0.3633, + "hfopenllm_v2/MUSR": 0.4571, + "hfopenllm_v2/MMLU-PRO": 0.5413 + } + }, + { + "id": "Quazim0t0/NovaScotia-14b-stock", + "name": "NovaScotia-14b-stock", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6787, + "hfopenllm_v2/BBH": 0.6935, + "hfopenllm_v2/MATH Level 5": 0.463, + "hfopenllm_v2/GPQA": 0.349, + "hfopenllm_v2/MUSR": 0.4493, + "hfopenllm_v2/MMLU-PRO": 0.5409 + } + }, + { + "id": "Quazim0t0/ODB-14B-sce", + "name": "ODB-14B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2922, + "hfopenllm_v2/BBH": 0.6559, + "hfopenllm_v2/MATH Level 5": 0.2545, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3929, + "hfopenllm_v2/MMLU-PRO": 0.5207 + } + }, + { + "id": "Quazim0t0/ODB-14b-sce", + "name": "ODB-14b-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7016, + "hfopenllm_v2/BBH": 0.6942, + "hfopenllm_v2/MATH Level 5": 0.4116, + "hfopenllm_v2/GPQA": 0.3624, + "hfopenllm_v2/MUSR": 0.4571, + "hfopenllm_v2/MMLU-PRO": 0.5411 + } + }, + { + "id": "Quazim0t0/Oasis-14B-ties", + "name": "Oasis-14B-ties", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6937, + "hfopenllm_v2/BBH": 0.6915, + "hfopenllm_v2/MATH Level 5": 0.3754, + "hfopenllm_v2/GPQA": 0.3649, + "hfopenllm_v2/MUSR": 0.4571, + "hfopenllm_v2/MMLU-PRO": 0.5405 + } + }, + { + "id": "Quazim0t0/Origami-14B-sce", + "name": "Origami-14B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3259, + "hfopenllm_v2/BBH": 0.662, + "hfopenllm_v2/MATH Level 5": 0.2915, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4035, + "hfopenllm_v2/MMLU-PRO": 0.5244 + } + }, + { + "id": "Quazim0t0/Phi4.Turn.R1Distill.16bit", + "name": "Phi4.Turn.R1Distill.16bit", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3126, + "hfopenllm_v2/BBH": 0.6563, + "hfopenllm_v2/MATH Level 5": 0.2311, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3902, + "hfopenllm_v2/MMLU-PRO": 0.5257 + } + }, + { + "id": "Quazim0t0/Phi4.Turn.R1Distill_v1.5.1-Tensors", + "name": "Phi4.Turn.R1Distill_v1.5.1-Tensors", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2995, + "hfopenllm_v2/BBH": 0.6456, + "hfopenllm_v2/MATH Level 5": 0.219, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3929, + "hfopenllm_v2/MMLU-PRO": 0.5117 + } + }, + { + "id": "Quazim0t0/Phi4Basis-14B-sce", + "name": "Phi4Basis-14B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6502, + "hfopenllm_v2/BBH": 0.6909, + "hfopenllm_v2/MATH Level 5": 0.4789, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4338, + "hfopenllm_v2/MMLU-PRO": 0.539 + } + }, + { + "id": "Quazim0t0/Ponder-14B-linear", + "name": "Ponder-14B-linear", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6906, + "hfopenllm_v2/BBH": 0.6943, + "hfopenllm_v2/MATH Level 5": 0.4282, + "hfopenllm_v2/GPQA": 0.3582, + "hfopenllm_v2/MUSR": 0.4558, + "hfopenllm_v2/MMLU-PRO": 0.5408 + } + }, + { + "id": "Quazim0t0/RZA-14B-sce", + "name": "RZA-14B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4774, + "hfopenllm_v2/BBH": 0.6686, + "hfopenllm_v2/MATH Level 5": 0.5189, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4113, + "hfopenllm_v2/MMLU-PRO": 0.5383 + } + }, + { + "id": "Quazim0t0/Rosemary-14b", + "name": "Rosemary-14b", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6915, + "hfopenllm_v2/BBH": 0.6955, + "hfopenllm_v2/MATH Level 5": 0.4388, + "hfopenllm_v2/GPQA": 0.3565, + "hfopenllm_v2/MUSR": 0.4492, + "hfopenllm_v2/MMLU-PRO": 0.5396 + } + }, + { + "id": "Quazim0t0/Rune-14b", + "name": "Rune-14b", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7016, + "hfopenllm_v2/BBH": 0.6937, + "hfopenllm_v2/MATH Level 5": 0.4585, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4533, + "hfopenllm_v2/MMLU-PRO": 0.5411 + } + }, + { + "id": "Quazim0t0/SZA-14B-sce", + "name": "SZA-14B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5659, + "hfopenllm_v2/BBH": 0.6889, + "hfopenllm_v2/MATH Level 5": 0.5242, + "hfopenllm_v2/GPQA": 0.3305, + "hfopenllm_v2/MUSR": 0.4339, + "hfopenllm_v2/MMLU-PRO": 0.5353 + } + }, + { + "id": "Quazim0t0/Sake-20b", + "name": "Sake-20b", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6693, + "hfopenllm_v2/BBH": 0.677, + "hfopenllm_v2/MATH Level 5": 0.4653, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4494, + "hfopenllm_v2/MMLU-PRO": 0.5391 + } + }, + { + "id": "Quazim0t0/Spok-14b-sce", + "name": "Spok-14b-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6682, + "hfopenllm_v2/BBH": 0.6899, + "hfopenllm_v2/MATH Level 5": 0.2719, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4141, + "hfopenllm_v2/MMLU-PRO": 0.5298 + } + }, + { + "id": "Quazim0t0/Sumatra-20b", + "name": "Sumatra-20b", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6738, + "hfopenllm_v2/BBH": 0.6855, + "hfopenllm_v2/MATH Level 5": 0.3671, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.456, + "hfopenllm_v2/MMLU-PRO": 0.5415 + } + }, + { + "id": "Quazim0t0/SuperNova14b", + "name": "SuperNova14b", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7076, + "hfopenllm_v2/BBH": 0.6937, + "hfopenllm_v2/MATH Level 5": 0.4396, + "hfopenllm_v2/GPQA": 0.3523, + "hfopenllm_v2/MUSR": 0.4545, + "hfopenllm_v2/MMLU-PRO": 0.5435 + } + }, + { + "id": "Quazim0t0/TB0-8B-sce", + "name": "TB0-8B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5107, + "hfopenllm_v2/BBH": 0.5075, + "hfopenllm_v2/MATH Level 5": 0.1511, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4038, + "hfopenllm_v2/MMLU-PRO": 0.3771 + } + }, + { + "id": "Quazim0t0/TBL-8B-sce", + "name": "TBL-8B-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4581, + "hfopenllm_v2/BBH": 0.5008, + "hfopenllm_v2/MATH Level 5": 0.1533, + "hfopenllm_v2/GPQA": 0.3339, + "hfopenllm_v2/MUSR": 0.4236, + "hfopenllm_v2/MMLU-PRO": 0.3689 + } + }, + { + "id": "Quazim0t0/ThinkPhi1.1-Tensors", + "name": "ThinkPhi1.1-Tensors", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3908, + "hfopenllm_v2/BBH": 0.6449, + "hfopenllm_v2/MATH Level 5": 0.182, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.418, + "hfopenllm_v2/MMLU-PRO": 0.4908 + } + }, + { + "id": "Quazim0t0/Venti-20b", + "name": "Venti-20b", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6641, + "hfopenllm_v2/BBH": 0.6901, + "hfopenllm_v2/MATH Level 5": 0.3391, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.448, + "hfopenllm_v2/MMLU-PRO": 0.5386 + } + }, + { + "id": "Quazim0t0/Venti-Blend-sce", + "name": "Venti-Blend-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6879, + "hfopenllm_v2/BBH": 0.6843, + "hfopenllm_v2/MATH Level 5": 0.4056, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4389, + "hfopenllm_v2/MMLU-PRO": 0.5414 + } + }, + { + "id": "Quazim0t0/Vine-14b-sce", + "name": "Vine-14b-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6733, + "hfopenllm_v2/BBH": 0.6891, + "hfopenllm_v2/MATH Level 5": 0.5008, + "hfopenllm_v2/GPQA": 0.3339, + "hfopenllm_v2/MUSR": 0.4323, + "hfopenllm_v2/MMLU-PRO": 0.5408 + } + }, + { + "id": "Quazim0t0/Wendy-14B", + "name": "Wendy-14B", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6772, + "hfopenllm_v2/BBH": 0.6958, + "hfopenllm_v2/MATH Level 5": 0.4834, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.4428, + "hfopenllm_v2/MMLU-PRO": 0.5435 + } + }, + { + "id": "Quazim0t0/Wu-14b-sce", + "name": "Wu-14b-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6718, + "hfopenllm_v2/BBH": 0.6885, + "hfopenllm_v2/MATH Level 5": 0.2613, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.4114, + "hfopenllm_v2/MMLU-PRO": 0.5293 + } + }, + { + "id": "Quazim0t0/bloom-14b-stock", + "name": "bloom-14b-stock", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6575, + "hfopenllm_v2/BBH": 0.6878, + "hfopenllm_v2/MATH Level 5": 0.4811, + "hfopenllm_v2/GPQA": 0.3314, + "hfopenllm_v2/MUSR": 0.431, + "hfopenllm_v2/MMLU-PRO": 0.5373 + } + }, + { + "id": "Quazim0t0/caramel-14B", + "name": "caramel-14B", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6745, + "hfopenllm_v2/BBH": 0.6919, + "hfopenllm_v2/MATH Level 5": 0.4713, + "hfopenllm_v2/GPQA": 0.3448, + "hfopenllm_v2/MUSR": 0.4454, + "hfopenllm_v2/MMLU-PRO": 0.5436 + } + }, + { + "id": "Quazim0t0/graphite-14b-sce", + "name": "graphite-14b-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3217, + "hfopenllm_v2/BBH": 0.6631, + "hfopenllm_v2/MATH Level 5": 0.3006, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3981, + "hfopenllm_v2/MMLU-PRO": 0.528 + } + }, + { + "id": "Quazim0t0/mocha-14B", + "name": "mocha-14B", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5893, + "hfopenllm_v2/BBH": 0.6895, + "hfopenllm_v2/MATH Level 5": 0.5264, + "hfopenllm_v2/GPQA": 0.3305, + "hfopenllm_v2/MUSR": 0.4272, + "hfopenllm_v2/MMLU-PRO": 0.5384 + } + }, + { + "id": "Quazim0t0/mosaic-14b-sce", + "name": "mosaic-14b-sce", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6876, + "hfopenllm_v2/BBH": 0.6907, + "hfopenllm_v2/MATH Level 5": 0.4026, + "hfopenllm_v2/GPQA": 0.3624, + "hfopenllm_v2/MUSR": 0.4558, + "hfopenllm_v2/MMLU-PRO": 0.5396 + } + }, + { + "id": "Quazim0t0/tesseract-14b-stock", + "name": "tesseract-14b-stock", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5848, + "hfopenllm_v2/BBH": 0.688, + "hfopenllm_v2/MATH Level 5": 0.5144, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.4232, + "hfopenllm_v2/MMLU-PRO": 0.5389 + } + }, + { + "id": "Quazim0t0/time-14b-stock", + "name": "time-14b-stock", + "developer": "Quazim0t0", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6699, + "hfopenllm_v2/BBH": 0.6897, + "hfopenllm_v2/MATH Level 5": 0.5083, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4323, + "hfopenllm_v2/MMLU-PRO": 0.5419 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Qwen.json b/data/developers/Qwen.json new file mode 100644 index 0000000000000000000000000000000000000000..782ee6834856d0a95850e189e3f494255390e68a --- /dev/null +++ b/data/developers/Qwen.json @@ -0,0 +1,882 @@ +{ + "developer": "Qwen", + "models": [ + { + "id": "Qwen/QwQ-32B", + "name": "QwQ-32B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3977, + "hfopenllm_v2/BBH": 0.2983, + "hfopenllm_v2/MATH Level 5": 0.1609, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.4206, + "hfopenllm_v2/MMLU-PRO": 0.1196 + } + }, + { + "id": "Qwen/QwQ-32B-Preview", + "name": "QwQ-32B-Preview", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4035, + "hfopenllm_v2/BBH": 0.6691, + "hfopenllm_v2/MATH Level 5": 0.4494, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.411, + "hfopenllm_v2/MMLU-PRO": 0.5678 + } + }, + { + "id": "Qwen/Qwen1.5-0.5B", + "name": "Qwen1.5-0.5B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1706, + "hfopenllm_v2/BBH": 0.3154, + "hfopenllm_v2/MATH Level 5": 0.0174, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3616, + "hfopenllm_v2/MMLU-PRO": 0.1307 + } + }, + { + "id": "Qwen/Qwen1.5-0.5B-Chat", + "name": "Qwen1.5-0.5B-Chat", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1807, + "hfopenllm_v2/BBH": 0.3167, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3837, + "hfopenllm_v2/MMLU-PRO": 0.1213, + "reward-bench/Score": 0.5298, + "reward-bench/Chat": 0.3547, + "reward-bench/Chat Hard": 0.6294, + "reward-bench/Safety": 0.5703, + "reward-bench/Reasoning": 0.5984, + "reward-bench/Prior Sets (0.5 weight)": 0.4629 + } + }, + { + "id": "Qwen/Qwen1.5-1.8B", + "name": "Qwen1.5-1.8B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2154, + "hfopenllm_v2/BBH": 0.3476, + "hfopenllm_v2/MATH Level 5": 0.0317, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.3605, + "hfopenllm_v2/MMLU-PRO": 0.1882 + } + }, + { + "id": "Qwen/Qwen1.5-1.8B-Chat", + "name": "Qwen1.5-1.8B-Chat", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2019, + "hfopenllm_v2/BBH": 0.3256, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.426, + "hfopenllm_v2/MMLU-PRO": 0.1804, + "reward-bench/Score": 0.589, + "reward-bench/Chat": 0.5615, + "reward-bench/Chat Hard": 0.6031, + "reward-bench/Safety": 0.4838, + "reward-bench/Reasoning": 0.7793, + "reward-bench/Prior Sets (0.5 weight)": 0.4453 + } + }, + { + "id": "Qwen/Qwen1.5-110B", + "name": "Qwen1.5-110B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3422, + "hfopenllm_v2/BBH": 0.61, + "hfopenllm_v2/MATH Level 5": 0.247, + "hfopenllm_v2/GPQA": 0.3523, + "hfopenllm_v2/MUSR": 0.4408, + "hfopenllm_v2/MMLU-PRO": 0.5361 + } + }, + { + "id": "Qwen/Qwen1.5-110B-Chat", + "name": "Qwen1.5-110B-Chat", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5939, + "hfopenllm_v2/BBH": 0.6184, + "hfopenllm_v2/MATH Level 5": 0.2341, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.4522, + "hfopenllm_v2/MMLU-PRO": 0.4825 + } + }, + { + "id": "Qwen/Qwen1.5-14B", + "name": "Qwen1.5-14B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2905, + "hfopenllm_v2/BBH": 0.508, + "hfopenllm_v2/MATH Level 5": 0.2024, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.3644 + } + }, + { + "id": "Qwen/Qwen1.5-14B-Chat", + "name": "Qwen1.5-14B-Chat", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4768, + "hfopenllm_v2/BBH": 0.5229, + "hfopenllm_v2/MATH Level 5": 0.1526, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.44, + "hfopenllm_v2/MMLU-PRO": 0.3618, + "reward-bench/Score": 0.6864, + "reward-bench/Chat": 0.5726, + "reward-bench/Chat Hard": 0.7018, + "reward-bench/Safety": 0.7122, + "reward-bench/Reasoning": 0.8961, + "reward-bench/Prior Sets (0.5 weight)": 0.4123 + } + }, + { + "id": "Qwen/Qwen1.5-32B", + "name": "Qwen1.5-32B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3297, + "hfopenllm_v2/BBH": 0.5715, + "hfopenllm_v2/MATH Level 5": 0.3029, + "hfopenllm_v2/GPQA": 0.3297, + "hfopenllm_v2/MUSR": 0.4278, + "hfopenllm_v2/MMLU-PRO": 0.45 + } + }, + { + "id": "Qwen/Qwen1.5-32B-Chat", + "name": "Qwen1.5-32B-Chat", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5532, + "hfopenllm_v2/BBH": 0.6067, + "hfopenllm_v2/MATH Level 5": 0.1956, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.416, + "hfopenllm_v2/MMLU-PRO": 0.4457 + } + }, + { + "id": "Qwen/Qwen1.5-4B", + "name": "Qwen1.5-4B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2445, + "hfopenllm_v2/BBH": 0.4054, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.3604, + "hfopenllm_v2/MMLU-PRO": 0.246 + } + }, + { + "id": "Qwen/Qwen1.5-4B-Chat", + "name": "Qwen1.5-4B-Chat", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3157, + "hfopenllm_v2/BBH": 0.4006, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3978, + "hfopenllm_v2/MMLU-PRO": 0.2396, + "reward-bench/Score": 0.5477, + "reward-bench/Chat": 0.3883, + "reward-bench/Chat Hard": 0.6272, + "reward-bench/Safety": 0.5568, + "reward-bench/Reasoning": 0.6689, + "reward-bench/Prior Sets (0.5 weight)": 0.447 + } + }, + { + "id": "Qwen/Qwen1.5-72B-Chat", + "name": "Qwen/Qwen1.5-72B-Chat", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6723, + "reward-bench/Chat": 0.6229, + "reward-bench/Chat Hard": 0.6601, + "reward-bench/Safety": 0.6757, + "reward-bench/Reasoning": 0.8554, + "reward-bench/Prior Sets (0.5 weight)": 0.4226 + } + }, + { + "id": "Qwen/Qwen1.5-7B", + "name": "Qwen1.5-7B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2684, + "hfopenllm_v2/BBH": 0.456, + "hfopenllm_v2/MATH Level 5": 0.0929, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4103, + "hfopenllm_v2/MMLU-PRO": 0.2916 + } + }, + { + "id": "Qwen/Qwen1.5-7B-Chat", + "name": "Qwen1.5-7B-Chat", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4371, + "hfopenllm_v2/BBH": 0.451, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.3779, + "hfopenllm_v2/MMLU-PRO": 0.2951, + "reward-bench/Score": 0.675, + "reward-bench/Chat": 0.5363, + "reward-bench/Chat Hard": 0.6908, + "reward-bench/Safety": 0.6919, + "reward-bench/Reasoning": 0.9041, + "reward-bench/Prior Sets (0.5 weight)": 0.4288 + } + }, + { + "id": "Qwen/Qwen1.5-MoE-A2.7B", + "name": "Qwen1.5-MoE-A2.7B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.266, + "hfopenllm_v2/BBH": 0.4114, + "hfopenllm_v2/MATH Level 5": 0.0929, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.4013, + "hfopenllm_v2/MMLU-PRO": 0.2778 + } + }, + { + "id": "Qwen/Qwen1.5-MoE-A2.7B-Chat", + "name": "Qwen1.5-MoE-A2.7B-Chat", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3795, + "hfopenllm_v2/BBH": 0.4272, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3899, + "hfopenllm_v2/MMLU-PRO": 0.2923, + "reward-bench/Score": 0.6644, + "reward-bench/Chat": 0.7291, + "reward-bench/Chat Hard": 0.6316, + "reward-bench/Safety": 0.6284, + "reward-bench/Reasoning": 0.774, + "reward-bench/Prior Sets (0.5 weight)": 0.4536 + } + }, + { + "id": "Qwen/Qwen2-0.5B", + "name": "Qwen2-0.5B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1873, + "hfopenllm_v2/BBH": 0.3239, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3752, + "hfopenllm_v2/MMLU-PRO": 0.172 + } + }, + { + "id": "Qwen/Qwen2-0.5B-Instruct", + "name": "Qwen2-0.5B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2247, + "hfopenllm_v2/BBH": 0.3173, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2466, + "hfopenllm_v2/MUSR": 0.3353, + "hfopenllm_v2/MMLU-PRO": 0.1531 + } + }, + { + "id": "Qwen/Qwen2-1.5B", + "name": "Qwen2-1.5B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2113, + "hfopenllm_v2/BBH": 0.3575, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3658, + "hfopenllm_v2/MMLU-PRO": 0.2552 + } + }, + { + "id": "Qwen/Qwen2-1.5B-Instruct", + "name": "Qwen2-1.5B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3371, + "hfopenllm_v2/BBH": 0.3852, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.4293, + "hfopenllm_v2/MMLU-PRO": 0.2501 + } + }, + { + "id": "Qwen/Qwen2-57B-A14B", + "name": "Qwen2-57B-A14B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3113, + "hfopenllm_v2/BBH": 0.5618, + "hfopenllm_v2/MATH Level 5": 0.1866, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4174, + "hfopenllm_v2/MMLU-PRO": 0.4916 + } + }, + { + "id": "Qwen/Qwen2-57B-A14B-Instruct", + "name": "Qwen2-57B-A14B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6338, + "hfopenllm_v2/BBH": 0.5888, + "hfopenllm_v2/MATH Level 5": 0.2817, + "hfopenllm_v2/GPQA": 0.3314, + "hfopenllm_v2/MUSR": 0.4361, + "hfopenllm_v2/MMLU-PRO": 0.4575 + } + }, + { + "id": "Qwen/Qwen2-72B", + "name": "Qwen2-72B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3824, + "hfopenllm_v2/BBH": 0.6617, + "hfopenllm_v2/MATH Level 5": 0.3112, + "hfopenllm_v2/GPQA": 0.3943, + "hfopenllm_v2/MUSR": 0.4704, + "hfopenllm_v2/MMLU-PRO": 0.5731 + } + }, + { + "id": "Qwen/Qwen2-72B-Instruct", + "name": "Qwen2-72B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7989, + "hfopenllm_v2/BBH": 0.6977, + "hfopenllm_v2/MATH Level 5": 0.4177, + "hfopenllm_v2/GPQA": 0.3725, + "hfopenllm_v2/MUSR": 0.456, + "hfopenllm_v2/MMLU-PRO": 0.5403 + } + }, + { + "id": "Qwen/Qwen2-7B", + "name": "Qwen2-7B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3149, + "hfopenllm_v2/BBH": 0.5315, + "hfopenllm_v2/MATH Level 5": 0.2039, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4439, + "hfopenllm_v2/MMLU-PRO": 0.4183 + } + }, + { + "id": "Qwen/Qwen2-7B-Instruct", + "name": "Qwen2-7B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5679, + "hfopenllm_v2/BBH": 0.5545, + "hfopenllm_v2/MATH Level 5": 0.2764, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.3928, + "hfopenllm_v2/MMLU-PRO": 0.3847 + } + }, + { + "id": "Qwen/Qwen2-Math-72B-Instruct", + "name": "Qwen2-Math-72B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5694, + "hfopenllm_v2/BBH": 0.6343, + "hfopenllm_v2/MATH Level 5": 0.5536, + "hfopenllm_v2/GPQA": 0.3683, + "hfopenllm_v2/MUSR": 0.4517, + "hfopenllm_v2/MMLU-PRO": 0.4273 + } + }, + { + "id": "Qwen/Qwen2-Math-7B", + "name": "Qwen2-Math-7B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2687, + "hfopenllm_v2/BBH": 0.387, + "hfopenllm_v2/MATH Level 5": 0.2477, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3593, + "hfopenllm_v2/MMLU-PRO": 0.1197 + } + }, + { + "id": "Qwen/Qwen2-VL-72B-Instruct", + "name": "Qwen2-VL-72B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5982, + "hfopenllm_v2/BBH": 0.6946, + "hfopenllm_v2/MATH Level 5": 0.3444, + "hfopenllm_v2/GPQA": 0.3876, + "hfopenllm_v2/MUSR": 0.4492, + "hfopenllm_v2/MMLU-PRO": 0.5717 + } + }, + { + "id": "Qwen/Qwen2-VL-7B-Instruct", + "name": "Qwen2-VL-7B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4599, + "hfopenllm_v2/BBH": 0.5465, + "hfopenllm_v2/MATH Level 5": 0.1986, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4375, + "hfopenllm_v2/MMLU-PRO": 0.4095 + } + }, + { + "id": "Qwen/Qwen2.5-0.5B", + "name": "Qwen2.5-0.5B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1627, + "hfopenllm_v2/BBH": 0.3275, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2466, + "hfopenllm_v2/MUSR": 0.3433, + "hfopenllm_v2/MMLU-PRO": 0.1906 + } + }, + { + "id": "Qwen/Qwen2.5-0.5B-Instruct", + "name": "Qwen2.5-0.5B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3071, + "hfopenllm_v2/BBH": 0.3341, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3329, + "hfopenllm_v2/MMLU-PRO": 0.1697 + } + }, + { + "id": "Qwen/Qwen2.5-1.5B", + "name": "Qwen2.5-1.5B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2674, + "hfopenllm_v2/BBH": 0.4078, + "hfopenllm_v2/MATH Level 5": 0.0914, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.3576, + "hfopenllm_v2/MMLU-PRO": 0.2855 + } + }, + { + "id": "Qwen/Qwen2.5-1.5B-Instruct", + "name": "Qwen2.5-1.5B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4476, + "hfopenllm_v2/BBH": 0.4289, + "hfopenllm_v2/MATH Level 5": 0.2205, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3663, + "hfopenllm_v2/MMLU-PRO": 0.2799 + } + }, + { + "id": "Qwen/Qwen2.5-14B", + "name": "Qwen2.5-14B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3694, + "hfopenllm_v2/BBH": 0.6161, + "hfopenllm_v2/MATH Level 5": 0.29, + "hfopenllm_v2/GPQA": 0.3817, + "hfopenllm_v2/MUSR": 0.4502, + "hfopenllm_v2/MMLU-PRO": 0.5249 + } + }, + { + "id": "Qwen/Qwen2.5-14B-Instruct", + "name": "Qwen2.5-14B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8158, + "hfopenllm_v2/BBH": 0.639, + "hfopenllm_v2/MATH Level 5": 0.5476, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4101, + "hfopenllm_v2/MMLU-PRO": 0.4904 + } + }, + { + "id": "Qwen/Qwen2.5-14B-Instruct-1M", + "name": "Qwen2.5-14B-Instruct-1M", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8414, + "hfopenllm_v2/BBH": 0.6198, + "hfopenllm_v2/MATH Level 5": 0.5302, + "hfopenllm_v2/GPQA": 0.3431, + "hfopenllm_v2/MUSR": 0.418, + "hfopenllm_v2/MMLU-PRO": 0.485 + } + }, + { + "id": "Qwen/Qwen2.5-32B", + "name": "Qwen2.5-32B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4077, + "hfopenllm_v2/BBH": 0.6771, + "hfopenllm_v2/MATH Level 5": 0.3565, + "hfopenllm_v2/GPQA": 0.4119, + "hfopenllm_v2/MUSR": 0.4978, + "hfopenllm_v2/MMLU-PRO": 0.5805 + } + }, + { + "id": "Qwen/Qwen2.5-32B-Instruct", + "name": "Qwen2.5-32B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8346, + "hfopenllm_v2/BBH": 0.6913, + "hfopenllm_v2/MATH Level 5": 0.6254, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4261, + "hfopenllm_v2/MMLU-PRO": 0.5667 + } + }, + { + "id": "Qwen/Qwen2.5-3B", + "name": "Qwen2.5-3B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.269, + "hfopenllm_v2/BBH": 0.4612, + "hfopenllm_v2/MATH Level 5": 0.148, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4303, + "hfopenllm_v2/MMLU-PRO": 0.3203 + } + }, + { + "id": "Qwen/Qwen2.5-3B-Instruct", + "name": "Qwen2.5-3B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6475, + "hfopenllm_v2/BBH": 0.4693, + "hfopenllm_v2/MATH Level 5": 0.3678, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3968, + "hfopenllm_v2/MMLU-PRO": 0.3255 + } + }, + { + "id": "Qwen/Qwen2.5-72B", + "name": "Qwen2.5-72B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4137, + "hfopenllm_v2/BBH": 0.6797, + "hfopenllm_v2/MATH Level 5": 0.3912, + "hfopenllm_v2/GPQA": 0.4052, + "hfopenllm_v2/MUSR": 0.4771, + "hfopenllm_v2/MMLU-PRO": 0.5968 + } + }, + { + "id": "Qwen/Qwen2.5-72B-Instruct", + "name": "Qwen2.5-72B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8638, + "hfopenllm_v2/BBH": 0.7273, + "hfopenllm_v2/MATH Level 5": 0.5982, + "hfopenllm_v2/GPQA": 0.375, + "hfopenllm_v2/MUSR": 0.4206, + "hfopenllm_v2/MMLU-PRO": 0.5626 + } + }, + { + "id": "Qwen/Qwen2.5-7B", + "name": "Qwen2.5-7B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3374, + "hfopenllm_v2/BBH": 0.5416, + "hfopenllm_v2/MATH Level 5": 0.2508, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4424, + "hfopenllm_v2/MMLU-PRO": 0.4365 + } + }, + { + "id": "Qwen/Qwen2.5-7B-Instruct", + "name": "Qwen2.5-7B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7585, + "hfopenllm_v2/BBH": 0.5394, + "hfopenllm_v2/MATH Level 5": 0.5, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.402, + "hfopenllm_v2/MMLU-PRO": 0.4287 + } + }, + { + "id": "Qwen/Qwen2.5-7B-Instruct-1M", + "name": "Qwen2.5-7B-Instruct-1M", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7448, + "hfopenllm_v2/BBH": 0.5404, + "hfopenllm_v2/MATH Level 5": 0.4335, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4087, + "hfopenllm_v2/MMLU-PRO": 0.3505 + } + }, + { + "id": "Qwen/Qwen2.5-Coder-14B", + "name": "Qwen2.5-Coder-14B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3473, + "hfopenllm_v2/BBH": 0.5865, + "hfopenllm_v2/MATH Level 5": 0.2251, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3874, + "hfopenllm_v2/MMLU-PRO": 0.4521 + } + }, + { + "id": "Qwen/Qwen2.5-Coder-14B-Instruct", + "name": "Qwen2.5-Coder-14B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6908, + "hfopenllm_v2/BBH": 0.614, + "hfopenllm_v2/MATH Level 5": 0.3248, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.3915, + "hfopenllm_v2/MMLU-PRO": 0.3939 + } + }, + { + "id": "Qwen/Qwen2.5-Coder-32B", + "name": "Qwen2.5-Coder-32B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4363, + "hfopenllm_v2/BBH": 0.6404, + "hfopenllm_v2/MATH Level 5": 0.3089, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.4528, + "hfopenllm_v2/MMLU-PRO": 0.5303 + } + }, + { + "id": "Qwen/Qwen2.5-Coder-32B-Instruct", + "name": "Qwen2.5-Coder-32B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7265, + "hfopenllm_v2/BBH": 0.6625, + "hfopenllm_v2/MATH Level 5": 0.4955, + "hfopenllm_v2/GPQA": 0.349, + "hfopenllm_v2/MUSR": 0.4386, + "hfopenllm_v2/MMLU-PRO": 0.4413 + } + }, + { + "id": "Qwen/Qwen2.5-Coder-7B", + "name": "Qwen2.5-Coder-7B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3446, + "hfopenllm_v2/BBH": 0.4856, + "hfopenllm_v2/MATH Level 5": 0.1918, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3449, + "hfopenllm_v2/MMLU-PRO": 0.3679 + } + }, + { + "id": "Qwen/Qwen2.5-Coder-7B-Instruct", + "name": "Qwen2.5-Coder-7B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6101, + "hfopenllm_v2/BBH": 0.5008, + "hfopenllm_v2/MATH Level 5": 0.3716, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4073, + "hfopenllm_v2/MMLU-PRO": 0.3352 + } + }, + { + "id": "Qwen/Qwen2.5-Math-1.5B-Instruct", + "name": "Qwen2.5-Math-1.5B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1856, + "hfopenllm_v2/BBH": 0.3752, + "hfopenllm_v2/MATH Level 5": 0.2628, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3685, + "hfopenllm_v2/MMLU-PRO": 0.1801 + } + }, + { + "id": "Qwen/Qwen2.5-Math-72B-Instruct", + "name": "Qwen2.5-Math-72B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4003, + "hfopenllm_v2/BBH": 0.6452, + "hfopenllm_v2/MATH Level 5": 0.6239, + "hfopenllm_v2/GPQA": 0.3314, + "hfopenllm_v2/MUSR": 0.4473, + "hfopenllm_v2/MMLU-PRO": 0.4812 + } + }, + { + "id": "Qwen/Qwen2.5-Math-7B", + "name": "Qwen2.5-Math-7B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.246, + "hfopenllm_v2/BBH": 0.4455, + "hfopenllm_v2/MATH Level 5": 0.3051, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.3781, + "hfopenllm_v2/MMLU-PRO": 0.2718 + } + }, + { + "id": "Qwen/Qwen2.5-Math-7B-Instruct", + "name": "Qwen2.5-Math-7B-Instruct", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2636, + "hfopenllm_v2/BBH": 0.4388, + "hfopenllm_v2/MATH Level 5": 0.5808, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3647, + "hfopenllm_v2/MMLU-PRO": 0.282 + } + }, + { + "id": "Qwen/WorldPM-72B", + "name": "Qwen/WorldPM-72B", + "developer": "Qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6333, + "reward-bench/Factuality": 0.7074, + "reward-bench/Precise IF": 0.3125, + "reward-bench/Math": 0.6557, + "reward-bench/Safety": 0.8533, + "reward-bench/Focus": 0.9172, + "reward-bench/Ties": 0.3535 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/R-I-S-E.json b/data/developers/R-I-S-E.json new file mode 100644 index 0000000000000000000000000000000000000000..05d6bd52d17e381ac81a1c18a0dba0bf121dcda2 --- /dev/null +++ b/data/developers/R-I-S-E.json @@ -0,0 +1,31 @@ +{ + "developer": "R-I-S-E", + "models": [ + { + "id": "R-I-S-E/RISE-Judge-Qwen2.5-32B", + "name": "R-I-S-E/RISE-Judge-Qwen2.5-32B", + "developer": "R-I-S-E", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9266, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.8333, + "reward-bench/Safety": 0.9189, + "reward-bench/Reasoning": 0.9877 + } + }, + { + "id": "R-I-S-E/RISE-Judge-Qwen2.5-7B", + "name": "R-I-S-E/RISE-Judge-Qwen2.5-7B", + "developer": "R-I-S-E", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8819, + "reward-bench/Chat": 0.9218, + "reward-bench/Chat Hard": 0.7654, + "reward-bench/Safety": 0.8797, + "reward-bench/Reasoning": 0.9608 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/RDson.json b/data/developers/RDson.json new file mode 100644 index 0000000000000000000000000000000000000000..36f1051289198450d90b1967f86a100056cc2aa6 --- /dev/null +++ b/data/developers/RDson.json @@ -0,0 +1,19 @@ +{ + "developer": "RDson", + "models": [ + { + "id": "RDson/WomboCombo-R1-Coder-14B-Preview", + "name": "WomboCombo-R1-Coder-14B-Preview", + "developer": "RDson", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6286, + "hfopenllm_v2/BBH": 0.6392, + "hfopenllm_v2/MATH Level 5": 0.5989, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4844, + "hfopenllm_v2/MMLU-PRO": 0.5168 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/RESMPDEV.json b/data/developers/RESMPDEV.json new file mode 100644 index 0000000000000000000000000000000000000000..272e604c50494381f3fdb725c611a8e0bb67c4fb --- /dev/null +++ b/data/developers/RESMPDEV.json @@ -0,0 +1,33 @@ +{ + "developer": "RESMPDEV", + "models": [ + { + "id": "RESMPDEV/EVA-Qwen2.5-1.5B-FRFR", + "name": "EVA-Qwen2.5-1.5B-FRFR", + "developer": "RESMPDEV", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3082, + "hfopenllm_v2/BBH": 0.3932, + "hfopenllm_v2/MATH Level 5": 0.1027, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3539, + "hfopenllm_v2/MMLU-PRO": 0.277 + } + }, + { + "id": "RESMPDEV/Qwen2-Wukong-0.5B", + "name": "Qwen2-Wukong-0.5B", + "developer": "RESMPDEV", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1854, + "hfopenllm_v2/BBH": 0.3085, + "hfopenllm_v2/MATH Level 5": 0.0015, + "hfopenllm_v2/GPQA": 0.2366, + "hfopenllm_v2/MUSR": 0.3525, + "hfopenllm_v2/MMLU-PRO": 0.1327 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/RLHFlow.json b/data/developers/RLHFlow.json new file mode 100644 index 0000000000000000000000000000000000000000..77943db5b2dda8795d48935b98b37c80216a7ae1 --- /dev/null +++ b/data/developers/RLHFlow.json @@ -0,0 +1,78 @@ +{ + "developer": "RLHFlow", + "models": [ + { + "id": "RLHFlow/ArmoRM-Llama3-8B-v0.1", + "name": "ArmoRM-Llama3-8B-v0.1", + "developer": "RLHFlow", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1897, + "hfopenllm_v2/BBH": 0.2876, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.3948, + "hfopenllm_v2/MMLU-PRO": 0.1078, + "reward-bench/Score": 0.886, + "reward-bench/Factuality": 0.6568, + "reward-bench/Precise IF": 0.4188, + "reward-bench/Math": 0.6612, + "reward-bench/Safety": 0.9054, + "reward-bench/Focus": 0.7657, + "reward-bench/Ties": 0.6629, + "reward-bench/Chat": 0.9693, + "reward-bench/Chat Hard": 0.7675, + "reward-bench/Reasoning": 0.9735, + "reward-bench/Prior Sets (0.5 weight)": 0.7429 + } + }, + { + "id": "RLHFlow/LLaMA3-iterative-DPO-final", + "name": "LLaMA3-iterative-DPO-final", + "developer": "RLHFlow", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.534, + "hfopenllm_v2/BBH": 0.5058, + "hfopenllm_v2/MATH Level 5": 0.0884, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3673, + "hfopenllm_v2/MMLU-PRO": 0.3257, + "reward-bench/Score": 0.6783, + "reward-bench/Chat": 0.838, + "reward-bench/Chat Hard": 0.5921, + "reward-bench/Safety": 0.7865, + "reward-bench/Reasoning": 0.6161, + "reward-bench/Prior Sets (0.5 weight)": 0.4392 + } + }, + { + "id": "RLHFlow/RewardModel-Mistral-7B-for-DPA-v1", + "name": "RLHFlow/RewardModel-Mistral-7B-for-DPA-v1", + "developer": "RLHFlow", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6633, + "reward-bench/Chat": 0.8799, + "reward-bench/Chat Hard": 0.4978, + "reward-bench/Safety": 0.7068, + "reward-bench/Reasoning": 0.5971, + "reward-bench/Prior Sets (0.5 weight)": 0.6068 + } + }, + { + "id": "RLHFlow/pair-preference-model-LLaMA3-8B", + "name": "RLHFlow/pair-preference-model-LLaMA3-8B", + "developer": "RLHFlow", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8575, + "reward-bench/Chat": 0.9832, + "reward-bench/Chat Hard": 0.6579, + "reward-bench/Safety": 0.8973, + "reward-bench/Reasoning": 0.9473, + "reward-bench/Prior Sets (0.5 weight)": 0.7458 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/RWKV.json b/data/developers/RWKV.json new file mode 100644 index 0000000000000000000000000000000000000000..11a2d290f58b9571e1d2dfdc73fb201adaee5f40 --- /dev/null +++ b/data/developers/RWKV.json @@ -0,0 +1,19 @@ +{ + "developer": "RWKV", + "models": [ + { + "id": "RWKV/rwkv-raven-14b", + "name": "rwkv-raven-14b", + "developer": "RWKV", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0768, + "hfopenllm_v2/BBH": 0.3307, + "hfopenllm_v2/MATH Level 5": 0.0045, + "hfopenllm_v2/GPQA": 0.229, + "hfopenllm_v2/MUSR": 0.3951, + "hfopenllm_v2/MMLU-PRO": 0.115 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Rakuten.json b/data/developers/Rakuten.json new file mode 100644 index 0000000000000000000000000000000000000000..877ffb5c05690ede4d79770a9f14e03e050382e5 --- /dev/null +++ b/data/developers/Rakuten.json @@ -0,0 +1,47 @@ +{ + "developer": "Rakuten", + "models": [ + { + "id": "Rakuten/RakutenAI-2.0-mini-instruct", + "name": "RakutenAI-2.0-mini-instruct", + "developer": "Rakuten", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6794, + "hfopenllm_v2/BBH": 0.2867, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3249, + "hfopenllm_v2/MMLU-PRO": 0.1118 + } + }, + { + "id": "Rakuten/RakutenAI-7B", + "name": "RakutenAI-7B", + "developer": "Rakuten", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1556, + "hfopenllm_v2/BBH": 0.4315, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3738, + "hfopenllm_v2/MMLU-PRO": 0.2877 + } + }, + { + "id": "Rakuten/RakutenAI-7B-chat", + "name": "RakutenAI-7B-chat", + "developer": "Rakuten", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2686, + "hfopenllm_v2/BBH": 0.4316, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.379, + "hfopenllm_v2/MMLU-PRO": 0.2798 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Ray2333.json b/data/developers/Ray2333.json new file mode 100644 index 0000000000000000000000000000000000000000..f2e4e9124a06fb0ae9b827267581f8562c5b9fe1 --- /dev/null +++ b/data/developers/Ray2333.json @@ -0,0 +1,160 @@ +{ + "developer": "Ray2333", + "models": [ + { + "id": "Ray2333/GRM-Gemma-2B-rewardmodel-ft", + "name": "Ray2333/GRM-Gemma-2B-rewardmodel-ft", + "developer": "Ray2333", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8447, + "reward-bench/Chat": 0.8939, + "reward-bench/Chat Hard": 0.7522, + "reward-bench/Safety": 0.8446, + "reward-bench/Reasoning": 0.8881 + } + }, + { + "id": "Ray2333/GRM-Gemma-2B-sftreg", + "name": "Ray2333/GRM-Gemma-2B-sftreg", + "developer": "Ray2333", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7451, + "reward-bench/Chat": 0.9553, + "reward-bench/Chat Hard": 0.4868, + "reward-bench/Safety": 0.7932, + "reward-bench/Reasoning": 0.7684, + "reward-bench/Prior Sets (0.5 weight)": 0.6983 + } + }, + { + "id": "Ray2333/GRM-Llama3-8B-rewardmodel-ft", + "name": "Ray2333/GRM-Llama3-8B-rewardmodel-ft", + "developer": "Ray2333", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6766, + "reward-bench/Chat": 0.9553, + "reward-bench/Chat Hard": 0.8618, + "reward-bench/Safety": 0.9222, + "reward-bench/Reasoning": 0.9362, + "reward-bench/Factuality": 0.6274, + "reward-bench/Precise IF": 0.35, + "reward-bench/Math": 0.5847, + "reward-bench/Focus": 0.8929, + "reward-bench/Ties": 0.6824 + } + }, + { + "id": "Ray2333/GRM-gemma2-2B-rewardmodel-ft", + "name": "Ray2333/GRM-gemma2-2B-rewardmodel-ft", + "developer": "Ray2333", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8839, + "reward-bench/Factuality": 0.5305, + "reward-bench/Precise IF": 0.3125, + "reward-bench/Math": 0.5902, + "reward-bench/Safety": 0.9216, + "reward-bench/Focus": 0.7455, + "reward-bench/Ties": 0.4788, + "reward-bench/Chat": 0.9302, + "reward-bench/Chat Hard": 0.7719, + "reward-bench/Reasoning": 0.912 + } + }, + { + "id": "Ray2333/GRM-llama3-8B-distill", + "name": "Ray2333/GRM-llama3-8B-distill", + "developer": "Ray2333", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.589, + "reward-bench/Chat": 0.9832, + "reward-bench/Chat Hard": 0.6842, + "reward-bench/Safety": 0.7222, + "reward-bench/Reasoning": 0.9133, + "reward-bench/Prior Sets (0.5 weight)": 0.7209, + "reward-bench/Factuality": 0.5874, + "reward-bench/Precise IF": 0.3875, + "reward-bench/Math": 0.5902, + "reward-bench/Focus": 0.6727, + "reward-bench/Ties": 0.5743 + } + }, + { + "id": "Ray2333/GRM-llama3-8B-sftreg", + "name": "Ray2333/GRM-llama3-8B-sftreg", + "developer": "Ray2333", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6089, + "reward-bench/Chat": 0.986, + "reward-bench/Chat Hard": 0.6776, + "reward-bench/Safety": 0.7867, + "reward-bench/Reasoning": 0.9229, + "reward-bench/Prior Sets (0.5 weight)": 0.7309, + "reward-bench/Factuality": 0.6189, + "reward-bench/Precise IF": 0.3875, + "reward-bench/Math": 0.5792, + "reward-bench/Focus": 0.6828, + "reward-bench/Ties": 0.5981 + } + }, + { + "id": "Ray2333/GRM-llama3.2-3B-rewardmodel-ft", + "name": "Ray2333/GRM-llama3.2-3B-rewardmodel-ft", + "developer": "Ray2333", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9092, + "reward-bench/Chat": 0.9162, + "reward-bench/Chat Hard": 0.8487, + "reward-bench/Safety": 0.927, + "reward-bench/Reasoning": 0.945 + } + }, + { + "id": "Ray2333/Gemma-2B-rewardmodel-baseline", + "name": "Ray2333/Gemma-2B-rewardmodel-baseline", + "developer": "Ray2333", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.729, + "reward-bench/Chat": 0.9413, + "reward-bench/Chat Hard": 0.4693, + "reward-bench/Safety": 0.7865, + "reward-bench/Reasoning": 0.7384, + "reward-bench/Prior Sets (0.5 weight)": 0.6897 + } + }, + { + "id": "Ray2333/Gemma-2B-rewardmodel-ft", + "name": "Ray2333/Gemma-2B-rewardmodel-ft", + "developer": "Ray2333", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8048, + "reward-bench/Chat": 0.7793, + "reward-bench/Chat Hard": 0.7478, + "reward-bench/Safety": 0.8527, + "reward-bench/Reasoning": 0.8393 + } + }, + { + "id": "Ray2333/reward-model-Mistral-7B-instruct-Unifie...", + "name": "Ray2333/reward-model-Mistral-7B-instruct-Unifie...", + "developer": "Ray2333", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7661, + "reward-bench/Chat": 0.9777, + "reward-bench/Chat Hard": 0.5066, + "reward-bench/Safety": 0.8527, + "reward-bench/Reasoning": 0.7389, + "reward-bench/Prior Sets (0.5 weight)": 0.7434 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Replete-AI.json b/data/developers/Replete-AI.json new file mode 100644 index 0000000000000000000000000000000000000000..0f038b31b0c0cf28a02b6c25fb4fd9bd374c118c --- /dev/null +++ b/data/developers/Replete-AI.json @@ -0,0 +1,131 @@ +{ + "developer": "Replete-AI", + "models": [ + { + "id": "Replete-AI/L3-Pneuma-8B", + "name": "L3-Pneuma-8B", + "developer": "Replete-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2413, + "hfopenllm_v2/BBH": 0.4909, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.4105, + "hfopenllm_v2/MMLU-PRO": 0.3176 + } + }, + { + "id": "Replete-AI/L3.1-Pneuma-8B", + "name": "L3.1-Pneuma-8B", + "developer": "Replete-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7076, + "hfopenllm_v2/BBH": 0.505, + "hfopenllm_v2/MATH Level 5": 0.2198, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.3871, + "hfopenllm_v2/MMLU-PRO": 0.3691 + } + }, + { + "id": "Replete-AI/Llama3-8B-Instruct-Replete-Adapted", + "name": "Llama3-8B-Instruct-Replete-Adapted", + "developer": "Replete-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6915, + "hfopenllm_v2/BBH": 0.487, + "hfopenllm_v2/MATH Level 5": 0.071, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3634, + "hfopenllm_v2/MMLU-PRO": 0.3391 + } + }, + { + "id": "Replete-AI/Replete-Coder-Instruct-8b-Merged", + "name": "Replete-Coder-Instruct-8b-Merged", + "developer": "Replete-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5388, + "hfopenllm_v2/BBH": 0.4462, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.366, + "hfopenllm_v2/MMLU-PRO": 0.1805 + } + }, + { + "id": "Replete-AI/Replete-Coder-Llama3-8B", + "name": "Replete-Coder-Llama3-8B", + "developer": "Replete-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4729, + "hfopenllm_v2/BBH": 0.3271, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3953, + "hfopenllm_v2/MMLU-PRO": 0.1331 + } + }, + { + "id": "Replete-AI/Replete-Coder-Qwen2-1.5b", + "name": "Replete-Coder-Qwen2-1.5b", + "developer": "Replete-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3014, + "hfopenllm_v2/BBH": 0.3475, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.4073, + "hfopenllm_v2/MMLU-PRO": 0.2147 + } + }, + { + "id": "Replete-AI/Replete-LLM-Qwen2-7b", + "name": "Replete-LLM-Qwen2-7b", + "developer": "Replete-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0905, + "hfopenllm_v2/BBH": 0.2985, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3848, + "hfopenllm_v2/MMLU-PRO": 0.1158 + } + }, + { + "id": "Replete-AI/Replete-LLM-Qwen2-7b_Beta-Preview", + "name": "Replete-LLM-Qwen2-7b_Beta-Preview", + "developer": "Replete-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0858, + "hfopenllm_v2/BBH": 0.2929, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.3981, + "hfopenllm_v2/MMLU-PRO": 0.1285 + } + }, + { + "id": "Replete-AI/Replete-LLM-V2-Llama-3.1-8b", + "name": "Replete-LLM-V2-Llama-3.1-8b", + "developer": "Replete-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5515, + "hfopenllm_v2/BBH": 0.5339, + "hfopenllm_v2/MATH Level 5": 0.1405, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4001, + "hfopenllm_v2/MMLU-PRO": 0.3753 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/RezVortex.json b/data/developers/RezVortex.json new file mode 100644 index 0000000000000000000000000000000000000000..33312e29f040c0a280e0d7ba6bda55874dd96a18 --- /dev/null +++ b/data/developers/RezVortex.json @@ -0,0 +1,33 @@ +{ + "developer": "RezVortex", + "models": [ + { + "id": "RezVortex/JAJUKA-WEWILLNEVERFORGETYOU-3B", + "name": "JAJUKA-WEWILLNEVERFORGETYOU-3B", + "developer": "RezVortex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6858, + "hfopenllm_v2/BBH": 0.4619, + "hfopenllm_v2/MATH Level 5": 0.1548, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.363, + "hfopenllm_v2/MMLU-PRO": 0.3143 + } + }, + { + "id": "RezVortex/Jajuka-3b", + "name": "Jajuka-3b", + "developer": "RezVortex", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6925, + "hfopenllm_v2/BBH": 0.4594, + "hfopenllm_v2/MATH Level 5": 0.1594, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3671, + "hfopenllm_v2/MMLU-PRO": 0.3137 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Ro-xe.json b/data/developers/Ro-xe.json new file mode 100644 index 0000000000000000000000000000000000000000..1888889d0865258e380c0aea95a46b43e7022139 --- /dev/null +++ b/data/developers/Ro-xe.json @@ -0,0 +1,61 @@ +{ + "developer": "Ro-xe", + "models": [ + { + "id": "Ro-xe/FMixIA-7B-DARE-0", + "name": "FMixIA-7B-DARE-0", + "developer": "Ro-xe", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3341, + "hfopenllm_v2/BBH": 0.5035, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4545, + "hfopenllm_v2/MMLU-PRO": 0.3016 + } + }, + { + "id": "Ro-xe/FMixIA-7B-SLERP-27", + "name": "FMixIA-7B-SLERP-27", + "developer": "Ro-xe", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3765, + "hfopenllm_v2/BBH": 0.5151, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4412, + "hfopenllm_v2/MMLU-PRO": 0.3008 + } + }, + { + "id": "Ro-xe/FMixIA-7B-TIES-1", + "name": "FMixIA-7B-TIES-1", + "developer": "Ro-xe", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3453, + "hfopenllm_v2/BBH": 0.5092, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4689, + "hfopenllm_v2/MMLU-PRO": 0.2992 + } + }, + { + "id": "Ro-xe/FMixIA-FrankenMerge-9.5B-PT-9", + "name": "FMixIA-FrankenMerge-9.5B-PT-9", + "developer": "Ro-xe", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.194, + "hfopenllm_v2/BBH": 0.5088, + "hfopenllm_v2/MATH Level 5": 0.003, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.417, + "hfopenllm_v2/MMLU-PRO": 0.3657 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Rombo-Org.json b/data/developers/Rombo-Org.json new file mode 100644 index 0000000000000000000000000000000000000000..b79d7a7f39a0dea35feab8529f6276806b3465d6 --- /dev/null +++ b/data/developers/Rombo-Org.json @@ -0,0 +1,19 @@ +{ + "developer": "Rombo-Org", + "models": [ + { + "id": "Rombo-Org/Rombo-LLM-V2.5-Qwen-7b", + "name": "Rombo-LLM-V2.5-Qwen-7b", + "developer": "Rombo-Org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7482, + "hfopenllm_v2/BBH": 0.54, + "hfopenllm_v2/MATH Level 5": 0.5068, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.398, + "hfopenllm_v2/MMLU-PRO": 0.4283 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/RubielLabarta.json b/data/developers/RubielLabarta.json new file mode 100644 index 0000000000000000000000000000000000000000..b07aef707c469238132c77be4ee5e8a60fe2458c --- /dev/null +++ b/data/developers/RubielLabarta.json @@ -0,0 +1,19 @@ +{ + "developer": "RubielLabarta", + "models": [ + { + "id": "RubielLabarta/LogoS-7Bx2-MoE-13B-v0.2", + "name": "LogoS-7Bx2-MoE-13B-v0.2", + "developer": "RubielLabarta", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4379, + "hfopenllm_v2/BBH": 0.5207, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4226, + "hfopenllm_v2/MMLU-PRO": 0.3088 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/SF-Foundation.json b/data/developers/SF-Foundation.json new file mode 100644 index 0000000000000000000000000000000000000000..aad345870c7d1ee261bdf988b97dd013cae7289e --- /dev/null +++ b/data/developers/SF-Foundation.json @@ -0,0 +1,31 @@ +{ + "developer": "SF-Foundation", + "models": [ + { + "id": "SF-Foundation/TextEval-Llama3.1-70B", + "name": "SF-Foundation/TextEval-Llama3.1-70B", + "developer": "SF-Foundation", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9348, + "reward-bench/Chat": 0.9413, + "reward-bench/Chat Hard": 0.9013, + "reward-bench/Safety": 0.9324, + "reward-bench/Reasoning": 0.9641 + } + }, + { + "id": "SF-Foundation/TextEval-OffsetBias-12B", + "name": "SF-Foundation/TextEval-OffsetBias-12B", + "developer": "SF-Foundation", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9105, + "reward-bench/Chat": 0.919, + "reward-bench/Chat Hard": 0.8662, + "reward-bench/Safety": 0.9203, + "reward-bench/Reasoning": 0.9365 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/SaisExperiments.json b/data/developers/SaisExperiments.json new file mode 100644 index 0000000000000000000000000000000000000000..802705b381af64db71ac86a6e6fccc0d765cc9d5 --- /dev/null +++ b/data/developers/SaisExperiments.json @@ -0,0 +1,89 @@ +{ + "developer": "SaisExperiments", + "models": [ + { + "id": "SaisExperiments/Evil-Alpaca-3B-L3.2", + "name": "Evil-Alpaca-3B-L3.2", + "developer": "SaisExperiments", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3251, + "hfopenllm_v2/BBH": 0.4341, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.4198, + "hfopenllm_v2/MMLU-PRO": 0.2621 + } + }, + { + "id": "SaisExperiments/Gemma-2-2B-Opus-Instruct", + "name": "Gemma-2-2B-Opus-Instruct", + "developer": "SaisExperiments", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.475, + "hfopenllm_v2/BBH": 0.4293, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4057, + "hfopenllm_v2/MMLU-PRO": 0.265 + } + }, + { + "id": "SaisExperiments/Gemma-2-2B-Stheno-Filtered", + "name": "Gemma-2-2B-Stheno-Filtered", + "developer": "SaisExperiments", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4197, + "hfopenllm_v2/BBH": 0.4149, + "hfopenllm_v2/MATH Level 5": 0.0461, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.4003, + "hfopenllm_v2/MMLU-PRO": 0.263 + } + }, + { + "id": "SaisExperiments/Not-So-Small-Alpaca-24B", + "name": "Not-So-Small-Alpaca-24B", + "developer": "SaisExperiments", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6244, + "hfopenllm_v2/BBH": 0.5339, + "hfopenllm_v2/MATH Level 5": 0.1828, + "hfopenllm_v2/GPQA": 0.3591, + "hfopenllm_v2/MUSR": 0.4282, + "hfopenllm_v2/MMLU-PRO": 0.3694 + } + }, + { + "id": "SaisExperiments/QwOwO-7B-V1", + "name": "QwOwO-7B-V1", + "developer": "SaisExperiments", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4556, + "hfopenllm_v2/BBH": 0.5431, + "hfopenllm_v2/MATH Level 5": 0.386, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3835, + "hfopenllm_v2/MMLU-PRO": 0.4224 + } + }, + { + "id": "SaisExperiments/RightSheep-Llama3.2-3B", + "name": "RightSheep-Llama3.2-3B", + "developer": "SaisExperiments", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4156, + "hfopenllm_v2/BBH": 0.4241, + "hfopenllm_v2/MATH Level 5": 0.0808, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.3767, + "hfopenllm_v2/MMLU-PRO": 0.254 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Sakalti.json b/data/developers/Sakalti.json new file mode 100644 index 0000000000000000000000000000000000000000..a073d172055492a731f0a4f4c9d818bcedb6d6ee --- /dev/null +++ b/data/developers/Sakalti.json @@ -0,0 +1,929 @@ +{ + "developer": "Sakalti", + "models": [ + { + "id": "Sakalti/Anemoi-3B", + "name": "Anemoi-3B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3804, + "hfopenllm_v2/BBH": 0.4922, + "hfopenllm_v2/MATH Level 5": 0.1775, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4371, + "hfopenllm_v2/MMLU-PRO": 0.3766 + } + }, + { + "id": "Sakalti/Euphrates-14B", + "name": "Euphrates-14B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2647, + "hfopenllm_v2/BBH": 0.6138, + "hfopenllm_v2/MATH Level 5": 0.3051, + "hfopenllm_v2/GPQA": 0.3935, + "hfopenllm_v2/MUSR": 0.4516, + "hfopenllm_v2/MMLU-PRO": 0.5255 + } + }, + { + "id": "Sakalti/Llama3.2-3B-Uranus-1", + "name": "Llama3.2-3B-Uranus-1", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5335, + "hfopenllm_v2/BBH": 0.4437, + "hfopenllm_v2/MATH Level 5": 0.1495, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.3669, + "hfopenllm_v2/MMLU-PRO": 0.3094 + } + }, + { + "id": "Sakalti/Magro-7B-v1.1", + "name": "Magro-7B-v1.1", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1204, + "hfopenllm_v2/BBH": 0.4179, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4433, + "hfopenllm_v2/MMLU-PRO": 0.2764 + } + }, + { + "id": "Sakalti/Neptuno-3B", + "name": "Neptuno-3B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4296, + "hfopenllm_v2/BBH": 0.4834, + "hfopenllm_v2/MATH Level 5": 0.2553, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4002, + "hfopenllm_v2/MMLU-PRO": 0.3773 + } + }, + { + "id": "Sakalti/Neptuno-Alpha", + "name": "Neptuno-Alpha", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.378, + "hfopenllm_v2/BBH": 0.4925, + "hfopenllm_v2/MATH Level 5": 0.1835, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4371, + "hfopenllm_v2/MMLU-PRO": 0.3767 + } + }, + { + "id": "Sakalti/Oxyge1-33B", + "name": "Oxyge1-33B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4548, + "hfopenllm_v2/BBH": 0.7033, + "hfopenllm_v2/MATH Level 5": 0.4962, + "hfopenllm_v2/GPQA": 0.3826, + "hfopenllm_v2/MUSR": 0.5008, + "hfopenllm_v2/MMLU-PRO": 0.5909 + } + }, + { + "id": "Sakalti/Phi3.5-Comets-3.8B", + "name": "Phi3.5-Comets-3.8B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2094, + "hfopenllm_v2/BBH": 0.3335, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.3764, + "hfopenllm_v2/MMLU-PRO": 0.1153 + } + }, + { + "id": "Sakalti/Qwen2.5-1B-Instruct", + "name": "Qwen2.5-1B-Instruct", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1751, + "hfopenllm_v2/BBH": 0.3027, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3369, + "hfopenllm_v2/MMLU-PRO": 0.1213 + } + }, + { + "id": "Sakalti/QwenTest-7", + "name": "QwenTest-7", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1672, + "hfopenllm_v2/BBH": 0.3063, + "hfopenllm_v2/MATH Level 5": 0.0038, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3422, + "hfopenllm_v2/MMLU-PRO": 0.1212 + } + }, + { + "id": "Sakalti/SJT-0.5B", + "name": "SJT-0.5B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2425, + "hfopenllm_v2/BBH": 0.3306, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3196, + "hfopenllm_v2/MMLU-PRO": 0.1891 + } + }, + { + "id": "Sakalti/SJT-1.5B-Alpha", + "name": "SJT-1.5B-Alpha", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3449, + "hfopenllm_v2/BBH": 0.4241, + "hfopenllm_v2/MATH Level 5": 0.0997, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4226, + "hfopenllm_v2/MMLU-PRO": 0.2961 + } + }, + { + "id": "Sakalti/SJT-1.5B-Alpha-1.1", + "name": "SJT-1.5B-Alpha-1.1", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3439, + "hfopenllm_v2/BBH": 0.4243, + "hfopenllm_v2/MATH Level 5": 0.0959, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4239, + "hfopenllm_v2/MMLU-PRO": 0.2966 + } + }, + { + "id": "Sakalti/SJT-1.7B", + "name": "SJT-1.7B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1776, + "hfopenllm_v2/BBH": 0.2934, + "hfopenllm_v2/MATH Level 5": 0.0015, + "hfopenllm_v2/GPQA": 0.2416, + "hfopenllm_v2/MUSR": 0.3964, + "hfopenllm_v2/MMLU-PRO": 0.1133 + } + }, + { + "id": "Sakalti/SJT-14B", + "name": "SJT-14B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5494, + "hfopenllm_v2/BBH": 0.6536, + "hfopenllm_v2/MATH Level 5": 0.3844, + "hfopenllm_v2/GPQA": 0.3867, + "hfopenllm_v2/MUSR": 0.4766, + "hfopenllm_v2/MMLU-PRO": 0.5381 + } + }, + { + "id": "Sakalti/SJT-2.4B", + "name": "SJT-2.4B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2804, + "hfopenllm_v2/BBH": 0.349, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3699, + "hfopenllm_v2/MMLU-PRO": 0.1858 + } + }, + { + "id": "Sakalti/SJT-24B-Alpha", + "name": "SJT-24B-Alpha", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3206, + "hfopenllm_v2/BBH": 0.6081, + "hfopenllm_v2/MATH Level 5": 0.253, + "hfopenllm_v2/GPQA": 0.3809, + "hfopenllm_v2/MUSR": 0.4595, + "hfopenllm_v2/MMLU-PRO": 0.4857 + } + }, + { + "id": "Sakalti/SJT-2B", + "name": "SJT-2B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2151, + "hfopenllm_v2/BBH": 0.2936, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2416, + "hfopenllm_v2/MUSR": 0.3564, + "hfopenllm_v2/MMLU-PRO": 0.1187 + } + }, + { + "id": "Sakalti/SJT-2B-V1.1", + "name": "SJT-2B-V1.1", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3977, + "hfopenllm_v2/BBH": 0.3984, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.4299, + "hfopenllm_v2/MMLU-PRO": 0.2124 + } + }, + { + "id": "Sakalti/SJT-3.7B", + "name": "SJT-3.7B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1078, + "hfopenllm_v2/BBH": 0.3393, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3617, + "hfopenllm_v2/MMLU-PRO": 0.1505 + } + }, + { + "id": "Sakalti/SJT-4B", + "name": "SJT-4B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4077, + "hfopenllm_v2/BBH": 0.4886, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.478, + "hfopenllm_v2/MMLU-PRO": 0.3281 + } + }, + { + "id": "Sakalti/SJT-7.5B", + "name": "SJT-7.5B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4223, + "hfopenllm_v2/BBH": 0.5367, + "hfopenllm_v2/MATH Level 5": 0.2168, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.4399, + "hfopenllm_v2/MMLU-PRO": 0.3951 + } + }, + { + "id": "Sakalti/SJT-7B-V1.1", + "name": "SJT-7B-V1.1", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4703, + "hfopenllm_v2/BBH": 0.5419, + "hfopenllm_v2/MATH Level 5": 0.2432, + "hfopenllm_v2/GPQA": 0.3339, + "hfopenllm_v2/MUSR": 0.4411, + "hfopenllm_v2/MMLU-PRO": 0.4412 + } + }, + { + "id": "Sakalti/SJT-7B-V1.1-Multilingal", + "name": "SJT-7B-V1.1-Multilingal", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1949, + "hfopenllm_v2/BBH": 0.292, + "hfopenllm_v2/MATH Level 5": 0.0045, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3621, + "hfopenllm_v2/MMLU-PRO": 0.1137 + } + }, + { + "id": "Sakalti/SJT-8B", + "name": "SJT-8B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6535, + "hfopenllm_v2/BBH": 0.5282, + "hfopenllm_v2/MATH Level 5": 0.2538, + "hfopenllm_v2/GPQA": 0.3297, + "hfopenllm_v2/MUSR": 0.408, + "hfopenllm_v2/MMLU-PRO": 0.4266 + } + }, + { + "id": "Sakalti/SJT-8B-V1.1", + "name": "SJT-8B-V1.1", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4621, + "hfopenllm_v2/BBH": 0.5121, + "hfopenllm_v2/MATH Level 5": 0.2069, + "hfopenllm_v2/GPQA": 0.3364, + "hfopenllm_v2/MUSR": 0.4266, + "hfopenllm_v2/MMLU-PRO": 0.4231 + } + }, + { + "id": "Sakalti/SJT-900M", + "name": "SJT-900M", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.241, + "hfopenllm_v2/BBH": 0.3169, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3595, + "hfopenllm_v2/MMLU-PRO": 0.1142 + } + }, + { + "id": "Sakalti/SJT-Moe2x7.5B", + "name": "SJT-Moe2x7.5B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4117, + "hfopenllm_v2/BBH": 0.5371, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.4399, + "hfopenllm_v2/MMLU-PRO": 0.3954 + } + }, + { + "id": "Sakalti/SJTPass-2", + "name": "SJTPass-2", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.24, + "hfopenllm_v2/BBH": 0.3302, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3222, + "hfopenllm_v2/MMLU-PRO": 0.1902 + } + }, + { + "id": "Sakalti/SJTPass-4", + "name": "SJTPass-4", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1913, + "hfopenllm_v2/BBH": 0.2964, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3898, + "hfopenllm_v2/MMLU-PRO": 0.1083 + } + }, + { + "id": "Sakalti/SJTPass-5", + "name": "SJTPass-5", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2425, + "hfopenllm_v2/BBH": 0.3103, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3794, + "hfopenllm_v2/MMLU-PRO": 0.1327 + } + }, + { + "id": "Sakalti/Saba-Passthrough-2", + "name": "Saba-Passthrough-2", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1691, + "hfopenllm_v2/BBH": 0.3672, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3844, + "hfopenllm_v2/MMLU-PRO": 0.2077 + } + }, + { + "id": "Sakalti/Saba1-1.8B", + "name": "Saba1-1.8B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3333, + "hfopenllm_v2/BBH": 0.4147, + "hfopenllm_v2/MATH Level 5": 0.1541, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4239, + "hfopenllm_v2/MMLU-PRO": 0.2926 + } + }, + { + "id": "Sakalti/Saba1-7B", + "name": "Saba1-7B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4585, + "hfopenllm_v2/BBH": 0.5489, + "hfopenllm_v2/MATH Level 5": 0.3663, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4793, + "hfopenllm_v2/MMLU-PRO": 0.4376 + } + }, + { + "id": "Sakalti/Saba1.5-1.5B", + "name": "Saba1.5-1.5B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3333, + "hfopenllm_v2/BBH": 0.4147, + "hfopenllm_v2/MATH Level 5": 0.1541, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4239, + "hfopenllm_v2/MMLU-PRO": 0.2926 + } + }, + { + "id": "Sakalti/Saba1.5-Pro-3B", + "name": "Saba1.5-Pro-3B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2386, + "hfopenllm_v2/BBH": 0.3623, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.4405, + "hfopenllm_v2/MMLU-PRO": 0.1958 + } + }, + { + "id": "Sakalti/Saba2-14B-Preview", + "name": "Saba2-14B-Preview", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4722, + "hfopenllm_v2/BBH": 0.6496, + "hfopenllm_v2/MATH Level 5": 0.3127, + "hfopenllm_v2/GPQA": 0.3826, + "hfopenllm_v2/MUSR": 0.4781, + "hfopenllm_v2/MMLU-PRO": 0.5384 + } + }, + { + "id": "Sakalti/Saba2-3B", + "name": "Saba2-3B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2865, + "hfopenllm_v2/BBH": 0.2801, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3927, + "hfopenllm_v2/MMLU-PRO": 0.121 + } + }, + { + "id": "Sakalti/Sailor-japanese", + "name": "Sailor-japanese", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1605, + "hfopenllm_v2/BBH": 0.2913, + "hfopenllm_v2/MATH Level 5": 0.003, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3912, + "hfopenllm_v2/MMLU-PRO": 0.1164 + } + }, + { + "id": "Sakalti/Saka-1.5B", + "name": "Saka-1.5B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2726, + "hfopenllm_v2/BBH": 0.3988, + "hfopenllm_v2/MATH Level 5": 0.0801, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3739, + "hfopenllm_v2/MMLU-PRO": 0.2415 + } + }, + { + "id": "Sakalti/Saka-14B", + "name": "Saka-14B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7174, + "hfopenllm_v2/BBH": 0.6497, + "hfopenllm_v2/MATH Level 5": 0.4094, + "hfopenllm_v2/GPQA": 0.396, + "hfopenllm_v2/MUSR": 0.4886, + "hfopenllm_v2/MMLU-PRO": 0.5396 + } + }, + { + "id": "Sakalti/Saka-24B", + "name": "Saka-24B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3819, + "hfopenllm_v2/BBH": 0.6072, + "hfopenllm_v2/MATH Level 5": 0.1805, + "hfopenllm_v2/GPQA": 0.3423, + "hfopenllm_v2/MUSR": 0.4541, + "hfopenllm_v2/MMLU-PRO": 0.4766 + } + }, + { + "id": "Sakalti/Saka-7.2B", + "name": "Saka-7.2B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1545, + "hfopenllm_v2/BBH": 0.2945, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2391, + "hfopenllm_v2/MUSR": 0.3711, + "hfopenllm_v2/MMLU-PRO": 0.116 + } + }, + { + "id": "Sakalti/Saka-7.6B", + "name": "Saka-7.6B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4524, + "hfopenllm_v2/BBH": 0.5655, + "hfopenllm_v2/MATH Level 5": 0.3255, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4489, + "hfopenllm_v2/MMLU-PRO": 0.454 + } + }, + { + "id": "Sakalti/SakaMoe-3x1.6B-Instruct", + "name": "SakaMoe-3x1.6B-Instruct", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2371, + "hfopenllm_v2/BBH": 0.3282, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3342, + "hfopenllm_v2/MMLU-PRO": 0.1882 + } + }, + { + "id": "Sakalti/SakalFusion-7B-Alpha", + "name": "SakalFusion-7B-Alpha", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.529, + "hfopenllm_v2/BBH": 0.5591, + "hfopenllm_v2/MATH Level 5": 0.3844, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.4581, + "hfopenllm_v2/MMLU-PRO": 0.4474 + } + }, + { + "id": "Sakalti/SakalFusion-7B-Beta", + "name": "SakalFusion-7B-Beta", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1809, + "hfopenllm_v2/BBH": 0.2881, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2433, + "hfopenllm_v2/MUSR": 0.3872, + "hfopenllm_v2/MMLU-PRO": 0.109 + } + }, + { + "id": "Sakalti/Tara-3.8B-v1.1", + "name": "Tara-3.8B-v1.1", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4062, + "hfopenllm_v2/BBH": 0.4886, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.478, + "hfopenllm_v2/MMLU-PRO": 0.3281 + } + }, + { + "id": "Sakalti/light-1.1-3B", + "name": "light-1.1-3B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2735, + "hfopenllm_v2/BBH": 0.2803, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3901, + "hfopenllm_v2/MMLU-PRO": 0.1209 + } + }, + { + "id": "Sakalti/light-3B", + "name": "light-3B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5337, + "hfopenllm_v2/BBH": 0.4831, + "hfopenllm_v2/MATH Level 5": 0.2591, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4015, + "hfopenllm_v2/MMLU-PRO": 0.3775 + } + }, + { + "id": "Sakalti/light-3b-beta", + "name": "light-3b-beta", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5485, + "hfopenllm_v2/BBH": 0.4815, + "hfopenllm_v2/MATH Level 5": 0.2772, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4015, + "hfopenllm_v2/MMLU-PRO": 0.3758 + } + }, + { + "id": "Sakalti/light-7b-beta", + "name": "light-7b-beta", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6234, + "hfopenllm_v2/BBH": 0.5548, + "hfopenllm_v2/MATH Level 5": 0.3769, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4291, + "hfopenllm_v2/MMLU-PRO": 0.4456 + } + }, + { + "id": "Sakalti/llama-3-yanyuedao-8b-instruct", + "name": "llama-3-yanyuedao-8b-instruct", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2186, + "hfopenllm_v2/BBH": 0.435, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4199, + "hfopenllm_v2/MMLU-PRO": 0.2911 + } + }, + { + "id": "Sakalti/magro-7B", + "name": "magro-7B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1344, + "hfopenllm_v2/BBH": 0.4186, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.446, + "hfopenllm_v2/MMLU-PRO": 0.2765 + } + }, + { + "id": "Sakalti/mergekit-01", + "name": "mergekit-01", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6234, + "hfopenllm_v2/BBH": 0.5548, + "hfopenllm_v2/MATH Level 5": 0.3769, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4291, + "hfopenllm_v2/MMLU-PRO": 0.4456 + } + }, + { + "id": "Sakalti/mergekit-della_linear-vmeykci", + "name": "mergekit-della_linear-vmeykci", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1126, + "hfopenllm_v2/BBH": 0.2816, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3897, + "hfopenllm_v2/MMLU-PRO": 0.1089 + } + }, + { + "id": "Sakalti/model-3", + "name": "model-3", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6264, + "hfopenllm_v2/BBH": 0.5542, + "hfopenllm_v2/MATH Level 5": 0.3708, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4264, + "hfopenllm_v2/MMLU-PRO": 0.4455 + } + }, + { + "id": "Sakalti/qwen2.5-2.3B", + "name": "qwen2.5-2.3B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1288, + "hfopenllm_v2/BBH": 0.2849, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3857, + "hfopenllm_v2/MMLU-PRO": 0.1173 + } + }, + { + "id": "Sakalti/tara-3.8B", + "name": "tara-3.8B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4077, + "hfopenllm_v2/BBH": 0.4886, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.478, + "hfopenllm_v2/MMLU-PRO": 0.3281 + } + }, + { + "id": "Sakalti/ultiima-14B", + "name": "ultiima-14B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5701, + "hfopenllm_v2/BBH": 0.6491, + "hfopenllm_v2/MATH Level 5": 0.4698, + "hfopenllm_v2/GPQA": 0.3742, + "hfopenllm_v2/MUSR": 0.4718, + "hfopenllm_v2/MMLU-PRO": 0.5381 + } + }, + { + "id": "Sakalti/ultiima-14B-v0.2", + "name": "ultiima-14B-v0.2", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.707, + "hfopenllm_v2/BBH": 0.6472, + "hfopenllm_v2/MATH Level 5": 0.3995, + "hfopenllm_v2/GPQA": 0.3826, + "hfopenllm_v2/MUSR": 0.4794, + "hfopenllm_v2/MMLU-PRO": 0.5387 + } + }, + { + "id": "Sakalti/ultiima-14B-v0.3", + "name": "ultiima-14B-v0.3", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.704, + "hfopenllm_v2/BBH": 0.6398, + "hfopenllm_v2/MATH Level 5": 0.3965, + "hfopenllm_v2/GPQA": 0.3767, + "hfopenllm_v2/MUSR": 0.4754, + "hfopenllm_v2/MMLU-PRO": 0.5337 + } + }, + { + "id": "Sakalti/ultiima-14B-v0.4", + "name": "ultiima-14B-v0.4", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3008, + "hfopenllm_v2/BBH": 0.642, + "hfopenllm_v2/MATH Level 5": 0.3535, + "hfopenllm_v2/GPQA": 0.396, + "hfopenllm_v2/MUSR": 0.4886, + "hfopenllm_v2/MMLU-PRO": 0.5278 + } + }, + { + "id": "Sakalti/ultiima-32B", + "name": "ultiima-32B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6854, + "hfopenllm_v2/BBH": 0.7037, + "hfopenllm_v2/MATH Level 5": 0.4962, + "hfopenllm_v2/GPQA": 0.3809, + "hfopenllm_v2/MUSR": 0.4995, + "hfopenllm_v2/MMLU-PRO": 0.591 + } + }, + { + "id": "Sakalti/ultiima-72B", + "name": "ultiima-72B", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.714, + "hfopenllm_v2/BBH": 0.7218, + "hfopenllm_v2/MATH Level 5": 0.5355, + "hfopenllm_v2/GPQA": 0.4144, + "hfopenllm_v2/MUSR": 0.4652, + "hfopenllm_v2/MMLU-PRO": 0.5906 + } + }, + { + "id": "Sakalti/ultiima-72B-v1.5", + "name": "ultiima-72B-v1.5", + "developer": "Sakalti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.655, + "hfopenllm_v2/BBH": 0.7392, + "hfopenllm_v2/MATH Level 5": 0.4396, + "hfopenllm_v2/GPQA": 0.4136, + "hfopenllm_v2/MUSR": 0.4691, + "hfopenllm_v2/MMLU-PRO": 0.6054 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Salesforce.json b/data/developers/Salesforce.json new file mode 100644 index 0000000000000000000000000000000000000000..7c5628f61141fdd948b57260a7591b6de20cb58a --- /dev/null +++ b/data/developers/Salesforce.json @@ -0,0 +1,58 @@ +{ + "developer": "Salesforce", + "models": [ + { + "id": "Salesforce/LLaMA-3-8B-SFR-Iterative-DPO-R", + "name": "LLaMA-3-8B-SFR-Iterative-DPO-R", + "developer": "Salesforce", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3816, + "hfopenllm_v2/BBH": 0.5012, + "hfopenllm_v2/MATH Level 5": 0.0914, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3633, + "hfopenllm_v2/MMLU-PRO": 0.3172 + } + }, + { + "id": "Salesforce/SFR-LLaMa-3.1-70B-Judge-r", + "name": "Salesforce/SFR-LLaMa-3.1-70B-Judge-r", + "developer": "Salesforce", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9272, + "reward-bench/Chat": 0.9693, + "reward-bench/Chat Hard": 0.8476, + "reward-bench/Safety": 0.9162, + "reward-bench/Reasoning": 0.9757 + } + }, + { + "id": "Salesforce/SFR-LLaMa-3.1-8B-Judge-r", + "name": "Salesforce/SFR-LLaMa-3.1-8B-Judge-r", + "developer": "Salesforce", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8865, + "reward-bench/Chat": 0.9553, + "reward-bench/Chat Hard": 0.7774, + "reward-bench/Safety": 0.8622, + "reward-bench/Reasoning": 0.9513 + } + }, + { + "id": "Salesforce/SFR-nemo-12B-Judge-r", + "name": "Salesforce/SFR-nemo-12B-Judge-r", + "developer": "Salesforce", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9027, + "reward-bench/Chat": 0.9721, + "reward-bench/Chat Hard": 0.8224, + "reward-bench/Safety": 0.8649, + "reward-bench/Reasoning": 0.9513 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/SanjiWatsuki.json b/data/developers/SanjiWatsuki.json new file mode 100644 index 0000000000000000000000000000000000000000..9886f491af2c048e9849271814dfcde82a97dc56 --- /dev/null +++ b/data/developers/SanjiWatsuki.json @@ -0,0 +1,33 @@ +{ + "developer": "SanjiWatsuki", + "models": [ + { + "id": "SanjiWatsuki/Kunoichi-DPO-v2-7B", + "name": "Kunoichi-DPO-v2-7B", + "developer": "SanjiWatsuki", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5431, + "hfopenllm_v2/BBH": 0.4416, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4188, + "hfopenllm_v2/MMLU-PRO": 0.3107 + } + }, + { + "id": "SanjiWatsuki/Silicon-Maid-7B", + "name": "Silicon-Maid-7B", + "developer": "SanjiWatsuki", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5368, + "hfopenllm_v2/BBH": 0.4128, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4188, + "hfopenllm_v2/MMLU-PRO": 0.3083 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Sao10K.json b/data/developers/Sao10K.json new file mode 100644 index 0000000000000000000000000000000000000000..6569058a39b468de833ea3ab9419e073af211e5a --- /dev/null +++ b/data/developers/Sao10K.json @@ -0,0 +1,117 @@ +{ + "developer": "Sao10K", + "models": [ + { + "id": "Sao10K/70B-L3.3-Cirrus-x1", + "name": "70B-L3.3-Cirrus-x1", + "developer": "Sao10K", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6681, + "hfopenllm_v2/BBH": 0.7029, + "hfopenllm_v2/MATH Level 5": 0.3739, + "hfopenllm_v2/GPQA": 0.4497, + "hfopenllm_v2/MUSR": 0.4842, + "hfopenllm_v2/MMLU-PRO": 0.5378 + } + }, + { + "id": "Sao10K/Fimbulvetr-11B-v2", + "name": "Fimbulvetr-11B-v2", + "developer": "Sao10K", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.51, + "hfopenllm_v2/BBH": 0.4544, + "hfopenllm_v2/MATH Level 5": 0.068, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4354, + "hfopenllm_v2/MMLU-PRO": 0.3301 + } + }, + { + "id": "Sao10K/L3-70B-Euryale-v2.1", + "name": "L3-70B-Euryale-v2.1", + "developer": "Sao10K", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7281, + "hfopenllm_v2/BBH": 0.6503, + "hfopenllm_v2/MATH Level 5": 0.2243, + "hfopenllm_v2/GPQA": 0.3314, + "hfopenllm_v2/MUSR": 0.4196, + "hfopenllm_v2/MMLU-PRO": 0.5096 + } + }, + { + "id": "Sao10K/L3-8B-Lunaris-v1", + "name": "L3-8B-Lunaris-v1", + "developer": "Sao10K", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6895, + "hfopenllm_v2/BBH": 0.5235, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.3727, + "hfopenllm_v2/MMLU-PRO": 0.3787 + } + }, + { + "id": "Sao10K/L3-8B-Niitama-v1", + "name": "L3-8B-Niitama-v1", + "developer": "Sao10K", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6791, + "hfopenllm_v2/BBH": 0.5303, + "hfopenllm_v2/MATH Level 5": 0.0982, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.3807, + "hfopenllm_v2/MMLU-PRO": 0.3701 + } + }, + { + "id": "Sao10K/L3-8B-Stheno-v3.2", + "name": "L3-8B-Stheno-v3.2", + "developer": "Sao10K", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6873, + "hfopenllm_v2/BBH": 0.5228, + "hfopenllm_v2/MATH Level 5": 0.0929, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.3794, + "hfopenllm_v2/MMLU-PRO": 0.3768 + } + }, + { + "id": "Sao10K/L3-8B-Stheno-v3.3-32K", + "name": "L3-8B-Stheno-v3.3-32K", + "developer": "Sao10K", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4604, + "hfopenllm_v2/BBH": 0.3844, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3725, + "hfopenllm_v2/MMLU-PRO": 0.1896 + } + }, + { + "id": "Sao10K/MN-12B-Lyra-v3", + "name": "MN-12B-Lyra-v3", + "developer": "Sao10K", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4486, + "hfopenllm_v2/BBH": 0.4804, + "hfopenllm_v2/MATH Level 5": 0.0937, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4019, + "hfopenllm_v2/MMLU-PRO": 0.3249 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Saxo.json b/data/developers/Saxo.json new file mode 100644 index 0000000000000000000000000000000000000000..ebdad02ee1b1d5395feaa13ee41b292cd1c12286 --- /dev/null +++ b/data/developers/Saxo.json @@ -0,0 +1,159 @@ +{ + "developer": "Saxo", + "models": [ + { + "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V1-32B", + "name": "Linkbricks-Horizon-AI-Avengers-V1-32B", + "developer": "Saxo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7972, + "hfopenllm_v2/BBH": 0.7001, + "hfopenllm_v2/MATH Level 5": 0.6027, + "hfopenllm_v2/GPQA": 0.3624, + "hfopenllm_v2/MUSR": 0.4538, + "hfopenllm_v2/MMLU-PRO": 0.5793 + } + }, + { + "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V2-32B", + "name": "Linkbricks-Horizon-AI-Avengers-V2-32B", + "developer": "Saxo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7956, + "hfopenllm_v2/BBH": 0.7023, + "hfopenllm_v2/MATH Level 5": 0.5665, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.4166, + "hfopenllm_v2/MMLU-PRO": 0.572 + } + }, + { + "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V3-32B", + "name": "Linkbricks-Horizon-AI-Avengers-V3-32B", + "developer": "Saxo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8249, + "hfopenllm_v2/BBH": 0.6913, + "hfopenllm_v2/MATH Level 5": 0.6178, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4275, + "hfopenllm_v2/MMLU-PRO": 0.5664 + } + }, + { + "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V4-32B", + "name": "Linkbricks-Horizon-AI-Avengers-V4-32B", + "developer": "Saxo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7631, + "hfopenllm_v2/BBH": 0.692, + "hfopenllm_v2/MATH Level 5": 0.5363, + "hfopenllm_v2/GPQA": 0.3616, + "hfopenllm_v2/MUSR": 0.4643, + "hfopenllm_v2/MMLU-PRO": 0.5752 + } + }, + { + "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V5-32B", + "name": "Linkbricks-Horizon-AI-Avengers-V5-32B", + "developer": "Saxo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7516, + "hfopenllm_v2/BBH": 0.6929, + "hfopenllm_v2/MATH Level 5": 0.5461, + "hfopenllm_v2/GPQA": 0.3557, + "hfopenllm_v2/MUSR": 0.4709, + "hfopenllm_v2/MMLU-PRO": 0.5762 + } + }, + { + "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V6-32B", + "name": "Linkbricks-Horizon-AI-Avengers-V6-32B", + "developer": "Saxo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8209, + "hfopenllm_v2/BBH": 0.689, + "hfopenllm_v2/MATH Level 5": 0.6224, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4274, + "hfopenllm_v2/MMLU-PRO": 0.5672 + } + }, + { + "id": "Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V2-27B", + "name": "Linkbricks-Horizon-AI-Korean-Avengers-V2-27B", + "developer": "Saxo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8146, + "hfopenllm_v2/BBH": 0.6463, + "hfopenllm_v2/MATH Level 5": 0.2802, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4139, + "hfopenllm_v2/MMLU-PRO": 0.4599 + } + }, + { + "id": "Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V3-27B", + "name": "Linkbricks-Horizon-AI-Korean-Avengers-V3-27B", + "developer": "Saxo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8142, + "hfopenllm_v2/BBH": 0.6404, + "hfopenllm_v2/MATH Level 5": 0.2492, + "hfopenllm_v2/GPQA": 0.3591, + "hfopenllm_v2/MUSR": 0.4467, + "hfopenllm_v2/MMLU-PRO": 0.4524 + } + }, + { + "id": "Saxo/Linkbricks-Horizon-AI-Korean-Superb-22B", + "name": "Linkbricks-Horizon-AI-Korean-Superb-22B", + "developer": "Saxo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6767, + "hfopenllm_v2/BBH": 0.5626, + "hfopenllm_v2/MATH Level 5": 0.2372, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.3908, + "hfopenllm_v2/MMLU-PRO": 0.3871 + } + }, + { + "id": "Saxo/Linkbricks-Horizon-AI-Korean-Superb-27B", + "name": "Linkbricks-Horizon-AI-Korean-Superb-27B", + "developer": "Saxo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7768, + "hfopenllm_v2/BBH": 0.6518, + "hfopenllm_v2/MATH Level 5": 0.2719, + "hfopenllm_v2/GPQA": 0.3599, + "hfopenllm_v2/MUSR": 0.4791, + "hfopenllm_v2/MMLU-PRO": 0.4647 + } + }, + { + "id": "Saxo/Linkbricks-Horizon-AI-Superb-27B", + "name": "Linkbricks-Horizon-AI-Superb-27B", + "developer": "Saxo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7302, + "hfopenllm_v2/BBH": 0.6186, + "hfopenllm_v2/MATH Level 5": 0.2221, + "hfopenllm_v2/GPQA": 0.3574, + "hfopenllm_v2/MUSR": 0.465, + "hfopenllm_v2/MMLU-PRO": 0.406 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Schrieffer.json b/data/developers/Schrieffer.json new file mode 100644 index 0000000000000000000000000000000000000000..3126aaf3fafa22fcfcbb0e55bb08dbfc8e04baeb --- /dev/null +++ b/data/developers/Schrieffer.json @@ -0,0 +1,20 @@ +{ + "developer": "Schrieffer", + "models": [ + { + "id": "Schrieffer/Llama-SARM-4B", + "name": "Schrieffer/Llama-SARM-4B", + "developer": "Schrieffer", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7379, + "reward-bench/Factuality": 0.6874, + "reward-bench/Precise IF": 0.4281, + "reward-bench/Math": 0.6448, + "reward-bench/Safety": 0.9178, + "reward-bench/Focus": 0.9556, + "reward-bench/Ties": 0.7939 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/SeaLLMs.json b/data/developers/SeaLLMs.json new file mode 100644 index 0000000000000000000000000000000000000000..30ec679931075fdd65d50e21ff501eb142faf18c --- /dev/null +++ b/data/developers/SeaLLMs.json @@ -0,0 +1,47 @@ +{ + "developer": "SeaLLMs", + "models": [ + { + "id": "SeaLLMs/SeaLLM-7B-v2", + "name": "SeaLLM-7B-v2", + "developer": "SeaLLMs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3671, + "hfopenllm_v2/BBH": 0.4902, + "hfopenllm_v2/MATH Level 5": 0.0853, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.407, + "hfopenllm_v2/MMLU-PRO": 0.3083 + } + }, + { + "id": "SeaLLMs/SeaLLM-7B-v2.5", + "name": "SeaLLM-7B-v2.5", + "developer": "SeaLLMs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4522, + "hfopenllm_v2/BBH": 0.498, + "hfopenllm_v2/MATH Level 5": 0.1088, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.4203, + "hfopenllm_v2/MMLU-PRO": 0.3203 + } + }, + { + "id": "SeaLLMs/SeaLLMs-v3-7B-Chat", + "name": "SeaLLMs-v3-7B-Chat", + "developer": "SeaLLMs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4377, + "hfopenllm_v2/BBH": 0.5266, + "hfopenllm_v2/MATH Level 5": 0.1858, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4174, + "hfopenllm_v2/MMLU-PRO": 0.3895 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/SenseLLM.json b/data/developers/SenseLLM.json new file mode 100644 index 0000000000000000000000000000000000000000..c213f7326b642f4cff4035bb78e11bc136f1b002 --- /dev/null +++ b/data/developers/SenseLLM.json @@ -0,0 +1,33 @@ +{ + "developer": "SenseLLM", + "models": [ + { + "id": "SenseLLM/ReflectionCoder-CL-34B", + "name": "ReflectionCoder-CL-34B", + "developer": "SenseLLM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4008, + "hfopenllm_v2/BBH": 0.3953, + "hfopenllm_v2/MATH Level 5": 0.0332, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.4155, + "hfopenllm_v2/MMLU-PRO": 0.1424 + } + }, + { + "id": "SenseLLM/ReflectionCoder-DS-33B", + "name": "ReflectionCoder-DS-33B", + "developer": "SenseLLM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3787, + "hfopenllm_v2/BBH": 0.3449, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3343, + "hfopenllm_v2/MMLU-PRO": 0.1202 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/SentientAGI.json b/data/developers/SentientAGI.json new file mode 100644 index 0000000000000000000000000000000000000000..9e3052f693fdbc8b0cc8ba88c6f61ca9b99f0ccd --- /dev/null +++ b/data/developers/SentientAGI.json @@ -0,0 +1,33 @@ +{ + "developer": "SentientAGI", + "models": [ + { + "id": "SentientAGI/Dobby-Mini-Leashed-Llama-3.1-8B", + "name": "Dobby-Mini-Leashed-Llama-3.1-8B", + "developer": "SentientAGI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7847, + "hfopenllm_v2/BBH": 0.5138, + "hfopenllm_v2/MATH Level 5": 0.1858, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4254, + "hfopenllm_v2/MMLU-PRO": 0.3694 + } + }, + { + "id": "SentientAGI/Dobby-Mini-Unhinged-Llama-3.1-8B", + "name": "Dobby-Mini-Unhinged-Llama-3.1-8B", + "developer": "SentientAGI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7457, + "hfopenllm_v2/BBH": 0.5142, + "hfopenllm_v2/MATH Level 5": 0.1563, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4013, + "hfopenllm_v2/MMLU-PRO": 0.3585 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/SeppeV.json b/data/developers/SeppeV.json new file mode 100644 index 0000000000000000000000000000000000000000..a36959d2d4ad27dc7b193839473927f74de9f14b --- /dev/null +++ b/data/developers/SeppeV.json @@ -0,0 +1,19 @@ +{ + "developer": "SeppeV", + "models": [ + { + "id": "SeppeV/SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo", + "name": "SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo", + "developer": "SeppeV", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0955, + "hfopenllm_v2/BBH": 0.3073, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.4032, + "hfopenllm_v2/MMLU-PRO": 0.1161 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Sharathhebbar24.json b/data/developers/Sharathhebbar24.json new file mode 100644 index 0000000000000000000000000000000000000000..b0739e1b331d5c06ace519d1073750dbd3400865 --- /dev/null +++ b/data/developers/Sharathhebbar24.json @@ -0,0 +1,33 @@ +{ + "developer": "Sharathhebbar24", + "models": [ + { + "id": "Sharathhebbar24/SSH_355M", + "name": "SSH_355M", + "developer": "Sharathhebbar24", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1424, + "hfopenllm_v2/BBH": 0.3099, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.4178, + "hfopenllm_v2/MMLU-PRO": 0.1176 + } + }, + { + "id": "Sharathhebbar24/chat_gpt2_dpo", + "name": "chat_gpt2_dpo", + "developer": "Sharathhebbar24", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0986, + "hfopenllm_v2/BBH": 0.2902, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3818, + "hfopenllm_v2/MMLU-PRO": 0.1142 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ShikaiChen.json b/data/developers/ShikaiChen.json new file mode 100644 index 0000000000000000000000000000000000000000..6162cba6dec7eaed27af88272ffb98342af2522b --- /dev/null +++ b/data/developers/ShikaiChen.json @@ -0,0 +1,23 @@ +{ + "developer": "ShikaiChen", + "models": [ + { + "id": "ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1", + "name": "ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1", + "developer": "ShikaiChen", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9499, + "reward-bench/Factuality": 0.7558, + "reward-bench/Precise IF": 0.35, + "reward-bench/Math": 0.6448, + "reward-bench/Safety": 0.9378, + "reward-bench/Focus": 0.9131, + "reward-bench/Ties": 0.7633, + "reward-bench/Chat": 0.9637, + "reward-bench/Chat Hard": 0.9079, + "reward-bench/Reasoning": 0.9903 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Shreyash2010.json b/data/developers/Shreyash2010.json new file mode 100644 index 0000000000000000000000000000000000000000..0b37aeb427e3399bb567b08cc303f7d6e7c4f332 --- /dev/null +++ b/data/developers/Shreyash2010.json @@ -0,0 +1,19 @@ +{ + "developer": "Shreyash2010", + "models": [ + { + "id": "Shreyash2010/Uma-4x4B-Instruct-v0.1", + "name": "Uma-4x4B-Instruct-v0.1", + "developer": "Shreyash2010", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5517, + "hfopenllm_v2/BBH": 0.5512, + "hfopenllm_v2/MATH Level 5": 0.1775, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4441, + "hfopenllm_v2/MMLU-PRO": 0.387 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Sicarius-Prototyping.json b/data/developers/Sicarius-Prototyping.json new file mode 100644 index 0000000000000000000000000000000000000000..c61b296d37ad9ca148604e6c10a345f9e24dd027 --- /dev/null +++ b/data/developers/Sicarius-Prototyping.json @@ -0,0 +1,47 @@ +{ + "developer": "Sicarius-Prototyping", + "models": [ + { + "id": "Sicarius-Prototyping/Brainy_LLAMA", + "name": "Brainy_LLAMA", + "developer": "Sicarius-Prototyping", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5204, + "hfopenllm_v2/BBH": 0.5117, + "hfopenllm_v2/MATH Level 5": 0.1337, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4143, + "hfopenllm_v2/MMLU-PRO": 0.3849 + } + }, + { + "id": "Sicarius-Prototyping/Micropenis_1B", + "name": "Micropenis_1B", + "developer": "Sicarius-Prototyping", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3461, + "hfopenllm_v2/BBH": 0.3372, + "hfopenllm_v2/MATH Level 5": 0.0461, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3325, + "hfopenllm_v2/MMLU-PRO": 0.186 + } + }, + { + "id": "Sicarius-Prototyping/bacon_and_food", + "name": "bacon_and_food", + "developer": "Sicarius-Prototyping", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.586, + "hfopenllm_v2/BBH": 0.4725, + "hfopenllm_v2/MATH Level 5": 0.0982, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.3884, + "hfopenllm_v2/MMLU-PRO": 0.3263 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/SicariusSicariiStuff.json b/data/developers/SicariusSicariiStuff.json new file mode 100644 index 0000000000000000000000000000000000000000..8b47da480157ebac5222a015de4726330a4530d3 --- /dev/null +++ b/data/developers/SicariusSicariiStuff.json @@ -0,0 +1,271 @@ +{ + "developer": "SicariusSicariiStuff", + "models": [ + { + "id": "SicariusSicariiStuff/2B-ad", + "name": "2B-ad", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4379, + "hfopenllm_v2/BBH": 0.4092, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4015, + "hfopenllm_v2/MMLU-PRO": 0.2662 + } + }, + { + "id": "SicariusSicariiStuff/2B_or_not_2B", + "name": "2B_or_not_2B", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2062, + "hfopenllm_v2/BBH": 0.3416, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2475, + "hfopenllm_v2/MUSR": 0.3791, + "hfopenllm_v2/MMLU-PRO": 0.1399 + } + }, + { + "id": "SicariusSicariiStuff/Dusk_Rainbow", + "name": "Dusk_Rainbow", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3588, + "hfopenllm_v2/BBH": 0.4772, + "hfopenllm_v2/MATH Level 5": 0.0748, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4025, + "hfopenllm_v2/MMLU-PRO": 0.3443 + } + }, + { + "id": "SicariusSicariiStuff/Eximius_Persona_5B", + "name": "Eximius_Persona_5B", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.656, + "hfopenllm_v2/BBH": 0.4512, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3818, + "hfopenllm_v2/MMLU-PRO": 0.314 + } + }, + { + "id": "SicariusSicariiStuff/Impish_LLAMA_3B", + "name": "Impish_LLAMA_3B", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.463, + "hfopenllm_v2/BBH": 0.4091, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3673, + "hfopenllm_v2/MMLU-PRO": 0.2941 + } + }, + { + "id": "SicariusSicariiStuff/Impish_Mind_8B", + "name": "Impish_Mind_8B", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3179, + "hfopenllm_v2/BBH": 0.4674, + "hfopenllm_v2/MATH Level 5": 0.105, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.407, + "hfopenllm_v2/MMLU-PRO": 0.3309 + } + }, + { + "id": "SicariusSicariiStuff/Impish_QWEN_14B-1M", + "name": "Impish_QWEN_14B-1M", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7868, + "hfopenllm_v2/BBH": 0.6283, + "hfopenllm_v2/MATH Level 5": 0.3965, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4615, + "hfopenllm_v2/MMLU-PRO": 0.5044 + } + }, + { + "id": "SicariusSicariiStuff/Impish_QWEN_7B-1M", + "name": "Impish_QWEN_7B-1M", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6382, + "hfopenllm_v2/BBH": 0.5372, + "hfopenllm_v2/MATH Level 5": 0.3089, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4074, + "hfopenllm_v2/MMLU-PRO": 0.4265 + } + }, + { + "id": "SicariusSicariiStuff/LLAMA-3_8B_Unaligned_BETA", + "name": "LLAMA-3_8B_Unaligned_BETA", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3713, + "hfopenllm_v2/BBH": 0.4717, + "hfopenllm_v2/MATH Level 5": 0.0838, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4119, + "hfopenllm_v2/MMLU-PRO": 0.3465 + } + }, + { + "id": "SicariusSicariiStuff/Phi-Line_14B", + "name": "Phi-Line_14B", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6496, + "hfopenllm_v2/BBH": 0.6154, + "hfopenllm_v2/MATH Level 5": 0.386, + "hfopenllm_v2/GPQA": 0.3532, + "hfopenllm_v2/MUSR": 0.4479, + "hfopenllm_v2/MMLU-PRO": 0.5454 + } + }, + { + "id": "SicariusSicariiStuff/Phi-lthy4", + "name": "Phi-lthy4", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7679, + "hfopenllm_v2/BBH": 0.5879, + "hfopenllm_v2/MATH Level 5": 0.1367, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4083, + "hfopenllm_v2/MMLU-PRO": 0.4333 + } + }, + { + "id": "SicariusSicariiStuff/Qwen2.5-14B_Uncencored", + "name": "Qwen2.5-14B_Uncencored", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3158, + "hfopenllm_v2/BBH": 0.6309, + "hfopenllm_v2/MATH Level 5": 0.318, + "hfopenllm_v2/GPQA": 0.3817, + "hfopenllm_v2/MUSR": 0.4517, + "hfopenllm_v2/MMLU-PRO": 0.5266 + } + }, + { + "id": "SicariusSicariiStuff/Qwen2.5-14B_Uncensored", + "name": "Qwen2.5-14B_Uncensored", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3173, + "hfopenllm_v2/BBH": 0.6309, + "hfopenllm_v2/MATH Level 5": 0.318, + "hfopenllm_v2/GPQA": 0.3817, + "hfopenllm_v2/MUSR": 0.4517, + "hfopenllm_v2/MMLU-PRO": 0.5266 + } + }, + { + "id": "SicariusSicariiStuff/Qwen2.5-14B_Uncensored_Instruct", + "name": "Qwen2.5-14B_Uncensored_Instruct", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3789, + "hfopenllm_v2/BBH": 0.5937, + "hfopenllm_v2/MATH Level 5": 0.3285, + "hfopenllm_v2/GPQA": 0.3297, + "hfopenllm_v2/MUSR": 0.3697, + "hfopenllm_v2/MMLU-PRO": 0.5127 + } + }, + { + "id": "SicariusSicariiStuff/Redemption_Wind_24B", + "name": "Redemption_Wind_24B", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2501, + "hfopenllm_v2/BBH": 0.6428, + "hfopenllm_v2/MATH Level 5": 0.1858, + "hfopenllm_v2/GPQA": 0.3834, + "hfopenllm_v2/MUSR": 0.4262, + "hfopenllm_v2/MMLU-PRO": 0.5432 + } + }, + { + "id": "SicariusSicariiStuff/Winged_Imp_8B", + "name": "Winged_Imp_8B", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.743, + "hfopenllm_v2/BBH": 0.512, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4148, + "hfopenllm_v2/MMLU-PRO": 0.3639 + } + }, + { + "id": "SicariusSicariiStuff/Wingless_Imp_8B", + "name": "Wingless_Imp_8B", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.743, + "hfopenllm_v2/BBH": 0.512, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4148, + "hfopenllm_v2/MMLU-PRO": 0.3639 + } + }, + { + "id": "SicariusSicariiStuff/Zion_Alpha", + "name": "Zion_Alpha", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3324, + "hfopenllm_v2/BBH": 0.4932, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4727, + "hfopenllm_v2/MMLU-PRO": 0.3132 + } + }, + { + "id": "SicariusSicariiStuff/dn_ep02", + "name": "dn_ep02", + "developer": "SicariusSicariiStuff", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5064, + "hfopenllm_v2/BBH": 0.5266, + "hfopenllm_v2/MATH Level 5": 0.142, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.4316, + "hfopenllm_v2/MMLU-PRO": 0.3998 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/SkyOrbis.json b/data/developers/SkyOrbis.json new file mode 100644 index 0000000000000000000000000000000000000000..8bfc544d3f0143c039b4e56871c1f9a9d900a33e --- /dev/null +++ b/data/developers/SkyOrbis.json @@ -0,0 +1,173 @@ +{ + "developer": "SkyOrbis", + "models": [ + { + "id": "SkyOrbis/SKY-Ko-Llama3.1-8B-lora", + "name": "SKY-Ko-Llama3.1-8B-lora", + "developer": "SkyOrbis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5058, + "hfopenllm_v2/BBH": 0.5088, + "hfopenllm_v2/MATH Level 5": 0.1548, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.3998, + "hfopenllm_v2/MMLU-PRO": 0.3777 + } + }, + { + "id": "SkyOrbis/SKY-Ko-Llama3.1-8B-lora-epoch1", + "name": "SKY-Ko-Llama3.1-8B-lora-epoch1", + "developer": "SkyOrbis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5058, + "hfopenllm_v2/BBH": 0.5088, + "hfopenllm_v2/MATH Level 5": 0.1548, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.3998, + "hfopenllm_v2/MMLU-PRO": 0.3777 + } + }, + { + "id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch3", + "name": "SKY-Ko-Llama3.2-1B-lora-epoch3", + "developer": "SkyOrbis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3247, + "hfopenllm_v2/BBH": 0.3167, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3382, + "hfopenllm_v2/MMLU-PRO": 0.1279 + } + }, + { + "id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch5", + "name": "SKY-Ko-Llama3.2-1B-lora-epoch5", + "developer": "SkyOrbis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.436, + "hfopenllm_v2/BBH": 0.3406, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3471, + "hfopenllm_v2/MMLU-PRO": 0.1946 + } + }, + { + "id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch3", + "name": "SKY-Ko-Llama3.2-1B-lora-v2-epoch3", + "developer": "SkyOrbis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.436, + "hfopenllm_v2/BBH": 0.3406, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3471, + "hfopenllm_v2/MMLU-PRO": 0.1946 + } + }, + { + "id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch5", + "name": "SKY-Ko-Llama3.2-1B-lora-v2-epoch5", + "developer": "SkyOrbis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4247, + "hfopenllm_v2/BBH": 0.3397, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3458, + "hfopenllm_v2/MMLU-PRO": 0.1946 + } + }, + { + "id": "SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch1", + "name": "SKY-Ko-Llama3.2-3B-lora-epoch1", + "developer": "SkyOrbis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5331, + "hfopenllm_v2/BBH": 0.44, + "hfopenllm_v2/MATH Level 5": 0.1458, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3522, + "hfopenllm_v2/MMLU-PRO": 0.3004 + } + }, + { + "id": "SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch2", + "name": "SKY-Ko-Llama3.2-3B-lora-epoch2", + "developer": "SkyOrbis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5331, + "hfopenllm_v2/BBH": 0.44, + "hfopenllm_v2/MATH Level 5": 0.1458, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3522, + "hfopenllm_v2/MMLU-PRO": 0.3004 + } + }, + { + "id": "SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch3", + "name": "SKY-Ko-Llama3.2-3B-lora-epoch3", + "developer": "SkyOrbis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5331, + "hfopenllm_v2/BBH": 0.44, + "hfopenllm_v2/MATH Level 5": 0.1458, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3522, + "hfopenllm_v2/MMLU-PRO": 0.3004 + } + }, + { + "id": "SkyOrbis/SKY-Ko-Qwen2.5-3B-Instruct", + "name": "SKY-Ko-Qwen2.5-3B-Instruct", + "developer": "SkyOrbis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3534, + "hfopenllm_v2/BBH": 0.4265, + "hfopenllm_v2/MATH Level 5": 0.0695, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.4024, + "hfopenllm_v2/MMLU-PRO": 0.2812 + } + }, + { + "id": "SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000", + "name": "SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000", + "developer": "SkyOrbis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3819, + "hfopenllm_v2/BBH": 0.5078, + "hfopenllm_v2/MATH Level 5": 0.1866, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.4436, + "hfopenllm_v2/MMLU-PRO": 0.3914 + } + }, + { + "id": "SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000", + "name": "SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000", + "developer": "SkyOrbis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3812, + "hfopenllm_v2/BBH": 0.539, + "hfopenllm_v2/MATH Level 5": 0.21, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4238, + "hfopenllm_v2/MMLU-PRO": 0.4238 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Skywork.json b/data/developers/Skywork.json new file mode 100644 index 0000000000000000000000000000000000000000..99e0c1c396995d96ed17d92a31186ed29c117119 --- /dev/null +++ b/data/developers/Skywork.json @@ -0,0 +1,246 @@ +{ + "developer": "Skywork", + "models": [ + { + "id": "Skywork/Skywork-Critic-Llama-3.1-70B", + "name": "Skywork/Skywork-Critic-Llama-3.1-70B", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9331, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.8794, + "reward-bench/Safety": 0.9311, + "reward-bench/Reasoning": 0.9554 + } + }, + { + "id": "Skywork/Skywork-Critic-Llama-3.1-8B", + "name": "Skywork/Skywork-Critic-Llama-3.1-8B", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8896, + "reward-bench/Chat": 0.9358, + "reward-bench/Chat Hard": 0.8136, + "reward-bench/Safety": 0.9108, + "reward-bench/Reasoning": 0.898 + } + }, + { + "id": "Skywork/Skywork-Reward-Gemma-2-27B", + "name": "Skywork/Skywork-Reward-Gemma-2-27B", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.938, + "reward-bench/Factuality": 0.7368, + "reward-bench/Precise IF": 0.4031, + "reward-bench/Math": 0.7049, + "reward-bench/Safety": 0.9189, + "reward-bench/Focus": 0.9323, + "reward-bench/Ties": 0.8261, + "reward-bench/Chat": 0.9581, + "reward-bench/Chat Hard": 0.9145, + "reward-bench/Reasoning": 0.9606 + } + }, + { + "id": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2", + "name": "Skywork-Reward-Gemma-2-27B-v0.2", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7807, + "hfopenllm_v2/BBH": 0.636, + "hfopenllm_v2/MATH Level 5": 0.2273, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.4231, + "hfopenllm_v2/MMLU-PRO": 0.4103, + "reward-bench/Score": 0.9426, + "reward-bench/Factuality": 0.7674, + "reward-bench/Precise IF": 0.375, + "reward-bench/Math": 0.6721, + "reward-bench/Safety": 0.9297, + "reward-bench/Focus": 0.9172, + "reward-bench/Ties": 0.8182, + "reward-bench/Chat": 0.9609, + "reward-bench/Chat Hard": 0.8991, + "reward-bench/Reasoning": 0.9807 + } + }, + { + "id": "Skywork/Skywork-Reward-Llama-3.1-8B", + "name": "Skywork/Skywork-Reward-Llama-3.1-8B", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9252, + "reward-bench/Factuality": 0.6989, + "reward-bench/Precise IF": 0.425, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.9081, + "reward-bench/Focus": 0.9616, + "reward-bench/Ties": 0.741, + "reward-bench/Chat": 0.9581, + "reward-bench/Chat Hard": 0.8728, + "reward-bench/Reasoning": 0.962 + } + }, + { + "id": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", + "name": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9313, + "reward-bench/Factuality": 0.6968, + "reward-bench/Precise IF": 0.4062, + "reward-bench/Math": 0.6011, + "reward-bench/Safety": 0.927, + "reward-bench/Focus": 0.9414, + "reward-bench/Ties": 0.7169, + "reward-bench/Chat": 0.9469, + "reward-bench/Chat Hard": 0.8838, + "reward-bench/Reasoning": 0.9675 + } + }, + { + "id": "Skywork/Skywork-Reward-V2-Llama-3.1-8B", + "name": "Skywork/Skywork-Reward-V2-Llama-3.1-8B", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8413, + "reward-bench/Factuality": 0.8463, + "reward-bench/Precise IF": 0.6625, + "reward-bench/Math": 0.776, + "reward-bench/Safety": 0.9667, + "reward-bench/Focus": 0.9838, + "reward-bench/Ties": 0.8124 + } + }, + { + "id": "Skywork/Skywork-Reward-V2-Llama-3.2-1B", + "name": "Skywork/Skywork-Reward-V2-Llama-3.2-1B", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6438, + "reward-bench/Factuality": 0.6084, + "reward-bench/Precise IF": 0.4562, + "reward-bench/Math": 0.6011, + "reward-bench/Safety": 0.8733, + "reward-bench/Focus": 0.8929, + "reward-bench/Ties": 0.4306 + } + }, + { + "id": "Skywork/Skywork-Reward-V2-Llama-3.2-3B", + "name": "Skywork/Skywork-Reward-V2-Llama-3.2-3B", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7466, + "reward-bench/Factuality": 0.7621, + "reward-bench/Precise IF": 0.4562, + "reward-bench/Math": 0.694, + "reward-bench/Safety": 0.9311, + "reward-bench/Focus": 0.9596, + "reward-bench/Ties": 0.6768 + } + }, + { + "id": "Skywork/Skywork-Reward-V2-Qwen3-0.6B", + "name": "Skywork/Skywork-Reward-V2-Qwen3-0.6B", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6125, + "reward-bench/Factuality": 0.58, + "reward-bench/Precise IF": 0.4, + "reward-bench/Math": 0.7158, + "reward-bench/Safety": 0.8444, + "reward-bench/Focus": 0.7949, + "reward-bench/Ties": 0.3397 + } + }, + { + "id": "Skywork/Skywork-Reward-V2-Qwen3-1.7B", + "name": "Skywork/Skywork-Reward-V2-Qwen3-1.7B", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6818, + "reward-bench/Factuality": 0.6568, + "reward-bench/Precise IF": 0.4437, + "reward-bench/Math": 0.7268, + "reward-bench/Safety": 0.8911, + "reward-bench/Focus": 0.8848, + "reward-bench/Ties": 0.4872 + } + }, + { + "id": "Skywork/Skywork-Reward-V2-Qwen3-4B", + "name": "Skywork/Skywork-Reward-V2-Qwen3-4B", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7551, + "reward-bench/Factuality": 0.7737, + "reward-bench/Precise IF": 0.4625, + "reward-bench/Math": 0.7322, + "reward-bench/Safety": 0.9222, + "reward-bench/Focus": 0.9657, + "reward-bench/Ties": 0.6743 + } + }, + { + "id": "Skywork/Skywork-Reward-V2-Qwen3-8B", + "name": "Skywork/Skywork-Reward-V2-Qwen3-8B", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7837, + "reward-bench/Factuality": 0.7989, + "reward-bench/Precise IF": 0.5, + "reward-bench/Math": 0.7705, + "reward-bench/Safety": 0.94, + "reward-bench/Focus": 0.9636, + "reward-bench/Ties": 0.7294 + } + }, + { + "id": "Skywork/Skywork-VL-Reward-7B", + "name": "Skywork/Skywork-VL-Reward-7B", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9007, + "reward-bench/Factuality": 0.6063, + "reward-bench/Precise IF": 0.35, + "reward-bench/Math": 0.6339, + "reward-bench/Safety": 0.9108, + "reward-bench/Focus": 0.8909, + "reward-bench/Ties": 0.7586, + "reward-bench/Chat": 0.8994, + "reward-bench/Chat Hard": 0.875, + "reward-bench/Reasoning": 0.9176 + } + }, + { + "id": "Skywork/Skywork-o1-Open-Llama-3.1-8B", + "name": "Skywork-o1-Open-Llama-3.1-8B", + "developer": "Skywork", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3518, + "hfopenllm_v2/BBH": 0.4516, + "hfopenllm_v2/MATH Level 5": 0.5211, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3156, + "hfopenllm_v2/MMLU-PRO": 0.203 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Solshine.json b/data/developers/Solshine.json new file mode 100644 index 0000000000000000000000000000000000000000..e830a92839463f45e681eaf15d27461bf7e55475 --- /dev/null +++ b/data/developers/Solshine.json @@ -0,0 +1,33 @@ +{ + "developer": "Solshine", + "models": [ + { + "id": "Solshine/Brimful-merged-replete", + "name": "Brimful-merged-replete", + "developer": "Solshine", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1761, + "hfopenllm_v2/BBH": 0.2883, + "hfopenllm_v2/MATH Level 5": 0.003, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3421, + "hfopenllm_v2/MMLU-PRO": 0.1085 + } + }, + { + "id": "Solshine/Llama-3-1-big-thoughtful-passthrough-merge-2", + "name": "Llama-3-1-big-thoughtful-passthrough-merge-2", + "developer": "Solshine", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2547, + "hfopenllm_v2/BBH": 0.3209, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3889, + "hfopenllm_v2/MMLU-PRO": 0.1185 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Sorawiz.json b/data/developers/Sorawiz.json new file mode 100644 index 0000000000000000000000000000000000000000..2650c3b4fd620a7263bae026629f54caad0a4d31 --- /dev/null +++ b/data/developers/Sorawiz.json @@ -0,0 +1,33 @@ +{ + "developer": "Sorawiz", + "models": [ + { + "id": "Sorawiz/Gemma-9B-Base", + "name": "Gemma-9B-Base", + "developer": "Sorawiz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1667, + "hfopenllm_v2/BBH": 0.593, + "hfopenllm_v2/MATH Level 5": 0.0982, + "hfopenllm_v2/GPQA": 0.3398, + "hfopenllm_v2/MUSR": 0.4045, + "hfopenllm_v2/MMLU-PRO": 0.4235 + } + }, + { + "id": "Sorawiz/Gemma-Creative-9B-Base", + "name": "Gemma-Creative-9B-Base", + "developer": "Sorawiz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1515, + "hfopenllm_v2/BBH": 0.5459, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.3297, + "hfopenllm_v2/MUSR": 0.4019, + "hfopenllm_v2/MMLU-PRO": 0.4008 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Sourjayon.json b/data/developers/Sourjayon.json new file mode 100644 index 0000000000000000000000000000000000000000..01fa6b43cb5bdaf67d532ba142857849618af89f --- /dev/null +++ b/data/developers/Sourjayon.json @@ -0,0 +1,33 @@ +{ + "developer": "Sourjayon", + "models": [ + { + "id": "Sourjayon/DeepSeek-R1-8b-Sify", + "name": "DeepSeek-R1-8b-Sify", + "developer": "Sourjayon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3679, + "hfopenllm_v2/BBH": 0.3379, + "hfopenllm_v2/MATH Level 5": 0.2447, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3303, + "hfopenllm_v2/MMLU-PRO": 0.1981 + } + }, + { + "id": "Sourjayon/DeepSeek-R1-ForumNXT", + "name": "DeepSeek-R1-ForumNXT", + "developer": "Sourjayon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2603, + "hfopenllm_v2/BBH": 0.331, + "hfopenllm_v2/MATH Level 5": 0.2576, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3392, + "hfopenllm_v2/MMLU-PRO": 0.1648 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/SpaceYL.json b/data/developers/SpaceYL.json new file mode 100644 index 0000000000000000000000000000000000000000..c9fc888c11d4b241d443803a3cf12907f61998f9 --- /dev/null +++ b/data/developers/SpaceYL.json @@ -0,0 +1,19 @@ +{ + "developer": "SpaceYL", + "models": [ + { + "id": "SpaceYL/ECE_Poirot", + "name": "ECE_Poirot", + "developer": "SpaceYL", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3107, + "hfopenllm_v2/BBH": 0.4262, + "hfopenllm_v2/MATH Level 5": 0.0914, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4026, + "hfopenllm_v2/MMLU-PRO": 0.2883 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Spestly.json b/data/developers/Spestly.json new file mode 100644 index 0000000000000000000000000000000000000000..0b4037b864eb053cd4c0642e5b3b289ce874ab09 --- /dev/null +++ b/data/developers/Spestly.json @@ -0,0 +1,47 @@ +{ + "developer": "Spestly", + "models": [ + { + "id": "Spestly/Athena-1-3B", + "name": "Athena-1-3B", + "developer": "Spestly", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5569, + "hfopenllm_v2/BBH": 0.4702, + "hfopenllm_v2/MATH Level 5": 0.2379, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4362, + "hfopenllm_v2/MMLU-PRO": 0.3519 + } + }, + { + "id": "Spestly/Atlas-Pro-1.5B-Preview", + "name": "Atlas-Pro-1.5B-Preview", + "developer": "Spestly", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.243, + "hfopenllm_v2/BBH": 0.3499, + "hfopenllm_v2/MATH Level 5": 0.3195, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.3354, + "hfopenllm_v2/MMLU-PRO": 0.1925 + } + }, + { + "id": "Spestly/Atlas-Pro-7B-Preview", + "name": "Atlas-Pro-7B-Preview", + "developer": "Spestly", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3154, + "hfopenllm_v2/BBH": 0.4668, + "hfopenllm_v2/MATH Level 5": 0.5083, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.3911, + "hfopenllm_v2/MMLU-PRO": 0.297 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Stark2008.json b/data/developers/Stark2008.json new file mode 100644 index 0000000000000000000000000000000000000000..50633df72c9efad74f05c46342aae6f44173516f --- /dev/null +++ b/data/developers/Stark2008.json @@ -0,0 +1,47 @@ +{ + "developer": "Stark2008", + "models": [ + { + "id": "Stark2008/GutenLaserPi", + "name": "GutenLaserPi", + "developer": "Stark2008", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4227, + "hfopenllm_v2/BBH": 0.5212, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.462, + "hfopenllm_v2/MMLU-PRO": 0.3106 + } + }, + { + "id": "Stark2008/LayleleFlamPi", + "name": "LayleleFlamPi", + "developer": "Stark2008", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4284, + "hfopenllm_v2/BBH": 0.5116, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4608, + "hfopenllm_v2/MMLU-PRO": 0.3093 + } + }, + { + "id": "Stark2008/VisFlamCat", + "name": "VisFlamCat", + "developer": "Stark2008", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4366, + "hfopenllm_v2/BBH": 0.5217, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4463, + "hfopenllm_v2/MMLU-PRO": 0.3144 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Steelskull.json b/data/developers/Steelskull.json new file mode 100644 index 0000000000000000000000000000000000000000..c52479b54b19affd00493eea0310042cc8cea472 --- /dev/null +++ b/data/developers/Steelskull.json @@ -0,0 +1,33 @@ +{ + "developer": "Steelskull", + "models": [ + { + "id": "Steelskull/L3.3-MS-Nevoria-70b", + "name": "L3.3-MS-Nevoria-70b", + "developer": "Steelskull", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6963, + "hfopenllm_v2/BBH": 0.6998, + "hfopenllm_v2/MATH Level 5": 0.3958, + "hfopenllm_v2/GPQA": 0.4706, + "hfopenllm_v2/MUSR": 0.4682, + "hfopenllm_v2/MMLU-PRO": 0.5535 + } + }, + { + "id": "Steelskull/L3.3-Nevoria-R1-70b", + "name": "L3.3-Nevoria-R1-70b", + "developer": "Steelskull", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6024, + "hfopenllm_v2/BBH": 0.6972, + "hfopenllm_v2/MATH Level 5": 0.463, + "hfopenllm_v2/GPQA": 0.469, + "hfopenllm_v2/MUSR": 0.4775, + "hfopenllm_v2/MMLU-PRO": 0.5463 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/StelleX.json b/data/developers/StelleX.json new file mode 100644 index 0000000000000000000000000000000000000000..76954e170183fd4864448255c6d8328f7b2922c3 --- /dev/null +++ b/data/developers/StelleX.json @@ -0,0 +1,33 @@ +{ + "developer": "StelleX", + "models": [ + { + "id": "StelleX/Qwen2.5_Math_7B_Cot", + "name": "Qwen2.5_Math_7B_Cot", + "developer": "StelleX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2143, + "hfopenllm_v2/BBH": 0.4313, + "hfopenllm_v2/MATH Level 5": 0.3263, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3924, + "hfopenllm_v2/MMLU-PRO": 0.281 + } + }, + { + "id": "StelleX/Vorisatex-7B-preview", + "name": "Vorisatex-7B-preview", + "developer": "StelleX", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1515, + "hfopenllm_v2/BBH": 0.3112, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.4192, + "hfopenllm_v2/MMLU-PRO": 0.1166 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/SultanR.json b/data/developers/SultanR.json new file mode 100644 index 0000000000000000000000000000000000000000..66d7f9d1e1a8f04e94773c6cce3dfbe31f53919f --- /dev/null +++ b/data/developers/SultanR.json @@ -0,0 +1,60 @@ +{ + "developer": "SultanR", + "models": [ + { + "id": "SultanR/SmolTulu-1.7b-Instruct", + "name": "SmolTulu-1.7b-Instruct", + "developer": "SultanR", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6541, + "hfopenllm_v2/BBH": 0.3713, + "hfopenllm_v2/MATH Level 5": 0.0793, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.354, + "hfopenllm_v2/MMLU-PRO": 0.171 + } + }, + { + "id": "SultanR/SmolTulu-1.7b-RM", + "name": "SultanR/SmolTulu-1.7b-RM", + "developer": "SultanR", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5094, + "reward-bench/Chat": 0.743, + "reward-bench/Chat Hard": 0.4408, + "reward-bench/Safety": 0.5716, + "reward-bench/Reasoning": 0.2821 + } + }, + { + "id": "SultanR/SmolTulu-1.7b-Reinforced", + "name": "SmolTulu-1.7b-Reinforced", + "developer": "SultanR", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6791, + "hfopenllm_v2/BBH": 0.3552, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3406, + "hfopenllm_v2/MMLU-PRO": 0.1763 + } + }, + { + "id": "SultanR/SmolTulu-1.7b-it-v0", + "name": "SmolTulu-1.7b-it-v0", + "developer": "SultanR", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6541, + "hfopenllm_v2/BBH": 0.3713, + "hfopenllm_v2/MATH Level 5": 0.0793, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.354, + "hfopenllm_v2/MMLU-PRO": 0.171 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Supichi.json b/data/developers/Supichi.json new file mode 100644 index 0000000000000000000000000000000000000000..6ecb1a54881b531ea993c765c78c24fa9899ec87 --- /dev/null +++ b/data/developers/Supichi.json @@ -0,0 +1,159 @@ +{ + "developer": "Supichi", + "models": [ + { + "id": "Supichi/BBA-123", + "name": "BBA-123", + "developer": "Supichi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.208, + "hfopenllm_v2/BBH": 0.292, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3499, + "hfopenllm_v2/MMLU-PRO": 0.1167 + } + }, + { + "id": "Supichi/BBA99", + "name": "BBA99", + "developer": "Supichi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1407, + "hfopenllm_v2/BBH": 0.2769, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3218, + "hfopenllm_v2/MMLU-PRO": 0.1112 + } + }, + { + "id": "Supichi/BBAIK29", + "name": "BBAIK29", + "developer": "Supichi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4588, + "hfopenllm_v2/BBH": 0.559, + "hfopenllm_v2/MATH Level 5": 0.3678, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4501, + "hfopenllm_v2/MMLU-PRO": 0.4469 + } + }, + { + "id": "Supichi/BBAI_135_Gemma", + "name": "BBAI_135_Gemma", + "developer": "Supichi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0656, + "hfopenllm_v2/BBH": 0.3568, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3805, + "hfopenllm_v2/MMLU-PRO": 0.1672 + } + }, + { + "id": "Supichi/BBAI_250_Xia0_gZ", + "name": "BBAI_250_Xia0_gZ", + "developer": "Supichi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4685, + "hfopenllm_v2/BBH": 0.5568, + "hfopenllm_v2/MATH Level 5": 0.364, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4579, + "hfopenllm_v2/MMLU-PRO": 0.4465 + } + }, + { + "id": "Supichi/BBAI_275_Tsunami_gZ", + "name": "BBAI_275_Tsunami_gZ", + "developer": "Supichi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.537, + "hfopenllm_v2/BBH": 0.5531, + "hfopenllm_v2/MATH Level 5": 0.3285, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4448, + "hfopenllm_v2/MMLU-PRO": 0.4492 + } + }, + { + "id": "Supichi/BBAI_525_Tsu_gZ_Xia0", + "name": "BBAI_525_Tsu_gZ_Xia0", + "developer": "Supichi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5339, + "hfopenllm_v2/BBH": 0.5562, + "hfopenllm_v2/MATH Level 5": 0.3429, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4474, + "hfopenllm_v2/MMLU-PRO": 0.4477 + } + }, + { + "id": "Supichi/BBAI_78B_Calme_3_1_Ties", + "name": "BBAI_78B_Calme_3_1_Ties", + "developer": "Supichi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1828, + "hfopenllm_v2/BBH": 0.2828, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.229, + "hfopenllm_v2/MUSR": 0.31, + "hfopenllm_v2/MMLU-PRO": 0.1144 + } + }, + { + "id": "Supichi/BBAI_QWEEN_V000000_LUMEN_14B", + "name": "BBAI_QWEEN_V000000_LUMEN_14B", + "developer": "Supichi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1815, + "hfopenllm_v2/BBH": 0.2297, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2315, + "hfopenllm_v2/MUSR": 0.3445, + "hfopenllm_v2/MMLU-PRO": 0.116 + } + }, + { + "id": "Supichi/HF_TOKEN", + "name": "HF_TOKEN", + "developer": "Supichi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.138, + "hfopenllm_v2/BBH": 0.2764, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3272, + "hfopenllm_v2/MMLU-PRO": 0.111 + } + }, + { + "id": "Supichi/NJS26", + "name": "NJS26", + "developer": "Supichi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0448, + "hfopenllm_v2/BBH": 0.478, + "hfopenllm_v2/MATH Level 5": 0.0325, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.3854, + "hfopenllm_v2/MMLU-PRO": 0.3037 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Svak.json b/data/developers/Svak.json new file mode 100644 index 0000000000000000000000000000000000000000..b354380187fdce2c4780478c1151e96721aa43f5 --- /dev/null +++ b/data/developers/Svak.json @@ -0,0 +1,33 @@ +{ + "developer": "Svak", + "models": [ + { + "id": "Svak/MN-12B-Inferor-v0.0", + "name": "MN-12B-Inferor-v0.0", + "developer": "Svak", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5708, + "hfopenllm_v2/BBH": 0.5195, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4639, + "hfopenllm_v2/MMLU-PRO": 0.3559 + } + }, + { + "id": "Svak/MN-12B-Inferor-v0.1", + "name": "MN-12B-Inferor-v0.1", + "developer": "Svak", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6347, + "hfopenllm_v2/BBH": 0.5147, + "hfopenllm_v2/MATH Level 5": 0.1261, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.4351, + "hfopenllm_v2/MMLU-PRO": 0.3662 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Syed-Hasan-8503.json b/data/developers/Syed-Hasan-8503.json new file mode 100644 index 0000000000000000000000000000000000000000..54097e5df4fffe1a64a108ab1414f546df9428a9 --- /dev/null +++ b/data/developers/Syed-Hasan-8503.json @@ -0,0 +1,19 @@ +{ + "developer": "Syed-Hasan-8503", + "models": [ + { + "id": "Syed-Hasan-8503/Phi-3-mini-4K-instruct-cpo-simpo", + "name": "Phi-3-mini-4K-instruct-cpo-simpo", + "developer": "Syed-Hasan-8503", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5714, + "hfopenllm_v2/BBH": 0.5682, + "hfopenllm_v2/MATH Level 5": 0.1571, + "hfopenllm_v2/GPQA": 0.3305, + "hfopenllm_v2/MUSR": 0.3964, + "hfopenllm_v2/MMLU-PRO": 0.3861 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/T145.json b/data/developers/T145.json new file mode 100644 index 0000000000000000000000000000000000000000..66d575dca223a9d26a763efc75545aeb619d8578 --- /dev/null +++ b/data/developers/T145.json @@ -0,0 +1,719 @@ +{ + "developer": "T145", + "models": [ + { + "id": "T145/KRONOS-8B-V1-P1", + "name": "KRONOS-8B-V1-P1", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.785, + "hfopenllm_v2/BBH": 0.5085, + "hfopenllm_v2/MATH Level 5": 0.1979, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.3881, + "hfopenllm_v2/MMLU-PRO": 0.376 + } + }, + { + "id": "T145/KRONOS-8B-V1-P2", + "name": "KRONOS-8B-V1-P2", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6724, + "hfopenllm_v2/BBH": 0.4772, + "hfopenllm_v2/MATH Level 5": 0.1601, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3568, + "hfopenllm_v2/MMLU-PRO": 0.3453 + } + }, + { + "id": "T145/KRONOS-8B-V1-P3", + "name": "KRONOS-8B-V1-P3", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7137, + "hfopenllm_v2/BBH": 0.5128, + "hfopenllm_v2/MATH Level 5": 0.1926, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3616, + "hfopenllm_v2/MMLU-PRO": 0.3405 + } + }, + { + "id": "T145/KRONOS-8B-V2", + "name": "KRONOS-8B-V2", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.518, + "hfopenllm_v2/BBH": 0.5133, + "hfopenllm_v2/MATH Level 5": 0.2266, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.3829, + "hfopenllm_v2/MMLU-PRO": 0.3738 + } + }, + { + "id": "T145/KRONOS-8B-V3", + "name": "KRONOS-8B-V3", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5475, + "hfopenllm_v2/BBH": 0.5119, + "hfopenllm_v2/MATH Level 5": 0.2598, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3922, + "hfopenllm_v2/MMLU-PRO": 0.3738 + } + }, + { + "id": "T145/KRONOS-8B-V4", + "name": "KRONOS-8B-V4", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7889, + "hfopenllm_v2/BBH": 0.5092, + "hfopenllm_v2/MATH Level 5": 0.1949, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.383, + "hfopenllm_v2/MMLU-PRO": 0.3786 + } + }, + { + "id": "T145/KRONOS-8B-V5", + "name": "KRONOS-8B-V5", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5405, + "hfopenllm_v2/BBH": 0.5089, + "hfopenllm_v2/MATH Level 5": 0.2689, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4055, + "hfopenllm_v2/MMLU-PRO": 0.3759 + } + }, + { + "id": "T145/KRONOS-8B-V6", + "name": "KRONOS-8B-V6", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7022, + "hfopenllm_v2/BBH": 0.5034, + "hfopenllm_v2/MATH Level 5": 0.2598, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.4121, + "hfopenllm_v2/MMLU-PRO": 0.3501 + } + }, + { + "id": "T145/KRONOS-8B-V7", + "name": "KRONOS-8B-V7", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3529, + "hfopenllm_v2/BBH": 0.4526, + "hfopenllm_v2/MATH Level 5": 0.111, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3671, + "hfopenllm_v2/MMLU-PRO": 0.2697 + } + }, + { + "id": "T145/KRONOS-8B-V8", + "name": "KRONOS-8B-V8", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.777, + "hfopenllm_v2/BBH": 0.5094, + "hfopenllm_v2/MATH Level 5": 0.2047, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3869, + "hfopenllm_v2/MMLU-PRO": 0.3782 + } + }, + { + "id": "T145/KRONOS-8B-V9", + "name": "KRONOS-8B-V9", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7856, + "hfopenllm_v2/BBH": 0.5099, + "hfopenllm_v2/MATH Level 5": 0.1986, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.3868, + "hfopenllm_v2/MMLU-PRO": 0.3752 + } + }, + { + "id": "T145/Llama-3.1-8B-Instruct-Zeus", + "name": "Llama-3.1-8B-Instruct-Zeus", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7941, + "hfopenllm_v2/BBH": 0.5174, + "hfopenllm_v2/MATH Level 5": 0.1956, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.3976, + "hfopenllm_v2/MMLU-PRO": 0.3893 + } + }, + { + "id": "T145/Llama-3.1-8B-Zeus", + "name": "Llama-3.1-8B-Zeus", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3518, + "hfopenllm_v2/BBH": 0.3671, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3316, + "hfopenllm_v2/MMLU-PRO": 0.1332 + } + }, + { + "id": "T145/Meta-Llama-3.1-8B-Instruct-TIES", + "name": "Meta-Llama-3.1-8B-Instruct-TIES", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5424, + "hfopenllm_v2/BBH": 0.507, + "hfopenllm_v2/MATH Level 5": 0.21, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3843, + "hfopenllm_v2/MMLU-PRO": 0.378 + } + }, + { + "id": "T145/ZEUS-8B-V10", + "name": "ZEUS-8B-V10", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7707, + "hfopenllm_v2/BBH": 0.527, + "hfopenllm_v2/MATH Level 5": 0.2115, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.3898, + "hfopenllm_v2/MMLU-PRO": 0.3904 + } + }, + { + "id": "T145/ZEUS-8B-V11", + "name": "ZEUS-8B-V11", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.81, + "hfopenllm_v2/BBH": 0.5162, + "hfopenllm_v2/MATH Level 5": 0.1964, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.3807, + "hfopenllm_v2/MMLU-PRO": 0.3884 + } + }, + { + "id": "T145/ZEUS-8B-V12", + "name": "ZEUS-8B-V12", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7816, + "hfopenllm_v2/BBH": 0.5254, + "hfopenllm_v2/MATH Level 5": 0.2115, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.3858, + "hfopenllm_v2/MMLU-PRO": 0.3912 + } + }, + { + "id": "T145/ZEUS-8B-V13", + "name": "ZEUS-8B-V13", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7904, + "hfopenllm_v2/BBH": 0.5277, + "hfopenllm_v2/MATH Level 5": 0.2137, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.3845, + "hfopenllm_v2/MMLU-PRO": 0.3911 + } + }, + { + "id": "T145/ZEUS-8B-V13-abliterated", + "name": "ZEUS-8B-V13-abliterated", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7878, + "hfopenllm_v2/BBH": 0.5198, + "hfopenllm_v2/MATH Level 5": 0.179, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.3871, + "hfopenllm_v2/MMLU-PRO": 0.3872 + } + }, + { + "id": "T145/ZEUS-8B-V14", + "name": "ZEUS-8B-V14", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7709, + "hfopenllm_v2/BBH": 0.5275, + "hfopenllm_v2/MATH Level 5": 0.213, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.3844, + "hfopenllm_v2/MMLU-PRO": 0.3914 + } + }, + { + "id": "T145/ZEUS-8B-V15", + "name": "ZEUS-8B-V15", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7013, + "hfopenllm_v2/BBH": 0.5538, + "hfopenllm_v2/MATH Level 5": 0.2304, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.402, + "hfopenllm_v2/MMLU-PRO": 0.4059 + } + }, + { + "id": "T145/ZEUS-8B-V16", + "name": "ZEUS-8B-V16", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7925, + "hfopenllm_v2/BBH": 0.5266, + "hfopenllm_v2/MATH Level 5": 0.2205, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.3951, + "hfopenllm_v2/MMLU-PRO": 0.3926 + } + }, + { + "id": "T145/ZEUS-8B-V17", + "name": "ZEUS-8B-V17", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7941, + "hfopenllm_v2/BBH": 0.5251, + "hfopenllm_v2/MATH Level 5": 0.2243, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4016, + "hfopenllm_v2/MMLU-PRO": 0.3935 + } + }, + { + "id": "T145/ZEUS-8B-V17-abliterated", + "name": "ZEUS-8B-V17-abliterated", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7576, + "hfopenllm_v2/BBH": 0.52, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4269, + "hfopenllm_v2/MMLU-PRO": 0.3622 + } + }, + { + "id": "T145/ZEUS-8B-V17-abliterated-V2", + "name": "ZEUS-8B-V17-abliterated-V2", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6532, + "hfopenllm_v2/BBH": 0.4928, + "hfopenllm_v2/MATH Level 5": 0.1118, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3407, + "hfopenllm_v2/MMLU-PRO": 0.3402 + } + }, + { + "id": "T145/ZEUS-8B-V17-abliterated-V4", + "name": "ZEUS-8B-V17-abliterated-V4", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7228, + "hfopenllm_v2/BBH": 0.5169, + "hfopenllm_v2/MATH Level 5": 0.0937, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4187, + "hfopenllm_v2/MMLU-PRO": 0.3774 + } + }, + { + "id": "T145/ZEUS-8B-V18", + "name": "ZEUS-8B-V18", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7834, + "hfopenllm_v2/BBH": 0.527, + "hfopenllm_v2/MATH Level 5": 0.2183, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4043, + "hfopenllm_v2/MMLU-PRO": 0.3942 + } + }, + { + "id": "T145/ZEUS-8B-V19", + "name": "ZEUS-8B-V19", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7883, + "hfopenllm_v2/BBH": 0.5276, + "hfopenllm_v2/MATH Level 5": 0.2205, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4043, + "hfopenllm_v2/MMLU-PRO": 0.3934 + } + }, + { + "id": "T145/ZEUS-8B-V2", + "name": "ZEUS-8B-V2", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8029, + "hfopenllm_v2/BBH": 0.5194, + "hfopenllm_v2/MATH Level 5": 0.216, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.391, + "hfopenllm_v2/MMLU-PRO": 0.3896 + } + }, + { + "id": "T145/ZEUS-8B-V2-ORPO", + "name": "ZEUS-8B-V2-ORPO", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7187, + "hfopenllm_v2/BBH": 0.5075, + "hfopenllm_v2/MATH Level 5": 0.1828, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.3935, + "hfopenllm_v2/MMLU-PRO": 0.3678 + } + }, + { + "id": "T145/ZEUS-8B-V2-abliterated", + "name": "ZEUS-8B-V2-abliterated", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7895, + "hfopenllm_v2/BBH": 0.5129, + "hfopenllm_v2/MATH Level 5": 0.2115, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.3911, + "hfopenllm_v2/MMLU-PRO": 0.3825 + } + }, + { + "id": "T145/ZEUS-8B-V20", + "name": "ZEUS-8B-V20", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7956, + "hfopenllm_v2/BBH": 0.5244, + "hfopenllm_v2/MATH Level 5": 0.219, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4043, + "hfopenllm_v2/MMLU-PRO": 0.393 + } + }, + { + "id": "T145/ZEUS-8B-V21", + "name": "ZEUS-8B-V21", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3785, + "hfopenllm_v2/BBH": 0.3398, + "hfopenllm_v2/MATH Level 5": 0.1594, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1714 + } + }, + { + "id": "T145/ZEUS-8B-V22", + "name": "ZEUS-8B-V22", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7995, + "hfopenllm_v2/BBH": 0.5245, + "hfopenllm_v2/MATH Level 5": 0.2228, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.399, + "hfopenllm_v2/MMLU-PRO": 0.3938 + } + }, + { + "id": "T145/ZEUS-8B-V23", + "name": "ZEUS-8B-V23", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7621, + "hfopenllm_v2/BBH": 0.5195, + "hfopenllm_v2/MATH Level 5": 0.182, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.3922, + "hfopenllm_v2/MMLU-PRO": 0.3666 + } + }, + { + "id": "T145/ZEUS-8B-V24", + "name": "ZEUS-8B-V24", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6, + "hfopenllm_v2/BBH": 0.4778, + "hfopenllm_v2/MATH Level 5": 0.1458, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3729, + "hfopenllm_v2/MMLU-PRO": 0.3285 + } + }, + { + "id": "T145/ZEUS-8B-V25", + "name": "ZEUS-8B-V25", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.332, + "hfopenllm_v2/BBH": 0.4547, + "hfopenllm_v2/MATH Level 5": 0.2039, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3488, + "hfopenllm_v2/MMLU-PRO": 0.2885 + } + }, + { + "id": "T145/ZEUS-8B-V26", + "name": "ZEUS-8B-V26", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6708, + "hfopenllm_v2/BBH": 0.5232, + "hfopenllm_v2/MATH Level 5": 0.1246, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4016, + "hfopenllm_v2/MMLU-PRO": 0.3907 + } + }, + { + "id": "T145/ZEUS-8B-V27", + "name": "ZEUS-8B-V27", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6544, + "hfopenllm_v2/BBH": 0.523, + "hfopenllm_v2/MATH Level 5": 0.1344, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.3977, + "hfopenllm_v2/MMLU-PRO": 0.3902 + } + }, + { + "id": "T145/ZEUS-8B-V28", + "name": "ZEUS-8B-V28", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6353, + "hfopenllm_v2/BBH": 0.5254, + "hfopenllm_v2/MATH Level 5": 0.1269, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.3896, + "hfopenllm_v2/MMLU-PRO": 0.3902 + } + }, + { + "id": "T145/ZEUS-8B-V29", + "name": "ZEUS-8B-V29", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7418, + "hfopenllm_v2/BBH": 0.5253, + "hfopenllm_v2/MATH Level 5": 0.1601, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.4003, + "hfopenllm_v2/MMLU-PRO": 0.392 + } + }, + { + "id": "T145/ZEUS-8B-V2L1", + "name": "ZEUS-8B-V2L1", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3192, + "hfopenllm_v2/BBH": 0.5013, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.3882, + "hfopenllm_v2/MMLU-PRO": 0.3638 + } + }, + { + "id": "T145/ZEUS-8B-V2L2", + "name": "ZEUS-8B-V2L2", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8021, + "hfopenllm_v2/BBH": 0.5203, + "hfopenllm_v2/MATH Level 5": 0.2017, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.3975, + "hfopenllm_v2/MMLU-PRO": 0.3884 + } + }, + { + "id": "T145/ZEUS-8B-V3", + "name": "ZEUS-8B-V3", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7887, + "hfopenllm_v2/BBH": 0.5265, + "hfopenllm_v2/MATH Level 5": 0.1677, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4017, + "hfopenllm_v2/MMLU-PRO": 0.3804 + } + }, + { + "id": "T145/ZEUS-8B-V30", + "name": "ZEUS-8B-V30", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7436, + "hfopenllm_v2/BBH": 0.5243, + "hfopenllm_v2/MATH Level 5": 0.1586, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4029, + "hfopenllm_v2/MMLU-PRO": 0.3944 + } + }, + { + "id": "T145/ZEUS-8B-V4", + "name": "ZEUS-8B-V4", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7807, + "hfopenllm_v2/BBH": 0.5246, + "hfopenllm_v2/MATH Level 5": 0.1926, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4029, + "hfopenllm_v2/MMLU-PRO": 0.3788 + } + }, + { + "id": "T145/ZEUS-8B-V6", + "name": "ZEUS-8B-V6", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7838, + "hfopenllm_v2/BBH": 0.524, + "hfopenllm_v2/MATH Level 5": 0.2024, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4068, + "hfopenllm_v2/MMLU-PRO": 0.3759 + } + }, + { + "id": "T145/ZEUS-8B-V7", + "name": "ZEUS-8B-V7", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7786, + "hfopenllm_v2/BBH": 0.507, + "hfopenllm_v2/MATH Level 5": 0.148, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4162, + "hfopenllm_v2/MMLU-PRO": 0.3812 + } + }, + { + "id": "T145/ZEUS-8B-V8", + "name": "ZEUS-8B-V8", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7914, + "hfopenllm_v2/BBH": 0.5065, + "hfopenllm_v2/MATH Level 5": 0.1329, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.4214, + "hfopenllm_v2/MMLU-PRO": 0.3761 + } + }, + { + "id": "T145/ZEUS-8B-V9", + "name": "ZEUS-8B-V9", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5551, + "hfopenllm_v2/BBH": 0.5207, + "hfopenllm_v2/MATH Level 5": 0.2137, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.3949, + "hfopenllm_v2/MMLU-PRO": 0.3901 + } + }, + { + "id": "T145/qwen-2.5-3B-merge-test", + "name": "qwen-2.5-3B-merge-test", + "developer": "T145", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5751, + "hfopenllm_v2/BBH": 0.4842, + "hfopenllm_v2/MATH Level 5": 0.3202, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4007, + "hfopenllm_v2/MMLU-PRO": 0.329 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/THUDM.json b/data/developers/THUDM.json new file mode 100644 index 0000000000000000000000000000000000000000..03f33b182a312a7c7bdb9d8d0ede927124c9fcd0 --- /dev/null +++ b/data/developers/THUDM.json @@ -0,0 +1,75 @@ +{ + "developer": "THUDM", + "models": [ + { + "id": "THUDM/glm-4-9b", + "name": "glm-4-9b", + "developer": "THUDM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1426, + "hfopenllm_v2/BBH": 0.5528, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4386, + "hfopenllm_v2/MMLU-PRO": 0.4145 + } + }, + { + "id": "THUDM/glm-4-9b-chat", + "name": "glm-4-9b-chat", + "developer": "THUDM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0, + "hfopenllm_v2/BBH": 0.4736, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.3994, + "hfopenllm_v2/MMLU-PRO": 0.3167 + } + }, + { + "id": "THUDM/glm-4-9b-chat-1m", + "name": "glm-4-9b-chat-1m", + "developer": "THUDM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0, + "hfopenllm_v2/BBH": 0.418, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.3795, + "hfopenllm_v2/MMLU-PRO": 0.3163 + } + }, + { + "id": "THUDM/glm-4-9b-chat-1m-hf", + "name": "glm-4-9b-chat-1m-hf", + "developer": "THUDM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5341, + "hfopenllm_v2/BBH": 0.3901, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3689, + "hfopenllm_v2/MMLU-PRO": 0.1814 + } + }, + { + "id": "THUDM/glm-4-9b-chat-hf", + "name": "glm-4-9b-chat-hf", + "developer": "THUDM", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6513, + "hfopenllm_v2/BBH": 0.4432, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.3593, + "hfopenllm_v2/MMLU-PRO": 0.2774 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/TIGER-Lab.json b/data/developers/TIGER-Lab.json new file mode 100644 index 0000000000000000000000000000000000000000..b8cefe535b9ad8717beca5a57c1a96b5095d11df --- /dev/null +++ b/data/developers/TIGER-Lab.json @@ -0,0 +1,89 @@ +{ + "developer": "TIGER-Lab", + "models": [ + { + "id": "TIGER-Lab/AceCodeRM-7B", + "name": "AceCodeRM-7B", + "developer": "TIGER-Lab", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5855, + "hfopenllm_v2/BBH": 0.4773, + "hfopenllm_v2/MATH Level 5": 0.3467, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4192, + "hfopenllm_v2/MMLU-PRO": 0.3361 + } + }, + { + "id": "TIGER-Lab/AceCoder-Qwen2.5-7B-Ins-Rule", + "name": "AceCoder-Qwen2.5-7B-Ins-Rule", + "developer": "TIGER-Lab", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7424, + "hfopenllm_v2/BBH": 0.5404, + "hfopenllm_v2/MATH Level 5": 0.4992, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.398, + "hfopenllm_v2/MMLU-PRO": 0.4322 + } + }, + { + "id": "TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Base-Rule", + "name": "AceCoder-Qwen2.5-Coder-7B-Base-Rule", + "developer": "TIGER-Lab", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4408, + "hfopenllm_v2/BBH": 0.4902, + "hfopenllm_v2/MATH Level 5": 0.2017, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3449, + "hfopenllm_v2/MMLU-PRO": 0.3745 + } + }, + { + "id": "TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Ins-Rule", + "name": "AceCoder-Qwen2.5-Coder-7B-Ins-Rule", + "developer": "TIGER-Lab", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6222, + "hfopenllm_v2/BBH": 0.5089, + "hfopenllm_v2/MATH Level 5": 0.3603, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4046, + "hfopenllm_v2/MMLU-PRO": 0.3428 + } + }, + { + "id": "TIGER-Lab/MAmmoTH2-7B-Plus", + "name": "MAmmoTH2-7B-Plus", + "developer": "TIGER-Lab", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5575, + "hfopenllm_v2/BBH": 0.4235, + "hfopenllm_v2/MATH Level 5": 0.1858, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.4124, + "hfopenllm_v2/MMLU-PRO": 0.3017 + } + }, + { + "id": "TIGER-Lab/Qwen2.5-Math-7B-CFT", + "name": "Qwen2.5-Math-7B-CFT", + "developer": "TIGER-Lab", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2777, + "hfopenllm_v2/BBH": 0.4637, + "hfopenllm_v2/MATH Level 5": 0.5574, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.3887, + "hfopenllm_v2/MMLU-PRO": 0.2945 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/TTTXXX01.json b/data/developers/TTTXXX01.json new file mode 100644 index 0000000000000000000000000000000000000000..bb862daad6569fae6d4762f1274132ef4b4a48d3 --- /dev/null +++ b/data/developers/TTTXXX01.json @@ -0,0 +1,19 @@ +{ + "developer": "TTTXXX01", + "models": [ + { + "id": "TTTXXX01/Mistral-7B-Base-SimPO2-5e-7", + "name": "Mistral-7B-Base-SimPO2-5e-7", + "developer": "TTTXXX01", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4392, + "hfopenllm_v2/BBH": 0.432, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.3604, + "hfopenllm_v2/MMLU-PRO": 0.2766 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Tarek07.json b/data/developers/Tarek07.json new file mode 100644 index 0000000000000000000000000000000000000000..8b3233c296ab77e65724be7e95482f31b6b0200b --- /dev/null +++ b/data/developers/Tarek07.json @@ -0,0 +1,33 @@ +{ + "developer": "Tarek07", + "models": [ + { + "id": "Tarek07/Progenitor-V1.1-LLaMa-70B", + "name": "Progenitor-V1.1-LLaMa-70B", + "developer": "Tarek07", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6906, + "hfopenllm_v2/BBH": 0.6971, + "hfopenllm_v2/MATH Level 5": 0.3573, + "hfopenllm_v2/GPQA": 0.4581, + "hfopenllm_v2/MUSR": 0.4736, + "hfopenllm_v2/MMLU-PRO": 0.5465 + } + }, + { + "id": "Tarek07/Thalassic-Alpha-LLaMa-70B", + "name": "Thalassic-Alpha-LLaMa-70B", + "developer": "Tarek07", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7003, + "hfopenllm_v2/BBH": 0.694, + "hfopenllm_v2/MATH Level 5": 0.315, + "hfopenllm_v2/GPQA": 0.4438, + "hfopenllm_v2/MUSR": 0.4802, + "hfopenllm_v2/MMLU-PRO": 0.5435 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/TeeZee.json b/data/developers/TeeZee.json new file mode 100644 index 0000000000000000000000000000000000000000..144d54ca187a75a6a7a41a675e96d0e2498668c9 --- /dev/null +++ b/data/developers/TeeZee.json @@ -0,0 +1,19 @@ +{ + "developer": "TeeZee", + "models": [ + { + "id": "TeeZee/DoubleBagel-57B-v1.0", + "name": "DoubleBagel-57B-v1.0", + "developer": "TeeZee", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2336, + "hfopenllm_v2/BBH": 0.3251, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.4315, + "hfopenllm_v2/MMLU-PRO": 0.1478 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Telugu-LLM-Labs.json b/data/developers/Telugu-LLM-Labs.json new file mode 100644 index 0000000000000000000000000000000000000000..1b4f9bae71f6b7e1507f62ac83550ac483c30b65 --- /dev/null +++ b/data/developers/Telugu-LLM-Labs.json @@ -0,0 +1,33 @@ +{ + "developer": "Telugu-LLM-Labs", + "models": [ + { + "id": "Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa-2.0", + "name": "Indic-gemma-2b-finetuned-sft-Navarasa-2.0", + "developer": "Telugu-LLM-Labs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2103, + "hfopenllm_v2/BBH": 0.3241, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.2433, + "hfopenllm_v2/MUSR": 0.3899, + "hfopenllm_v2/MMLU-PRO": 0.1279 + } + }, + { + "id": "Telugu-LLM-Labs/Indic-gemma-7b-finetuned-sft-Navarasa-2.0", + "name": "Indic-gemma-7b-finetuned-sft-Navarasa-2.0", + "developer": "Telugu-LLM-Labs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3237, + "hfopenllm_v2/BBH": 0.4023, + "hfopenllm_v2/MATH Level 5": 0.0257, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.4083, + "hfopenllm_v2/MMLU-PRO": 0.235 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/TencentARC.json b/data/developers/TencentARC.json new file mode 100644 index 0000000000000000000000000000000000000000..6bdcb11bbff177b71f03310639417506e24254cb --- /dev/null +++ b/data/developers/TencentARC.json @@ -0,0 +1,61 @@ +{ + "developer": "TencentARC", + "models": [ + { + "id": "TencentARC/LLaMA-Pro-8B", + "name": "LLaMA-Pro-8B", + "developer": "TencentARC", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2277, + "hfopenllm_v2/BBH": 0.3484, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.4018, + "hfopenllm_v2/MMLU-PRO": 0.1811 + } + }, + { + "id": "TencentARC/LLaMA-Pro-8B-Instruct", + "name": "LLaMA-Pro-8B-Instruct", + "developer": "TencentARC", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4486, + "hfopenllm_v2/BBH": 0.4224, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.419, + "hfopenllm_v2/MMLU-PRO": 0.1946 + } + }, + { + "id": "TencentARC/MetaMath-Mistral-Pro", + "name": "MetaMath-Mistral-Pro", + "developer": "TencentARC", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2119, + "hfopenllm_v2/BBH": 0.4413, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3524, + "hfopenllm_v2/MMLU-PRO": 0.2472 + } + }, + { + "id": "TencentARC/Mistral_Pro_8B_v0.1", + "name": "Mistral_Pro_8B_v0.1", + "developer": "TencentARC", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2115, + "hfopenllm_v2/BBH": 0.4526, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.4242, + "hfopenllm_v2/MMLU-PRO": 0.2765 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/TheDrummer.json b/data/developers/TheDrummer.json new file mode 100644 index 0000000000000000000000000000000000000000..113c04c24075576f2c3900dfda1978f8623653fc --- /dev/null +++ b/data/developers/TheDrummer.json @@ -0,0 +1,131 @@ +{ + "developer": "TheDrummer", + "models": [ + { + "id": "TheDrummer/Cydonia-22B-v1.2", + "name": "Cydonia-22B-v1.2", + "developer": "TheDrummer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5635, + "hfopenllm_v2/BBH": 0.5809, + "hfopenllm_v2/MATH Level 5": 0.2032, + "hfopenllm_v2/GPQA": 0.3305, + "hfopenllm_v2/MUSR": 0.4022, + "hfopenllm_v2/MMLU-PRO": 0.4141 + } + }, + { + "id": "TheDrummer/Gemmasutra-9B-v1", + "name": "Gemmasutra-9B-v1", + "developer": "TheDrummer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2416, + "hfopenllm_v2/BBH": 0.5887, + "hfopenllm_v2/MATH Level 5": 0.0831, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4846, + "hfopenllm_v2/MMLU-PRO": 0.4045 + } + }, + { + "id": "TheDrummer/Gemmasutra-Mini-2B-v1", + "name": "Gemmasutra-Mini-2B-v1", + "developer": "TheDrummer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2549, + "hfopenllm_v2/BBH": 0.3575, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.349, + "hfopenllm_v2/MMLU-PRO": 0.2055 + } + }, + { + "id": "TheDrummer/Llama-3SOME-8B-v2", + "name": "Llama-3SOME-8B-v2", + "developer": "TheDrummer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4508, + "hfopenllm_v2/BBH": 0.5203, + "hfopenllm_v2/MATH Level 5": 0.0937, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.3833, + "hfopenllm_v2/MMLU-PRO": 0.3753 + } + }, + { + "id": "TheDrummer/Ministrations-8B-v1", + "name": "Ministrations-8B-v1", + "developer": "TheDrummer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2822, + "hfopenllm_v2/BBH": 0.4877, + "hfopenllm_v2/MATH Level 5": 0.1843, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4449, + "hfopenllm_v2/MMLU-PRO": 0.3644 + } + }, + { + "id": "TheDrummer/Rocinante-12B-v1", + "name": "Rocinante-12B-v1", + "developer": "TheDrummer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6076, + "hfopenllm_v2/BBH": 0.5065, + "hfopenllm_v2/MATH Level 5": 0.1269, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4017, + "hfopenllm_v2/MMLU-PRO": 0.3477 + } + }, + { + "id": "TheDrummer/Tiger-Gemma-9B-v1", + "name": "Tiger-Gemma-9B-v1", + "developer": "TheDrummer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7282, + "hfopenllm_v2/BBH": 0.5704, + "hfopenllm_v2/MATH Level 5": 0.1835, + "hfopenllm_v2/GPQA": 0.3389, + "hfopenllm_v2/MUSR": 0.4162, + "hfopenllm_v2/MMLU-PRO": 0.4118 + } + }, + { + "id": "TheDrummer/Tiger-Gemma-9B-v2", + "name": "Tiger-Gemma-9B-v2", + "developer": "TheDrummer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6986, + "hfopenllm_v2/BBH": 0.5617, + "hfopenllm_v2/MATH Level 5": 0.182, + "hfopenllm_v2/GPQA": 0.3398, + "hfopenllm_v2/MUSR": 0.4084, + "hfopenllm_v2/MMLU-PRO": 0.4112 + } + }, + { + "id": "TheDrummer/Tiger-Gemma-9B-v3", + "name": "Tiger-Gemma-9B-v3", + "developer": "TheDrummer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6821, + "hfopenllm_v2/BBH": 0.5812, + "hfopenllm_v2/MATH Level 5": 0.1624, + "hfopenllm_v2/GPQA": 0.3389, + "hfopenllm_v2/MUSR": 0.4004, + "hfopenllm_v2/MMLU-PRO": 0.4059 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/TheDrunkenSnail.json b/data/developers/TheDrunkenSnail.json new file mode 100644 index 0000000000000000000000000000000000000000..30efc6e1511d9b33e12c49f5c4903c88e8b660fe --- /dev/null +++ b/data/developers/TheDrunkenSnail.json @@ -0,0 +1,47 @@ +{ + "developer": "TheDrunkenSnail", + "models": [ + { + "id": "TheDrunkenSnail/Daughter-of-Rhodia-12B", + "name": "Daughter-of-Rhodia-12B", + "developer": "TheDrunkenSnail", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6904, + "hfopenllm_v2/BBH": 0.5179, + "hfopenllm_v2/MATH Level 5": 0.1224, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4348, + "hfopenllm_v2/MMLU-PRO": 0.3641 + } + }, + { + "id": "TheDrunkenSnail/Mother-of-Rhodia-12B", + "name": "Mother-of-Rhodia-12B", + "developer": "TheDrunkenSnail", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6505, + "hfopenllm_v2/BBH": 0.4948, + "hfopenllm_v2/MATH Level 5": 0.1224, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4124, + "hfopenllm_v2/MMLU-PRO": 0.3551 + } + }, + { + "id": "TheDrunkenSnail/Son-of-Rhodia", + "name": "Son-of-Rhodia", + "developer": "TheDrunkenSnail", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7046, + "hfopenllm_v2/BBH": 0.5097, + "hfopenllm_v2/MATH Level 5": 0.1314, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4203, + "hfopenllm_v2/MMLU-PRO": 0.3608 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/TheHierophant.json b/data/developers/TheHierophant.json new file mode 100644 index 0000000000000000000000000000000000000000..c5942c1f64f18b40b781b7a930f223e096946f47 --- /dev/null +++ b/data/developers/TheHierophant.json @@ -0,0 +1,19 @@ +{ + "developer": "TheHierophant", + "models": [ + { + "id": "TheHierophant/Underground-Cognitive-V0.3-test", + "name": "Underground-Cognitive-V0.3-test", + "developer": "TheHierophant", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4808, + "hfopenllm_v2/BBH": 0.529, + "hfopenllm_v2/MATH Level 5": 0.0589, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4351, + "hfopenllm_v2/MMLU-PRO": 0.3318 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/TheTsar1209.json b/data/developers/TheTsar1209.json new file mode 100644 index 0000000000000000000000000000000000000000..3287d82fbdad8add23c36e578b883cdb742fbacb --- /dev/null +++ b/data/developers/TheTsar1209.json @@ -0,0 +1,103 @@ +{ + "developer": "TheTsar1209", + "models": [ + { + "id": "TheTsar1209/nemo-carpmuscle-v0.1", + "name": "nemo-carpmuscle-v0.1", + "developer": "TheTsar1209", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2276, + "hfopenllm_v2/BBH": 0.5084, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4135, + "hfopenllm_v2/MMLU-PRO": 0.3406 + } + }, + { + "id": "TheTsar1209/qwen-carpmuscle-r-v0.3", + "name": "qwen-carpmuscle-r-v0.3", + "developer": "TheTsar1209", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4455, + "hfopenllm_v2/BBH": 0.6227, + "hfopenllm_v2/MATH Level 5": 0.3006, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4278, + "hfopenllm_v2/MMLU-PRO": 0.5103 + } + }, + { + "id": "TheTsar1209/qwen-carpmuscle-v0.1", + "name": "qwen-carpmuscle-v0.1", + "developer": "TheTsar1209", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5622, + "hfopenllm_v2/BBH": 0.6434, + "hfopenllm_v2/MATH Level 5": 0.2628, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.4161, + "hfopenllm_v2/MMLU-PRO": 0.52 + } + }, + { + "id": "TheTsar1209/qwen-carpmuscle-v0.2", + "name": "qwen-carpmuscle-v0.2", + "developer": "TheTsar1209", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5257, + "hfopenllm_v2/BBH": 0.6387, + "hfopenllm_v2/MATH Level 5": 0.2832, + "hfopenllm_v2/GPQA": 0.3557, + "hfopenllm_v2/MUSR": 0.4346, + "hfopenllm_v2/MMLU-PRO": 0.5147 + } + }, + { + "id": "TheTsar1209/qwen-carpmuscle-v0.3", + "name": "qwen-carpmuscle-v0.3", + "developer": "TheTsar1209", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4476, + "hfopenllm_v2/BBH": 0.6152, + "hfopenllm_v2/MATH Level 5": 0.3134, + "hfopenllm_v2/GPQA": 0.3565, + "hfopenllm_v2/MUSR": 0.4132, + "hfopenllm_v2/MMLU-PRO": 0.5062 + } + }, + { + "id": "TheTsar1209/qwen-carpmuscle-v0.4", + "name": "qwen-carpmuscle-v0.4", + "developer": "TheTsar1209", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7202, + "hfopenllm_v2/BBH": 0.6454, + "hfopenllm_v2/MATH Level 5": 0.2772, + "hfopenllm_v2/GPQA": 0.3523, + "hfopenllm_v2/MUSR": 0.4516, + "hfopenllm_v2/MMLU-PRO": 0.5144 + } + }, + { + "id": "TheTsar1209/qwen-carpmuscle-v0.4.1", + "name": "qwen-carpmuscle-v0.4.1", + "developer": "TheTsar1209", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.736, + "hfopenllm_v2/BBH": 0.6507, + "hfopenllm_v2/MATH Level 5": 0.2779, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4489, + "hfopenllm_v2/MMLU-PRO": 0.5191 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Tijmen2.json b/data/developers/Tijmen2.json new file mode 100644 index 0000000000000000000000000000000000000000..704d058bd3030289404726f9aae333117b3bed0b --- /dev/null +++ b/data/developers/Tijmen2.json @@ -0,0 +1,19 @@ +{ + "developer": "Tijmen2", + "models": [ + { + "id": "Tijmen2/cosmosage-v3", + "name": "cosmosage-v3", + "developer": "Tijmen2", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4482, + "hfopenllm_v2/BBH": 0.4551, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4199, + "hfopenllm_v2/MMLU-PRO": 0.2486 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/TinyLlama.json b/data/developers/TinyLlama.json new file mode 100644 index 0000000000000000000000000000000000000000..497ffeb47f52828836f7758f7a9a1082c24b52ea --- /dev/null +++ b/data/developers/TinyLlama.json @@ -0,0 +1,89 @@ +{ + "developer": "TinyLlama", + "models": [ + { + "id": "TinyLlama/TinyLlama-1.1B-Chat-v0.1", + "name": "TinyLlama-1.1B-Chat-v0.1", + "developer": "TinyLlama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1479, + "hfopenllm_v2/BBH": 0.3084, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.229, + "hfopenllm_v2/MUSR": 0.3592, + "hfopenllm_v2/MMLU-PRO": 0.1098 + } + }, + { + "id": "TinyLlama/TinyLlama-1.1B-Chat-v0.5", + "name": "TinyLlama-1.1B-Chat-v0.5", + "developer": "TinyLlama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1634, + "hfopenllm_v2/BBH": 0.3105, + "hfopenllm_v2/MATH Level 5": 0.0038, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.3661, + "hfopenllm_v2/MMLU-PRO": 0.1096 + } + }, + { + "id": "TinyLlama/TinyLlama-1.1B-Chat-v0.6", + "name": "TinyLlama-1.1B-Chat-v0.6", + "developer": "TinyLlama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1574, + "hfopenllm_v2/BBH": 0.3067, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3422, + "hfopenllm_v2/MMLU-PRO": 0.1149 + } + }, + { + "id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "name": "TinyLlama-1.1B-Chat-v1.0", + "developer": "TinyLlama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0596, + "hfopenllm_v2/BBH": 0.3104, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3515, + "hfopenllm_v2/MMLU-PRO": 0.1101 + } + }, + { + "id": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "name": "TinyLlama-1.1B-intermediate-step-1431k-3T", + "developer": "TinyLlama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2277, + "hfopenllm_v2/BBH": 0.3071, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.338, + "hfopenllm_v2/MMLU-PRO": 0.112 + } + }, + { + "id": "TinyLlama/TinyLlama_v1.1", + "name": "TinyLlama_v1.1", + "developer": "TinyLlama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2001, + "hfopenllm_v2/BBH": 0.3024, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.37, + "hfopenllm_v2/MMLU-PRO": 0.1049 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ToastyPigeon.json b/data/developers/ToastyPigeon.json new file mode 100644 index 0000000000000000000000000000000000000000..78d9a15ae7c4e769870922b86d113659a479616b --- /dev/null +++ b/data/developers/ToastyPigeon.json @@ -0,0 +1,19 @@ +{ + "developer": "ToastyPigeon", + "models": [ + { + "id": "ToastyPigeon/Sto-vo-kor-12B", + "name": "Sto-vo-kor-12B", + "developer": "ToastyPigeon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5501, + "hfopenllm_v2/BBH": 0.5065, + "hfopenllm_v2/MATH Level 5": 0.1088, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.3938, + "hfopenllm_v2/MMLU-PRO": 0.3398 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Trappu.json b/data/developers/Trappu.json new file mode 100644 index 0000000000000000000000000000000000000000..68f5894c486680acbd0d60271a2cad4b3980837d --- /dev/null +++ b/data/developers/Trappu.json @@ -0,0 +1,33 @@ +{ + "developer": "Trappu", + "models": [ + { + "id": "Trappu/Magnum-Picaro-0.7-v2-12b", + "name": "Magnum-Picaro-0.7-v2-12b", + "developer": "Trappu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3003, + "hfopenllm_v2/BBH": 0.5507, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4727, + "hfopenllm_v2/MMLU-PRO": 0.358 + } + }, + { + "id": "Trappu/Nemo-Picaro-12B", + "name": "Nemo-Picaro-12B", + "developer": "Trappu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2577, + "hfopenllm_v2/BBH": 0.549, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.4726, + "hfopenllm_v2/MMLU-PRO": 0.3605 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Tremontaine.json b/data/developers/Tremontaine.json new file mode 100644 index 0000000000000000000000000000000000000000..20b1ca478985df9227ce5715f269cd38654609db --- /dev/null +++ b/data/developers/Tremontaine.json @@ -0,0 +1,19 @@ +{ + "developer": "Tremontaine", + "models": [ + { + "id": "Tremontaine/L3-12B-Lunaris-v1", + "name": "L3-12B-Lunaris-v1", + "developer": "Tremontaine", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6909, + "hfopenllm_v2/BBH": 0.523, + "hfopenllm_v2/MATH Level 5": 0.0876, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.3674, + "hfopenllm_v2/MMLU-PRO": 0.3775 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Triangle104.json b/data/developers/Triangle104.json new file mode 100644 index 0000000000000000000000000000000000000000..b5dc4366af66252a7cff6c2cd383e780abb24eae --- /dev/null +++ b/data/developers/Triangle104.json @@ -0,0 +1,859 @@ +{ + "developer": "Triangle104", + "models": [ + { + "id": "Triangle104/Annunaki-12b", + "name": "Annunaki-12b", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3872, + "hfopenllm_v2/BBH": 0.5499, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4409, + "hfopenllm_v2/MMLU-PRO": 0.3721 + } + }, + { + "id": "Triangle104/BigTalker-Lite-8B", + "name": "BigTalker-Lite-8B", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3689, + "hfopenllm_v2/BBH": 0.5308, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4208, + "hfopenllm_v2/MMLU-PRO": 0.3431 + } + }, + { + "id": "Triangle104/Chatty-Harry_V2.0", + "name": "Chatty-Harry_V2.0", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3326, + "hfopenllm_v2/BBH": 0.5319, + "hfopenllm_v2/MATH Level 5": 0.139, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4078, + "hfopenllm_v2/MMLU-PRO": 0.3683 + } + }, + { + "id": "Triangle104/Chatty-Harry_V3.0", + "name": "Chatty-Harry_V3.0", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3675, + "hfopenllm_v2/BBH": 0.5526, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4408, + "hfopenllm_v2/MMLU-PRO": 0.3702 + } + }, + { + "id": "Triangle104/Chronos-Prism_V1.0", + "name": "Chronos-Prism_V1.0", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3259, + "hfopenllm_v2/BBH": 0.5554, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4263, + "hfopenllm_v2/MMLU-PRO": 0.3673 + } + }, + { + "id": "Triangle104/DS-Distilled-Hermes-Llama-3.1", + "name": "DS-Distilled-Hermes-Llama-3.1", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3229, + "hfopenllm_v2/BBH": 0.5117, + "hfopenllm_v2/MATH Level 5": 0.2931, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4039, + "hfopenllm_v2/MMLU-PRO": 0.311 + } + }, + { + "id": "Triangle104/DS-Distilled-Hermes-Llama-3.1_TIES", + "name": "DS-Distilled-Hermes-Llama-3.1_TIES", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1364, + "hfopenllm_v2/BBH": 0.2928, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.245, + "hfopenllm_v2/MUSR": 0.3621, + "hfopenllm_v2/MMLU-PRO": 0.1104 + } + }, + { + "id": "Triangle104/DS-R1-Distill-Q2.5-10B-Harmony", + "name": "DS-R1-Distill-Q2.5-10B-Harmony", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1751, + "hfopenllm_v2/BBH": 0.2643, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2106, + "hfopenllm_v2/MUSR": 0.3128, + "hfopenllm_v2/MMLU-PRO": 0.1173 + } + }, + { + "id": "Triangle104/DS-R1-Distill-Q2.5-14B-Harmony_V0.1", + "name": "DS-R1-Distill-Q2.5-14B-Harmony_V0.1", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4515, + "hfopenllm_v2/BBH": 0.5783, + "hfopenllm_v2/MATH Level 5": 0.5551, + "hfopenllm_v2/GPQA": 0.3935, + "hfopenllm_v2/MUSR": 0.5567, + "hfopenllm_v2/MMLU-PRO": 0.4601 + } + }, + { + "id": "Triangle104/DS-R1-Distill-Q2.5-7B-RP", + "name": "DS-R1-Distill-Q2.5-7B-RP", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3445, + "hfopenllm_v2/BBH": 0.4383, + "hfopenllm_v2/MATH Level 5": 0.4683, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.403, + "hfopenllm_v2/MMLU-PRO": 0.2891 + } + }, + { + "id": "Triangle104/DS-R1-Llama-8B-Harmony", + "name": "DS-R1-Llama-8B-Harmony", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3566, + "hfopenllm_v2/BBH": 0.4154, + "hfopenllm_v2/MATH Level 5": 0.4282, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3762, + "hfopenllm_v2/MMLU-PRO": 0.2744 + } + }, + { + "id": "Triangle104/DSR1-Distill-Llama-Lit-8B", + "name": "DSR1-Distill-Llama-Lit-8B", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1885, + "hfopenllm_v2/BBH": 0.4284, + "hfopenllm_v2/MATH Level 5": 0.352, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.3535, + "hfopenllm_v2/MMLU-PRO": 0.2798 + } + }, + { + "id": "Triangle104/DSR1-Distill-Qwen-7B-RP", + "name": "DSR1-Distill-Qwen-7B-RP", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3609, + "hfopenllm_v2/BBH": 0.4326, + "hfopenllm_v2/MATH Level 5": 0.4804, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4045, + "hfopenllm_v2/MMLU-PRO": 0.3028 + } + }, + { + "id": "Triangle104/Dark-Chivalry_V1.0", + "name": "Dark-Chivalry_V1.0", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4326, + "hfopenllm_v2/BBH": 0.4974, + "hfopenllm_v2/MATH Level 5": 0.1314, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4182, + "hfopenllm_v2/MMLU-PRO": 0.3444 + } + }, + { + "id": "Triangle104/Distilled-DarkPlanet-Allades-8B", + "name": "Distilled-DarkPlanet-Allades-8B", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.346, + "hfopenllm_v2/BBH": 0.4634, + "hfopenllm_v2/MATH Level 5": 0.4003, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.3538, + "hfopenllm_v2/MMLU-PRO": 0.2901 + } + }, + { + "id": "Triangle104/Distilled-DarkPlanet-Allades-8B_TIES", + "name": "Distilled-DarkPlanet-Allades-8B_TIES", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3892, + "hfopenllm_v2/BBH": 0.5042, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.3868, + "hfopenllm_v2/MMLU-PRO": 0.3401 + } + }, + { + "id": "Triangle104/Distilled-Whiskey-8b", + "name": "Distilled-Whiskey-8b", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3448, + "hfopenllm_v2/BBH": 0.5028, + "hfopenllm_v2/MATH Level 5": 0.2545, + "hfopenllm_v2/GPQA": 0.3314, + "hfopenllm_v2/MUSR": 0.4172, + "hfopenllm_v2/MMLU-PRO": 0.3367 + } + }, + { + "id": "Triangle104/Dolphin3-Llama3.2-Smart", + "name": "Dolphin3-Llama3.2-Smart", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4137, + "hfopenllm_v2/BBH": 0.3975, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3922, + "hfopenllm_v2/MMLU-PRO": 0.2195 + } + }, + { + "id": "Triangle104/Gemmadevi-Stock-10B", + "name": "Gemmadevi-Stock-10B", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1582, + "hfopenllm_v2/BBH": 0.6066, + "hfopenllm_v2/MATH Level 5": 0.0967, + "hfopenllm_v2/GPQA": 0.3532, + "hfopenllm_v2/MUSR": 0.4621, + "hfopenllm_v2/MMLU-PRO": 0.4262 + } + }, + { + "id": "Triangle104/Hermes-Llama-3.2-CoT", + "name": "Hermes-Llama-3.2-CoT", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4178, + "hfopenllm_v2/BBH": 0.4616, + "hfopenllm_v2/MATH Level 5": 0.0952, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3698, + "hfopenllm_v2/MMLU-PRO": 0.2947 + } + }, + { + "id": "Triangle104/Hermes-Llama-3.2-CoT-Summary", + "name": "Hermes-Llama-3.2-CoT-Summary", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.483, + "hfopenllm_v2/BBH": 0.42, + "hfopenllm_v2/MATH Level 5": 0.0831, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3575, + "hfopenllm_v2/MMLU-PRO": 0.2901 + } + }, + { + "id": "Triangle104/Hermes3-L3.1-DirtyHarry-8B", + "name": "Hermes3-L3.1-DirtyHarry-8B", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3242, + "hfopenllm_v2/BBH": 0.5066, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4069, + "hfopenllm_v2/MMLU-PRO": 0.3339 + } + }, + { + "id": "Triangle104/Herodotos-14B", + "name": "Herodotos-14B", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4667, + "hfopenllm_v2/BBH": 0.6435, + "hfopenllm_v2/MATH Level 5": 0.5045, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.4795, + "hfopenllm_v2/MMLU-PRO": 0.529 + } + }, + { + "id": "Triangle104/Herodotos-14B_V0.1", + "name": "Herodotos-14B_V0.1", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1879, + "hfopenllm_v2/BBH": 0.3017, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.224, + "hfopenllm_v2/MUSR": 0.3684, + "hfopenllm_v2/MMLU-PRO": 0.1164 + } + }, + { + "id": "Triangle104/L3.1-8B-Dusky-Ink", + "name": "L3.1-8B-Dusky-Ink", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.453, + "hfopenllm_v2/BBH": 0.5098, + "hfopenllm_v2/MATH Level 5": 0.1231, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4224, + "hfopenllm_v2/MMLU-PRO": 0.3683 + } + }, + { + "id": "Triangle104/L3.1-8B-Dusky-Ink_v0.r1", + "name": "L3.1-8B-Dusky-Ink_v0.r1", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1985, + "hfopenllm_v2/BBH": 0.4337, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.3988, + "hfopenllm_v2/MMLU-PRO": 0.3206 + } + }, + { + "id": "Triangle104/LThreePointOne-8B-HermesBlackroot", + "name": "LThreePointOne-8B-HermesBlackroot", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1792, + "hfopenllm_v2/BBH": 0.4998, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.3586, + "hfopenllm_v2/MMLU-PRO": 0.3285 + } + }, + { + "id": "Triangle104/LThreePointOne-8B-HermesInk", + "name": "LThreePointOne-8B-HermesInk", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4031, + "hfopenllm_v2/BBH": 0.5223, + "hfopenllm_v2/MATH Level 5": 0.1722, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4129, + "hfopenllm_v2/MMLU-PRO": 0.3467 + } + }, + { + "id": "Triangle104/Llama3.1-Allades-Lit-8b", + "name": "Llama3.1-Allades-Lit-8b", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2461, + "hfopenllm_v2/BBH": 0.4183, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.3708, + "hfopenllm_v2/MMLU-PRO": 0.2724 + } + }, + { + "id": "Triangle104/Llama3.1-cc-Lit-8b", + "name": "Llama3.1-cc-Lit-8b", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2993, + "hfopenllm_v2/BBH": 0.3848, + "hfopenllm_v2/MATH Level 5": 0.003, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3854, + "hfopenllm_v2/MMLU-PRO": 0.3004 + } + }, + { + "id": "Triangle104/Minerva-1.5b", + "name": "Minerva-1.5b", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2694, + "hfopenllm_v2/BBH": 0.4026, + "hfopenllm_v2/MATH Level 5": 0.1027, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.3655, + "hfopenllm_v2/MMLU-PRO": 0.2698 + } + }, + { + "id": "Triangle104/Minerva-1.5b_V0.2", + "name": "Minerva-1.5b_V0.2", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3083, + "hfopenllm_v2/BBH": 0.3989, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.396, + "hfopenllm_v2/MMLU-PRO": 0.2911 + } + }, + { + "id": "Triangle104/Minerva-10b", + "name": "Minerva-10b", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1879, + "hfopenllm_v2/BBH": 0.4462, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3627, + "hfopenllm_v2/MMLU-PRO": 0.2318 + } + }, + { + "id": "Triangle104/Minerva-14b", + "name": "Minerva-14b", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3468, + "hfopenllm_v2/BBH": 0.6301, + "hfopenllm_v2/MATH Level 5": 0.3051, + "hfopenllm_v2/GPQA": 0.3742, + "hfopenllm_v2/MUSR": 0.4766, + "hfopenllm_v2/MMLU-PRO": 0.5194 + } + }, + { + "id": "Triangle104/Minerva-14b-V0.1", + "name": "Minerva-14b-V0.1", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0861, + "hfopenllm_v2/BBH": 0.609, + "hfopenllm_v2/MATH Level 5": 0.3051, + "hfopenllm_v2/GPQA": 0.3658, + "hfopenllm_v2/MUSR": 0.47, + "hfopenllm_v2/MMLU-PRO": 0.5118 + } + }, + { + "id": "Triangle104/Minerva-7b", + "name": "Minerva-7b", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3724, + "hfopenllm_v2/BBH": 0.5498, + "hfopenllm_v2/MATH Level 5": 0.284, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4143, + "hfopenllm_v2/MMLU-PRO": 0.4444 + } + }, + { + "id": "Triangle104/Minerva-8b", + "name": "Minerva-8b", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1721, + "hfopenllm_v2/BBH": 0.4669, + "hfopenllm_v2/MATH Level 5": 0.0045, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4273, + "hfopenllm_v2/MMLU-PRO": 0.3089 + } + }, + { + "id": "Triangle104/Mistral-Redemption-Arc", + "name": "Mistral-Redemption-Arc", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4029, + "hfopenllm_v2/BBH": 0.6255, + "hfopenllm_v2/MATH Level 5": 0.4101, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4595, + "hfopenllm_v2/MMLU-PRO": 0.451 + } + }, + { + "id": "Triangle104/Mistral-Small-24b-Harmony", + "name": "Mistral-Small-24b-Harmony", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1687, + "hfopenllm_v2/BBH": 0.6434, + "hfopenllm_v2/MATH Level 5": 0.1911, + "hfopenllm_v2/GPQA": 0.3842, + "hfopenllm_v2/MUSR": 0.4276, + "hfopenllm_v2/MMLU-PRO": 0.5431 + } + }, + { + "id": "Triangle104/Pans_Gutenbergum_V0.1", + "name": "Pans_Gutenbergum_V0.1", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3097, + "hfopenllm_v2/BBH": 0.5541, + "hfopenllm_v2/MATH Level 5": 0.1057, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4528, + "hfopenllm_v2/MMLU-PRO": 0.3697 + } + }, + { + "id": "Triangle104/Pans_Gutenbergum_V0.2", + "name": "Pans_Gutenbergum_V0.2", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3215, + "hfopenllm_v2/BBH": 0.5526, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4673, + "hfopenllm_v2/MMLU-PRO": 0.3585 + } + }, + { + "id": "Triangle104/Pantheon_ChatWaifu_V0.2", + "name": "Pantheon_ChatWaifu_V0.2", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2683, + "hfopenllm_v2/BBH": 0.5532, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.4755, + "hfopenllm_v2/MMLU-PRO": 0.3442 + } + }, + { + "id": "Triangle104/Phi-4-AbliteratedRP", + "name": "Phi-4-AbliteratedRP", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4923, + "hfopenllm_v2/BBH": 0.6709, + "hfopenllm_v2/MATH Level 5": 0.3074, + "hfopenllm_v2/GPQA": 0.3951, + "hfopenllm_v2/MUSR": 0.5098, + "hfopenllm_v2/MMLU-PRO": 0.5308 + } + }, + { + "id": "Triangle104/Phi4-RP-o1", + "name": "Phi4-RP-o1", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.022, + "hfopenllm_v2/BBH": 0.6653, + "hfopenllm_v2/MATH Level 5": 0.3776, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.4756, + "hfopenllm_v2/MMLU-PRO": 0.5111 + } + }, + { + "id": "Triangle104/Phi4-RP-o1-Ablit", + "name": "Phi4-RP-o1-Ablit", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0239, + "hfopenllm_v2/BBH": 0.663, + "hfopenllm_v2/MATH Level 5": 0.3882, + "hfopenllm_v2/GPQA": 0.3633, + "hfopenllm_v2/MUSR": 0.4754, + "hfopenllm_v2/MMLU-PRO": 0.5105 + } + }, + { + "id": "Triangle104/Porpoise-R1-Llama3.2-3b", + "name": "Porpoise-R1-Llama3.2-3b", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4352, + "hfopenllm_v2/BBH": 0.3824, + "hfopenllm_v2/MATH Level 5": 0.0423, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3576, + "hfopenllm_v2/MMLU-PRO": 0.2117 + } + }, + { + "id": "Triangle104/Q2.5-14B-Instruct-1M-Harmony", + "name": "Q2.5-14B-Instruct-1M-Harmony", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5986, + "hfopenllm_v2/BBH": 0.6339, + "hfopenllm_v2/MATH Level 5": 0.3769, + "hfopenllm_v2/GPQA": 0.375, + "hfopenllm_v2/MUSR": 0.4795, + "hfopenllm_v2/MMLU-PRO": 0.5075 + } + }, + { + "id": "Triangle104/Q2.5-AthensCOT", + "name": "Q2.5-AthensCOT", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4573, + "hfopenllm_v2/BBH": 0.5542, + "hfopenllm_v2/MATH Level 5": 0.2915, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4578, + "hfopenllm_v2/MMLU-PRO": 0.4379 + } + }, + { + "id": "Triangle104/Q2.5-CodeR1-3B", + "name": "Q2.5-CodeR1-3B", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3588, + "hfopenllm_v2/BBH": 0.4661, + "hfopenllm_v2/MATH Level 5": 0.1639, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4315, + "hfopenllm_v2/MMLU-PRO": 0.2979 + } + }, + { + "id": "Triangle104/Q2.5-EVACOT-7b", + "name": "Q2.5-EVACOT-7b", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5784, + "hfopenllm_v2/BBH": 0.5506, + "hfopenllm_v2/MATH Level 5": 0.2825, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.4499, + "hfopenllm_v2/MMLU-PRO": 0.4331 + } + }, + { + "id": "Triangle104/Q2.5-EvaHumane-RP", + "name": "Q2.5-EvaHumane-RP", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3676, + "hfopenllm_v2/BBH": 0.5328, + "hfopenllm_v2/MATH Level 5": 0.2923, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4276, + "hfopenllm_v2/MMLU-PRO": 0.4412 + } + }, + { + "id": "Triangle104/Q2.5-Humane-RP", + "name": "Q2.5-Humane-RP", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4412, + "hfopenllm_v2/BBH": 0.5649, + "hfopenllm_v2/MATH Level 5": 0.3391, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4528, + "hfopenllm_v2/MMLU-PRO": 0.4492 + } + }, + { + "id": "Triangle104/Q2.5-Instruct-1M_Harmony", + "name": "Q2.5-Instruct-1M_Harmony", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6038, + "hfopenllm_v2/BBH": 0.5373, + "hfopenllm_v2/MATH Level 5": 0.3323, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4688, + "hfopenllm_v2/MMLU-PRO": 0.4366 + } + }, + { + "id": "Triangle104/Q2.5-R1-3B", + "name": "Q2.5-R1-3B", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4214, + "hfopenllm_v2/BBH": 0.4812, + "hfopenllm_v2/MATH Level 5": 0.2674, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.432, + "hfopenllm_v2/MMLU-PRO": 0.3813 + } + }, + { + "id": "Triangle104/Q2.5-R1-7B", + "name": "Q2.5-R1-7B", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1346, + "hfopenllm_v2/BBH": 0.3007, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3607, + "hfopenllm_v2/MMLU-PRO": 0.118 + } + }, + { + "id": "Triangle104/Robo-Gutenberg_V1.0", + "name": "Robo-Gutenberg_V1.0", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6008, + "hfopenllm_v2/BBH": 0.6537, + "hfopenllm_v2/MATH Level 5": 0.4562, + "hfopenllm_v2/GPQA": 0.3859, + "hfopenllm_v2/MUSR": 0.4744, + "hfopenllm_v2/MMLU-PRO": 0.5391 + } + }, + { + "id": "Triangle104/Rocinante-Prism_V2.0", + "name": "Rocinante-Prism_V2.0", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2616, + "hfopenllm_v2/BBH": 0.5361, + "hfopenllm_v2/MATH Level 5": 0.111, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.445, + "hfopenllm_v2/MMLU-PRO": 0.364 + } + }, + { + "id": "Triangle104/Rocinante-Prism_V2.1", + "name": "Rocinante-Prism_V2.1", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2558, + "hfopenllm_v2/BBH": 0.5333, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.449, + "hfopenllm_v2/MMLU-PRO": 0.3651 + } + }, + { + "id": "Triangle104/RomboHermes3-R1-Llama3.2-3b", + "name": "RomboHermes3-R1-Llama3.2-3b", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3007, + "hfopenllm_v2/BBH": 0.4264, + "hfopenllm_v2/MATH Level 5": 0.0816, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3657, + "hfopenllm_v2/MMLU-PRO": 0.2957 + } + }, + { + "id": "Triangle104/Rombos-Novasky-7B_V1c", + "name": "Rombos-Novasky-7B_V1c", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.408, + "hfopenllm_v2/BBH": 0.4349, + "hfopenllm_v2/MATH Level 5": 0.0853, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4465, + "hfopenllm_v2/MMLU-PRO": 0.2738 + } + }, + { + "id": "Triangle104/Set-70b", + "name": "Set-70b", + "developer": "Triangle104", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7643, + "hfopenllm_v2/BBH": 0.7014, + "hfopenllm_v2/MATH Level 5": 0.364, + "hfopenllm_v2/GPQA": 0.4463, + "hfopenllm_v2/MUSR": 0.4696, + "hfopenllm_v2/MMLU-PRO": 0.5442 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Tsunami-th.json b/data/developers/Tsunami-th.json new file mode 100644 index 0000000000000000000000000000000000000000..64c5b78b88f5bfca2f52ae7bd367015c771d5caf --- /dev/null +++ b/data/developers/Tsunami-th.json @@ -0,0 +1,61 @@ +{ + "developer": "Tsunami-th", + "models": [ + { + "id": "Tsunami-th/Tsunami-0.5-7B-Instruct", + "name": "Tsunami-0.5-7B-Instruct", + "developer": "Tsunami-th", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.74, + "hfopenllm_v2/BBH": 0.5524, + "hfopenllm_v2/MATH Level 5": 0.5045, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4257, + "hfopenllm_v2/MMLU-PRO": 0.4413 + } + }, + { + "id": "Tsunami-th/Tsunami-0.5x-7B-Instruct", + "name": "Tsunami-0.5x-7B-Instruct", + "developer": "Tsunami-th", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7099, + "hfopenllm_v2/BBH": 0.5593, + "hfopenllm_v2/MATH Level 5": 0.4207, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4667, + "hfopenllm_v2/MMLU-PRO": 0.4458 + } + }, + { + "id": "Tsunami-th/Tsunami-1.0-14B-Instruct", + "name": "Tsunami-1.0-14B-Instruct", + "developer": "Tsunami-th", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7829, + "hfopenllm_v2/BBH": 0.6439, + "hfopenllm_v2/MATH Level 5": 0.4585, + "hfopenllm_v2/GPQA": 0.3565, + "hfopenllm_v2/MUSR": 0.4459, + "hfopenllm_v2/MMLU-PRO": 0.5249 + } + }, + { + "id": "Tsunami-th/Tsunami-1.0-7B-Instruct", + "name": "Tsunami-1.0-7B-Instruct", + "developer": "Tsunami-th", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7309, + "hfopenllm_v2/BBH": 0.5491, + "hfopenllm_v2/MATH Level 5": 0.4335, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4493, + "hfopenllm_v2/MMLU-PRO": 0.4424 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/UCLA-AGI.json b/data/developers/UCLA-AGI.json new file mode 100644 index 0000000000000000000000000000000000000000..75dd11b6fe190b43556ce10dc96eff4392988245 --- /dev/null +++ b/data/developers/UCLA-AGI.json @@ -0,0 +1,145 @@ +{ + "developer": "UCLA-AGI", + "models": [ + { + "id": "UCLA-AGI/Gemma-2-9B-It-SPPO-Iter1", + "name": "Gemma-2-9B-It-SPPO-Iter1", + "developer": "UCLA-AGI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3082, + "hfopenllm_v2/BBH": 0.5969, + "hfopenllm_v2/MATH Level 5": 0.0899, + "hfopenllm_v2/GPQA": 0.3364, + "hfopenllm_v2/MUSR": 0.4099, + "hfopenllm_v2/MMLU-PRO": 0.3907 + } + }, + { + "id": "UCLA-AGI/Gemma-2-9B-It-SPPO-Iter2", + "name": "Gemma-2-9B-It-SPPO-Iter2", + "developer": "UCLA-AGI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.31, + "hfopenllm_v2/BBH": 0.599, + "hfopenllm_v2/MATH Level 5": 0.0808, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4139, + "hfopenllm_v2/MMLU-PRO": 0.387 + } + }, + { + "id": "UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3", + "name": "Gemma-2-9B-It-SPPO-Iter3", + "developer": "UCLA-AGI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3167, + "hfopenllm_v2/BBH": 0.6007, + "hfopenllm_v2/MATH Level 5": 0.071, + "hfopenllm_v2/GPQA": 0.3389, + "hfopenllm_v2/MUSR": 0.4166, + "hfopenllm_v2/MMLU-PRO": 0.3826 + } + }, + { + "id": "UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter1", + "name": "Llama-3-Instruct-8B-SPPO-Iter1", + "developer": "UCLA-AGI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7299, + "hfopenllm_v2/BBH": 0.5058, + "hfopenllm_v2/MATH Level 5": 0.1148, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3568, + "hfopenllm_v2/MMLU-PRO": 0.3711 + } + }, + { + "id": "UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter2", + "name": "Llama-3-Instruct-8B-SPPO-Iter2", + "developer": "UCLA-AGI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6989, + "hfopenllm_v2/BBH": 0.5089, + "hfopenllm_v2/MATH Level 5": 0.1035, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3594, + "hfopenllm_v2/MMLU-PRO": 0.3692 + } + }, + { + "id": "UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3", + "name": "Llama-3-Instruct-8B-SPPO-Iter3", + "developer": "UCLA-AGI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6834, + "hfopenllm_v2/BBH": 0.508, + "hfopenllm_v2/MATH Level 5": 0.0959, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3661, + "hfopenllm_v2/MMLU-PRO": 0.3644 + } + }, + { + "id": "UCLA-AGI/Mistral7B-PairRM-SPPO", + "name": "Mistral7B-PairRM-SPPO", + "developer": "UCLA-AGI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4355, + "hfopenllm_v2/BBH": 0.4439, + "hfopenllm_v2/MATH Level 5": 0.031, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3965, + "hfopenllm_v2/MMLU-PRO": 0.2621 + } + }, + { + "id": "UCLA-AGI/Mistral7B-PairRM-SPPO-Iter1", + "name": "Mistral7B-PairRM-SPPO-Iter1", + "developer": "UCLA-AGI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5047, + "hfopenllm_v2/BBH": 0.4468, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3992, + "hfopenllm_v2/MMLU-PRO": 0.2695 + } + }, + { + "id": "UCLA-AGI/Mistral7B-PairRM-SPPO-Iter2", + "name": "Mistral7B-PairRM-SPPO-Iter2", + "developer": "UCLA-AGI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4446, + "hfopenllm_v2/BBH": 0.4466, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4085, + "hfopenllm_v2/MMLU-PRO": 0.2677 + } + }, + { + "id": "UCLA-AGI/Mistral7B-PairRM-SPPO-Iter3", + "name": "Mistral7B-PairRM-SPPO-Iter3", + "developer": "UCLA-AGI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4351, + "hfopenllm_v2/BBH": 0.4397, + "hfopenllm_v2/MATH Level 5": 0.0234, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.4071, + "hfopenllm_v2/MMLU-PRO": 0.2658 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/UKzExecution.json b/data/developers/UKzExecution.json new file mode 100644 index 0000000000000000000000000000000000000000..4bcbd0d70ea224adc75713c79f642ffd360ec781 --- /dev/null +++ b/data/developers/UKzExecution.json @@ -0,0 +1,19 @@ +{ + "developer": "UKzExecution", + "models": [ + { + "id": "UKzExecution/LlamaExecutor-8B-3.0.5", + "name": "LlamaExecutor-8B-3.0.5", + "developer": "UKzExecution", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7403, + "hfopenllm_v2/BBH": 0.5006, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3754, + "hfopenllm_v2/MMLU-PRO": 0.3625 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Unbabel.json b/data/developers/Unbabel.json new file mode 100644 index 0000000000000000000000000000000000000000..5a60588fe344e5c7316edcb47d8d3aa47ea2f5c1 --- /dev/null +++ b/data/developers/Unbabel.json @@ -0,0 +1,19 @@ +{ + "developer": "Unbabel", + "models": [ + { + "id": "Unbabel/TowerInstruct-Mistral-7B-v0.2", + "name": "TowerInstruct-Mistral-7B-v0.2", + "developer": "Unbabel", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2843, + "hfopenllm_v2/BBH": 0.3882, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2475, + "hfopenllm_v2/MUSR": 0.4522, + "hfopenllm_v2/MMLU-PRO": 0.1968 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Undi95.json b/data/developers/Undi95.json new file mode 100644 index 0000000000000000000000000000000000000000..f448e36f91def4ad862286f0aed2b6c5faea3274 --- /dev/null +++ b/data/developers/Undi95.json @@ -0,0 +1,33 @@ +{ + "developer": "Undi95", + "models": [ + { + "id": "Undi95/MG-FinalMix-72B", + "name": "MG-FinalMix-72B", + "developer": "Undi95", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8014, + "hfopenllm_v2/BBH": 0.6973, + "hfopenllm_v2/MATH Level 5": 0.3973, + "hfopenllm_v2/GPQA": 0.3851, + "hfopenllm_v2/MUSR": 0.4823, + "hfopenllm_v2/MMLU-PRO": 0.5427 + } + }, + { + "id": "Undi95/Phi4-abliterated", + "name": "Phi4-abliterated", + "developer": "Undi95", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6618, + "hfopenllm_v2/BBH": 0.6809, + "hfopenllm_v2/MATH Level 5": 0.3701, + "hfopenllm_v2/GPQA": 0.3305, + "hfopenllm_v2/MUSR": 0.4034, + "hfopenllm_v2/MMLU-PRO": 0.5281 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/V3N0M.json b/data/developers/V3N0M.json new file mode 100644 index 0000000000000000000000000000000000000000..046d2dddb46255aa6e08144f75c10d7203668a12 --- /dev/null +++ b/data/developers/V3N0M.json @@ -0,0 +1,19 @@ +{ + "developer": "V3N0M", + "models": [ + { + "id": "V3N0M/Jenna-Tiny-2.0", + "name": "Jenna-Tiny-2.0", + "developer": "V3N0M", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2309, + "hfopenllm_v2/BBH": 0.3148, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3367, + "hfopenllm_v2/MMLU-PRO": 0.1147 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/VAGOsolutions.json b/data/developers/VAGOsolutions.json new file mode 100644 index 0000000000000000000000000000000000000000..ecbb33654002c8b680d7345058a43422d24c0ff6 --- /dev/null +++ b/data/developers/VAGOsolutions.json @@ -0,0 +1,243 @@ +{ + "developer": "VAGOsolutions", + "models": [ + { + "id": "VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct", + "name": "Llama-3-SauerkrautLM-70b-Instruct", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8045, + "hfopenllm_v2/BBH": 0.6663, + "hfopenllm_v2/MATH Level 5": 0.2281, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4339, + "hfopenllm_v2/MMLU-PRO": 0.5392 + } + }, + { + "id": "VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct", + "name": "Llama-3-SauerkrautLM-8b-Instruct", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7445, + "hfopenllm_v2/BBH": 0.4943, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4241, + "hfopenllm_v2/MMLU-PRO": 0.3857 + } + }, + { + "id": "VAGOsolutions/Llama-3.1-SauerkrautLM-70b-Instruct", + "name": "Llama-3.1-SauerkrautLM-70b-Instruct", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8656, + "hfopenllm_v2/BBH": 0.7006, + "hfopenllm_v2/MATH Level 5": 0.3693, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.4711, + "hfopenllm_v2/MMLU-PRO": 0.5335 + } + }, + { + "id": "VAGOsolutions/Llama-3.1-SauerkrautLM-8b-Instruct", + "name": "Llama-3.1-SauerkrautLM-8b-Instruct", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8017, + "hfopenllm_v2/BBH": 0.5115, + "hfopenllm_v2/MATH Level 5": 0.1941, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4148, + "hfopenllm_v2/MMLU-PRO": 0.389 + } + }, + { + "id": "VAGOsolutions/SauerkrautLM-1.5b", + "name": "SauerkrautLM-1.5b", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2404, + "hfopenllm_v2/BBH": 0.3704, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3739, + "hfopenllm_v2/MMLU-PRO": 0.2151 + } + }, + { + "id": "VAGOsolutions/SauerkrautLM-7b-HerO", + "name": "SauerkrautLM-7b-HerO", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5346, + "hfopenllm_v2/BBH": 0.4904, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3924, + "hfopenllm_v2/MMLU-PRO": 0.3046 + } + }, + { + "id": "VAGOsolutions/SauerkrautLM-7b-LaserChat", + "name": "SauerkrautLM-7b-LaserChat", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5988, + "hfopenllm_v2/BBH": 0.4543, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4148, + "hfopenllm_v2/MMLU-PRO": 0.3305 + } + }, + { + "id": "VAGOsolutions/SauerkrautLM-Gemma-2b", + "name": "SauerkrautLM-Gemma-2b", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2475, + "hfopenllm_v2/BBH": 0.3416, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3676, + "hfopenllm_v2/MMLU-PRO": 0.1469 + } + }, + { + "id": "VAGOsolutions/SauerkrautLM-Gemma-7b", + "name": "SauerkrautLM-Gemma-7b", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3407, + "hfopenllm_v2/BBH": 0.4188, + "hfopenllm_v2/MATH Level 5": 0.0672, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.3594, + "hfopenllm_v2/MMLU-PRO": 0.2961 + } + }, + { + "id": "VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct", + "name": "SauerkrautLM-Mixtral-8x7B-Instruct", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5602, + "hfopenllm_v2/BBH": 0.5277, + "hfopenllm_v2/MATH Level 5": 0.0982, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4204, + "hfopenllm_v2/MMLU-PRO": 0.365 + } + }, + { + "id": "VAGOsolutions/SauerkrautLM-Nemo-12b-Instruct", + "name": "SauerkrautLM-Nemo-12b-Instruct", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6113, + "hfopenllm_v2/BBH": 0.5214, + "hfopenllm_v2/MATH Level 5": 0.1224, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4469, + "hfopenllm_v2/MMLU-PRO": 0.3385 + } + }, + { + "id": "VAGOsolutions/SauerkrautLM-Phi-3-medium", + "name": "SauerkrautLM-Phi-3-medium", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4409, + "hfopenllm_v2/BBH": 0.6433, + "hfopenllm_v2/MATH Level 5": 0.1601, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4845, + "hfopenllm_v2/MMLU-PRO": 0.4665 + } + }, + { + "id": "VAGOsolutions/SauerkrautLM-SOLAR-Instruct", + "name": "SauerkrautLM-SOLAR-Instruct", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4917, + "hfopenllm_v2/BBH": 0.5169, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.3965, + "hfopenllm_v2/MMLU-PRO": 0.3183 + } + }, + { + "id": "VAGOsolutions/SauerkrautLM-gemma-2-2b-it", + "name": "SauerkrautLM-gemma-2-2b-it", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1321, + "hfopenllm_v2/BBH": 0.4241, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3995, + "hfopenllm_v2/MMLU-PRO": 0.2693 + } + }, + { + "id": "VAGOsolutions/SauerkrautLM-gemma-2-9b-it", + "name": "SauerkrautLM-gemma-2-9b-it", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3024, + "hfopenllm_v2/BBH": 0.6073, + "hfopenllm_v2/MATH Level 5": 0.0838, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.4318, + "hfopenllm_v2/MMLU-PRO": 0.4091 + } + }, + { + "id": "VAGOsolutions/SauerkrautLM-v2-14b-DPO", + "name": "SauerkrautLM-v2-14b-DPO", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7412, + "hfopenllm_v2/BBH": 0.656, + "hfopenllm_v2/MATH Level 5": 0.3165, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4375, + "hfopenllm_v2/MMLU-PRO": 0.5117 + } + }, + { + "id": "VAGOsolutions/SauerkrautLM-v2-14b-SFT", + "name": "SauerkrautLM-v2-14b-SFT", + "developer": "VAGOsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6949, + "hfopenllm_v2/BBH": 0.621, + "hfopenllm_v2/MATH Level 5": 0.3285, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.4179, + "hfopenllm_v2/MMLU-PRO": 0.5205 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/VIRNECT.json b/data/developers/VIRNECT.json new file mode 100644 index 0000000000000000000000000000000000000000..c4d108ef590950a6696d9775575feeb937623918 --- /dev/null +++ b/data/developers/VIRNECT.json @@ -0,0 +1,33 @@ +{ + "developer": "VIRNECT", + "models": [ + { + "id": "VIRNECT/llama-3-Korean-8B", + "name": "llama-3-Korean-8B", + "developer": "VIRNECT", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5058, + "hfopenllm_v2/BBH": 0.4908, + "hfopenllm_v2/MATH Level 5": 0.0929, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3662, + "hfopenllm_v2/MMLU-PRO": 0.3539 + } + }, + { + "id": "VIRNECT/llama-3-Korean-8B-r-v-0.1", + "name": "llama-3-Korean-8B-r-v-0.1", + "developer": "VIRNECT", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4916, + "hfopenllm_v2/BBH": 0.4806, + "hfopenllm_v2/MATH Level 5": 0.0861, + "hfopenllm_v2/GPQA": 0.2424, + "hfopenllm_v2/MUSR": 0.3675, + "hfopenllm_v2/MMLU-PRO": 0.326 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ValiantLabs.json b/data/developers/ValiantLabs.json new file mode 100644 index 0000000000000000000000000000000000000000..892d28b9bdd00084b849ccf11411be9ae5f19a1e --- /dev/null +++ b/data/developers/ValiantLabs.json @@ -0,0 +1,159 @@ +{ + "developer": "ValiantLabs", + "models": [ + { + "id": "ValiantLabs/Llama3-70B-Fireplace", + "name": "Llama3-70B-Fireplace", + "developer": "ValiantLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7774, + "hfopenllm_v2/BBH": 0.6489, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.3549, + "hfopenllm_v2/MUSR": 0.4449, + "hfopenllm_v2/MMLU-PRO": 0.4893 + } + }, + { + "id": "ValiantLabs/Llama3-70B-ShiningValiant2", + "name": "Llama3-70B-ShiningValiant2", + "developer": "ValiantLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6122, + "hfopenllm_v2/BBH": 0.6338, + "hfopenllm_v2/MATH Level 5": 0.2077, + "hfopenllm_v2/GPQA": 0.3305, + "hfopenllm_v2/MUSR": 0.4326, + "hfopenllm_v2/MMLU-PRO": 0.4898 + } + }, + { + "id": "ValiantLabs/Llama3.1-70B-ShiningValiant2", + "name": "Llama3.1-70B-ShiningValiant2", + "developer": "ValiantLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5355, + "hfopenllm_v2/BBH": 0.6738, + "hfopenllm_v2/MATH Level 5": 0.2915, + "hfopenllm_v2/GPQA": 0.3926, + "hfopenllm_v2/MUSR": 0.4681, + "hfopenllm_v2/MMLU-PRO": 0.5173 + } + }, + { + "id": "ValiantLabs/Llama3.1-8B-Cobalt", + "name": "Llama3.1-8B-Cobalt", + "developer": "ValiantLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3496, + "hfopenllm_v2/BBH": 0.4947, + "hfopenllm_v2/MATH Level 5": 0.1269, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.3959, + "hfopenllm_v2/MMLU-PRO": 0.3644 + } + }, + { + "id": "ValiantLabs/Llama3.1-8B-Enigma", + "name": "Llama3.1-8B-Enigma", + "developer": "ValiantLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2681, + "hfopenllm_v2/BBH": 0.4478, + "hfopenllm_v2/MATH Level 5": 0.0891, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.4196, + "hfopenllm_v2/MMLU-PRO": 0.3409 + } + }, + { + "id": "ValiantLabs/Llama3.1-8B-Esper2", + "name": "Llama3.1-8B-Esper2", + "developer": "ValiantLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2567, + "hfopenllm_v2/BBH": 0.447, + "hfopenllm_v2/MATH Level 5": 0.0589, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3561, + "hfopenllm_v2/MMLU-PRO": 0.2904 + } + }, + { + "id": "ValiantLabs/Llama3.1-8B-Fireplace2", + "name": "Llama3.1-8B-Fireplace2", + "developer": "ValiantLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5328, + "hfopenllm_v2/BBH": 0.4613, + "hfopenllm_v2/MATH Level 5": 0.0876, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3367, + "hfopenllm_v2/MMLU-PRO": 0.2424 + } + }, + { + "id": "ValiantLabs/Llama3.1-8B-ShiningValiant2", + "name": "Llama3.1-8B-ShiningValiant2", + "developer": "ValiantLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6496, + "hfopenllm_v2/BBH": 0.4774, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.3909, + "hfopenllm_v2/MMLU-PRO": 0.3382 + } + }, + { + "id": "ValiantLabs/Llama3.2-3B-Enigma", + "name": "Llama3.2-3B-Enigma", + "developer": "ValiantLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2786, + "hfopenllm_v2/BBH": 0.3723, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3921, + "hfopenllm_v2/MMLU-PRO": 0.2428 + } + }, + { + "id": "ValiantLabs/Llama3.2-3B-Esper2", + "name": "Llama3.2-3B-Esper2", + "developer": "ValiantLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.275, + "hfopenllm_v2/BBH": 0.3808, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.355, + "hfopenllm_v2/MMLU-PRO": 0.2257 + } + }, + { + "id": "ValiantLabs/Llama3.2-3B-ShiningValiant2", + "name": "Llama3.2-3B-ShiningValiant2", + "developer": "ValiantLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2625, + "hfopenllm_v2/BBH": 0.4226, + "hfopenllm_v2/MATH Level 5": 0.0823, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3866, + "hfopenllm_v2/MMLU-PRO": 0.2829 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Vikhrmodels.json b/data/developers/Vikhrmodels.json new file mode 100644 index 0000000000000000000000000000000000000000..5cc67a891b263df5a424866528225e042c9fa782 --- /dev/null +++ b/data/developers/Vikhrmodels.json @@ -0,0 +1,33 @@ +{ + "developer": "Vikhrmodels", + "models": [ + { + "id": "Vikhrmodels/Vikhr-Llama3.1-8B-Instruct-R-21-09-24", + "name": "Vikhr-Llama3.1-8B-Instruct-R-21-09-24", + "developer": "Vikhrmodels", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6431, + "hfopenllm_v2/BBH": 0.5272, + "hfopenllm_v2/MATH Level 5": 0.2175, + "hfopenllm_v2/GPQA": 0.245, + "hfopenllm_v2/MUSR": 0.3754, + "hfopenllm_v2/MMLU-PRO": 0.3547 + } + }, + { + "id": "Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24", + "name": "Vikhr-Nemo-12B-Instruct-R-21-09-24", + "developer": "Vikhrmodels", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5999, + "hfopenllm_v2/BBH": 0.5212, + "hfopenllm_v2/MATH Level 5": 0.1715, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4073, + "hfopenllm_v2/MMLU-PRO": 0.3398 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Weyaxi.json b/data/developers/Weyaxi.json new file mode 100644 index 0000000000000000000000000000000000000000..9d6c12fcc47862825f53e1e4b1c81ff9b3189f3e --- /dev/null +++ b/data/developers/Weyaxi.json @@ -0,0 +1,117 @@ +{ + "developer": "Weyaxi", + "models": [ + { + "id": "Weyaxi/Bagel-Hermes-2x34B", + "name": "Bagel-Hermes-2x34B", + "developer": "Weyaxi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5432, + "hfopenllm_v2/BBH": 0.4917, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4517, + "hfopenllm_v2/MMLU-PRO": 0.4589 + } + }, + { + "id": "Weyaxi/Bagel-Hermes-34B-Slerp", + "name": "Bagel-Hermes-34B-Slerp", + "developer": "Weyaxi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4603, + "hfopenllm_v2/BBH": 0.5922, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4622, + "hfopenllm_v2/MMLU-PRO": 0.4703 + } + }, + { + "id": "Weyaxi/Einstein-v4-7B", + "name": "Einstein-v4-7B", + "developer": "Weyaxi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4708, + "hfopenllm_v2/BBH": 0.3849, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4682, + "hfopenllm_v2/MMLU-PRO": 0.2259 + } + }, + { + "id": "Weyaxi/Einstein-v6.1-Llama3-8B", + "name": "Einstein-v6.1-Llama3-8B", + "developer": "Weyaxi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4568, + "hfopenllm_v2/BBH": 0.5008, + "hfopenllm_v2/MATH Level 5": 0.068, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4213, + "hfopenllm_v2/MMLU-PRO": 0.3131 + } + }, + { + "id": "Weyaxi/Einstein-v6.1-developed-by-Weyaxi-Llama3-8B", + "name": "Einstein-v6.1-developed-by-Weyaxi-Llama3-8B", + "developer": "Weyaxi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3927, + "hfopenllm_v2/BBH": 0.5044, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.4332, + "hfopenllm_v2/MMLU-PRO": 0.3093 + } + }, + { + "id": "Weyaxi/Einstein-v7-Qwen2-7B", + "name": "Einstein-v7-Qwen2-7B", + "developer": "Weyaxi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.41, + "hfopenllm_v2/BBH": 0.5161, + "hfopenllm_v2/MATH Level 5": 0.1994, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.44, + "hfopenllm_v2/MMLU-PRO": 0.4096 + } + }, + { + "id": "Weyaxi/Einstein-v8-Llama3.2-1B", + "name": "Einstein-v8-Llama3.2-1B", + "developer": "Weyaxi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1862, + "hfopenllm_v2/BBH": 0.3018, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3618, + "hfopenllm_v2/MMLU-PRO": 0.1161 + } + }, + { + "id": "Weyaxi/SauerkrautLM-UNA-SOLAR-Instruct", + "name": "SauerkrautLM-UNA-SOLAR-Instruct", + "developer": "Weyaxi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4573, + "hfopenllm_v2/BBH": 0.5166, + "hfopenllm_v2/MATH Level 5": 0.0461, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.3979, + "hfopenllm_v2/MMLU-PRO": 0.3153 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/WizardLMTeam.json b/data/developers/WizardLMTeam.json new file mode 100644 index 0000000000000000000000000000000000000000..4c9fb2823b79284575bcc095b3d2c6b47611bd1c --- /dev/null +++ b/data/developers/WizardLMTeam.json @@ -0,0 +1,47 @@ +{ + "developer": "WizardLMTeam", + "models": [ + { + "id": "WizardLMTeam/WizardLM-13B-V1.0", + "name": "WizardLM-13B-V1.0", + "developer": "WizardLMTeam", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.185, + "hfopenllm_v2/BBH": 0.2913, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3497, + "hfopenllm_v2/MMLU-PRO": 0.1166 + } + }, + { + "id": "WizardLMTeam/WizardLM-13B-V1.2", + "name": "WizardLM-13B-V1.2", + "developer": "WizardLMTeam", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3392, + "hfopenllm_v2/BBH": 0.4462, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.4378, + "hfopenllm_v2/MMLU-PRO": 0.2519 + } + }, + { + "id": "WizardLMTeam/WizardLM-70B-V1.0", + "name": "WizardLM-70B-V1.0", + "developer": "WizardLMTeam", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4951, + "hfopenllm_v2/BBH": 0.559, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.4391, + "hfopenllm_v2/MMLU-PRO": 0.3447 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Wladastic.json b/data/developers/Wladastic.json new file mode 100644 index 0000000000000000000000000000000000000000..f0bb4a704305145cf952e72fb6f156140ab45ac9 --- /dev/null +++ b/data/developers/Wladastic.json @@ -0,0 +1,19 @@ +{ + "developer": "Wladastic", + "models": [ + { + "id": "Wladastic/Mini-Think-Base-1B", + "name": "Mini-Think-Base-1B", + "developer": "Wladastic", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5588, + "hfopenllm_v2/BBH": 0.3574, + "hfopenllm_v2/MATH Level 5": 0.0733, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3275, + "hfopenllm_v2/MMLU-PRO": 0.1772 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Xclbr7.json b/data/developers/Xclbr7.json new file mode 100644 index 0000000000000000000000000000000000000000..3d8100e2b07037301a9f7029a956d95ebc7d1aa0 --- /dev/null +++ b/data/developers/Xclbr7.json @@ -0,0 +1,61 @@ +{ + "developer": "Xclbr7", + "models": [ + { + "id": "Xclbr7/Arcanum-12b", + "name": "Arcanum-12b", + "developer": "Xclbr7", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2907, + "hfopenllm_v2/BBH": 0.5265, + "hfopenllm_v2/MATH Level 5": 0.1193, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.417, + "hfopenllm_v2/MMLU-PRO": 0.3586 + } + }, + { + "id": "Xclbr7/Hyena-12b", + "name": "Hyena-12b", + "developer": "Xclbr7", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3404, + "hfopenllm_v2/BBH": 0.5457, + "hfopenllm_v2/MATH Level 5": 0.1133, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.3984, + "hfopenllm_v2/MMLU-PRO": 0.3439 + } + }, + { + "id": "Xclbr7/caliburn-12b", + "name": "caliburn-12b", + "developer": "Xclbr7", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3576, + "hfopenllm_v2/BBH": 0.5519, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.3364, + "hfopenllm_v2/MUSR": 0.4292, + "hfopenllm_v2/MMLU-PRO": 0.3675 + } + }, + { + "id": "Xclbr7/caliburn-v2-12b", + "name": "caliburn-v2-12b", + "developer": "Xclbr7", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2967, + "hfopenllm_v2/BBH": 0.5141, + "hfopenllm_v2/MATH Level 5": 0.105, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.437, + "hfopenllm_v2/MMLU-PRO": 0.3784 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Xiaojian9992024.json b/data/developers/Xiaojian9992024.json new file mode 100644 index 0000000000000000000000000000000000000000..143c5d039208c97a6954b4619330d62b00e55a1a --- /dev/null +++ b/data/developers/Xiaojian9992024.json @@ -0,0 +1,173 @@ +{ + "developer": "Xiaojian9992024", + "models": [ + { + "id": "Xiaojian9992024/Llama3.2-1B-THREADRIPPER", + "name": "Llama3.2-1B-THREADRIPPER", + "developer": "Xiaojian9992024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5576, + "hfopenllm_v2/BBH": 0.3544, + "hfopenllm_v2/MATH Level 5": 0.074, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.313, + "hfopenllm_v2/MMLU-PRO": 0.1763 + } + }, + { + "id": "Xiaojian9992024/Llama3.2-1B-THREADRIPPER-v0.2", + "name": "Llama3.2-1B-THREADRIPPER-v0.2", + "developer": "Xiaojian9992024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5318, + "hfopenllm_v2/BBH": 0.3528, + "hfopenllm_v2/MATH Level 5": 0.0657, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3316, + "hfopenllm_v2/MMLU-PRO": 0.1745 + } + }, + { + "id": "Xiaojian9992024/Phi-4-Megatron-Empathetic", + "name": "Phi-4-Megatron-Empathetic", + "developer": "Xiaojian9992024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0173, + "hfopenllm_v2/BBH": 0.6673, + "hfopenllm_v2/MATH Level 5": 0.2696, + "hfopenllm_v2/GPQA": 0.3859, + "hfopenllm_v2/MUSR": 0.5071, + "hfopenllm_v2/MMLU-PRO": 0.5082 + } + }, + { + "id": "Xiaojian9992024/Phi-4-mini-UNOFFICAL", + "name": "Phi-4-mini-UNOFFICAL", + "developer": "Xiaojian9992024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1273, + "hfopenllm_v2/BBH": 0.2944, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2408, + "hfopenllm_v2/MUSR": 0.3368, + "hfopenllm_v2/MMLU-PRO": 0.1144 + } + }, + { + "id": "Xiaojian9992024/Qwen2.5-7B-MS-Destroyer", + "name": "Qwen2.5-7B-MS-Destroyer", + "developer": "Xiaojian9992024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7296, + "hfopenllm_v2/BBH": 0.547, + "hfopenllm_v2/MATH Level 5": 0.4592, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.427, + "hfopenllm_v2/MMLU-PRO": 0.4412 + } + }, + { + "id": "Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview", + "name": "Qwen2.5-Dyanka-7B-Preview", + "developer": "Xiaojian9992024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.764, + "hfopenllm_v2/BBH": 0.5543, + "hfopenllm_v2/MATH Level 5": 0.4879, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4481, + "hfopenllm_v2/MMLU-PRO": 0.4376 + } + }, + { + "id": "Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview-v0.2", + "name": "Qwen2.5-Dyanka-7B-Preview-v0.2", + "developer": "Xiaojian9992024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6702, + "hfopenllm_v2/BBH": 0.5374, + "hfopenllm_v2/MATH Level 5": 0.4721, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4467, + "hfopenllm_v2/MMLU-PRO": 0.4371 + } + }, + { + "id": "Xiaojian9992024/Qwen2.5-THREADRIPPER-Medium-Censored", + "name": "Qwen2.5-THREADRIPPER-Medium-Censored", + "developer": "Xiaojian9992024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8112, + "hfopenllm_v2/BBH": 0.6431, + "hfopenllm_v2/MATH Level 5": 0.534, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.414, + "hfopenllm_v2/MMLU-PRO": 0.4929 + } + }, + { + "id": "Xiaojian9992024/Qwen2.5-THREADRIPPER-Small", + "name": "Qwen2.5-THREADRIPPER-Small", + "developer": "Xiaojian9992024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7689, + "hfopenllm_v2/BBH": 0.549, + "hfopenllm_v2/MATH Level 5": 0.4736, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4349, + "hfopenllm_v2/MMLU-PRO": 0.4357 + } + }, + { + "id": "Xiaojian9992024/Qwen2.5-THREADRIPPER-Small-AnniversaryEdition", + "name": "Qwen2.5-THREADRIPPER-Small-AnniversaryEdition", + "developer": "Xiaojian9992024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7404, + "hfopenllm_v2/BBH": 0.5465, + "hfopenllm_v2/MATH Level 5": 0.5076, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3807, + "hfopenllm_v2/MMLU-PRO": 0.4393 + } + }, + { + "id": "Xiaojian9992024/Qwen2.5-Ultra-1.5B-25.02-Exp", + "name": "Qwen2.5-Ultra-1.5B-25.02-Exp", + "developer": "Xiaojian9992024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4073, + "hfopenllm_v2/BBH": 0.4066, + "hfopenllm_v2/MATH Level 5": 0.0831, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3383, + "hfopenllm_v2/MMLU-PRO": 0.2641 + } + }, + { + "id": "Xiaojian9992024/Reflection-L3.2-JametMiniMix-3B", + "name": "Reflection-L3.2-JametMiniMix-3B", + "developer": "Xiaojian9992024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4619, + "hfopenllm_v2/BBH": 0.439, + "hfopenllm_v2/MATH Level 5": 0.1193, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3667, + "hfopenllm_v2/MMLU-PRO": 0.2988 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Xkev.json b/data/developers/Xkev.json new file mode 100644 index 0000000000000000000000000000000000000000..5612b6593ae88e8cf3c29929039db59c9e95e540 --- /dev/null +++ b/data/developers/Xkev.json @@ -0,0 +1,19 @@ +{ + "developer": "Xkev", + "models": [ + { + "id": "Xkev/Llama-3.2V-11B-cot", + "name": "Llama-3.2V-11B-cot", + "developer": "Xkev", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4158, + "hfopenllm_v2/BBH": 0.4959, + "hfopenllm_v2/MATH Level 5": 0.1556, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4159, + "hfopenllm_v2/MMLU-PRO": 0.3587 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/YOYO-AI.json b/data/developers/YOYO-AI.json new file mode 100644 index 0000000000000000000000000000000000000000..eb2a1ce270d817efe822931d5e6cc6863bc0fc22 --- /dev/null +++ b/data/developers/YOYO-AI.json @@ -0,0 +1,299 @@ +{ + "developer": "YOYO-AI", + "models": [ + { + "id": "YOYO-AI/Qwen2.5-14B-1M-YOYO-V3", + "name": "Qwen2.5-14B-1M-YOYO-V3", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8398, + "hfopenllm_v2/BBH": 0.6448, + "hfopenllm_v2/MATH Level 5": 0.5355, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4141, + "hfopenllm_v2/MMLU-PRO": 0.5207 + } + }, + { + "id": "YOYO-AI/Qwen2.5-14B-YOYO-0505", + "name": "Qwen2.5-14B-YOYO-0505", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5883, + "hfopenllm_v2/BBH": 0.6539, + "hfopenllm_v2/MATH Level 5": 0.4434, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.4757, + "hfopenllm_v2/MMLU-PRO": 0.5371 + } + }, + { + "id": "YOYO-AI/Qwen2.5-14B-YOYO-0510-v2", + "name": "Qwen2.5-14B-YOYO-0510-v2", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5947, + "hfopenllm_v2/BBH": 0.6553, + "hfopenllm_v2/MATH Level 5": 0.4441, + "hfopenllm_v2/GPQA": 0.3817, + "hfopenllm_v2/MUSR": 0.4744, + "hfopenllm_v2/MMLU-PRO": 0.5381 + } + }, + { + "id": "YOYO-AI/Qwen2.5-14B-YOYO-0805", + "name": "Qwen2.5-14B-YOYO-0805", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5883, + "hfopenllm_v2/BBH": 0.6539, + "hfopenllm_v2/MATH Level 5": 0.4434, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.4757, + "hfopenllm_v2/MMLU-PRO": 0.5371 + } + }, + { + "id": "YOYO-AI/Qwen2.5-14B-YOYO-1005", + "name": "Qwen2.5-14B-YOYO-1005", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5972, + "hfopenllm_v2/BBH": 0.6542, + "hfopenllm_v2/MATH Level 5": 0.4524, + "hfopenllm_v2/GPQA": 0.3809, + "hfopenllm_v2/MUSR": 0.473, + "hfopenllm_v2/MMLU-PRO": 0.5382 + } + }, + { + "id": "YOYO-AI/Qwen2.5-14B-YOYO-1005-v2", + "name": "Qwen2.5-14B-YOYO-1005-v2", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5953, + "hfopenllm_v2/BBH": 0.6551, + "hfopenllm_v2/MATH Level 5": 0.4434, + "hfopenllm_v2/GPQA": 0.3842, + "hfopenllm_v2/MUSR": 0.4731, + "hfopenllm_v2/MMLU-PRO": 0.5372 + } + }, + { + "id": "YOYO-AI/Qwen2.5-14B-YOYO-1010", + "name": "Qwen2.5-14B-YOYO-1010", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5899, + "hfopenllm_v2/BBH": 0.654, + "hfopenllm_v2/MATH Level 5": 0.4509, + "hfopenllm_v2/GPQA": 0.3834, + "hfopenllm_v2/MUSR": 0.4744, + "hfopenllm_v2/MMLU-PRO": 0.5376 + } + }, + { + "id": "YOYO-AI/Qwen2.5-14B-YOYO-1010-v2", + "name": "Qwen2.5-14B-YOYO-1010-v2", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5947, + "hfopenllm_v2/BBH": 0.6553, + "hfopenllm_v2/MATH Level 5": 0.4441, + "hfopenllm_v2/GPQA": 0.3817, + "hfopenllm_v2/MUSR": 0.4744, + "hfopenllm_v2/MMLU-PRO": 0.5381 + } + }, + { + "id": "YOYO-AI/Qwen2.5-14B-YOYO-SCE", + "name": "Qwen2.5-14B-YOYO-SCE", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5844, + "hfopenllm_v2/BBH": 0.6489, + "hfopenllm_v2/MATH Level 5": 0.4615, + "hfopenllm_v2/GPQA": 0.3742, + "hfopenllm_v2/MUSR": 0.4704, + "hfopenllm_v2/MMLU-PRO": 0.5381 + } + }, + { + "id": "YOYO-AI/Qwen2.5-14B-YOYO-V4", + "name": "Qwen2.5-14B-YOYO-V4", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8398, + "hfopenllm_v2/BBH": 0.649, + "hfopenllm_v2/MATH Level 5": 0.5347, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4115, + "hfopenllm_v2/MMLU-PRO": 0.517 + } + }, + { + "id": "YOYO-AI/Qwen2.5-14B-YOYO-V4-p1", + "name": "Qwen2.5-14B-YOYO-V4-p1", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8203, + "hfopenllm_v2/BBH": 0.6516, + "hfopenllm_v2/MATH Level 5": 0.5332, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4194, + "hfopenllm_v2/MMLU-PRO": 0.502 + } + }, + { + "id": "YOYO-AI/Qwen2.5-14B-YOYO-V4-p2", + "name": "Qwen2.5-14B-YOYO-V4-p2", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8048, + "hfopenllm_v2/BBH": 0.6339, + "hfopenllm_v2/MATH Level 5": 0.5166, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.4435, + "hfopenllm_v2/MMLU-PRO": 0.4968 + } + }, + { + "id": "YOYO-AI/Qwen2.5-14B-YOYO-latest", + "name": "Qwen2.5-14B-YOYO-latest", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5911, + "hfopenllm_v2/BBH": 0.6656, + "hfopenllm_v2/MATH Level 5": 0.4418, + "hfopenllm_v2/GPQA": 0.3826, + "hfopenllm_v2/MUSR": 0.4691, + "hfopenllm_v2/MMLU-PRO": 0.5371 + } + }, + { + "id": "YOYO-AI/Qwen2.5-14B-YOYO-latest-V2", + "name": "Qwen2.5-14B-YOYO-latest-V2", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7771, + "hfopenllm_v2/BBH": 0.6299, + "hfopenllm_v2/MATH Level 5": 0.5159, + "hfopenllm_v2/GPQA": 0.354, + "hfopenllm_v2/MUSR": 0.4299, + "hfopenllm_v2/MMLU-PRO": 0.5224 + } + }, + { + "id": "YOYO-AI/Qwen2.5-14B-it-restore", + "name": "Qwen2.5-14B-it-restore", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8209, + "hfopenllm_v2/BBH": 0.6388, + "hfopenllm_v2/MATH Level 5": 0.537, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.4087, + "hfopenllm_v2/MMLU-PRO": 0.49 + } + }, + { + "id": "YOYO-AI/Qwen2.5-7B-it-restore", + "name": "Qwen2.5-7B-it-restore", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7531, + "hfopenllm_v2/BBH": 0.5407, + "hfopenllm_v2/MATH Level 5": 0.5, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4007, + "hfopenllm_v2/MMLU-PRO": 0.4288 + } + }, + { + "id": "YOYO-AI/Qwen2.5-Coder-14B-YOYO-1010", + "name": "Qwen2.5-Coder-14B-YOYO-1010", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5336, + "hfopenllm_v2/BBH": 0.6187, + "hfopenllm_v2/MATH Level 5": 0.3218, + "hfopenllm_v2/GPQA": 0.3523, + "hfopenllm_v2/MUSR": 0.4422, + "hfopenllm_v2/MMLU-PRO": 0.4075 + } + }, + { + "id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B", + "name": "ZYH-LLM-Qwen2.5-14B", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5941, + "hfopenllm_v2/BBH": 0.6644, + "hfopenllm_v2/MATH Level 5": 0.4116, + "hfopenllm_v2/GPQA": 0.3859, + "hfopenllm_v2/MUSR": 0.4757, + "hfopenllm_v2/MMLU-PRO": 0.5351 + } + }, + { + "id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B-V2", + "name": "ZYH-LLM-Qwen2.5-14B-V2", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5071, + "hfopenllm_v2/BBH": 0.6452, + "hfopenllm_v2/MATH Level 5": 0.3542, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.4689, + "hfopenllm_v2/MMLU-PRO": 0.5372 + } + }, + { + "id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B-V3", + "name": "ZYH-LLM-Qwen2.5-14B-V3", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8578, + "hfopenllm_v2/BBH": 0.6359, + "hfopenllm_v2/MATH Level 5": 0.5272, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.4022, + "hfopenllm_v2/MMLU-PRO": 0.4881 + } + }, + { + "id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B-V4", + "name": "ZYH-LLM-Qwen2.5-14B-V4", + "developer": "YOYO-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8365, + "hfopenllm_v2/BBH": 0.6515, + "hfopenllm_v2/MATH Level 5": 0.5393, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4434, + "hfopenllm_v2/MMLU-PRO": 0.5204 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Yash21.json b/data/developers/Yash21.json new file mode 100644 index 0000000000000000000000000000000000000000..ab19941e98732d96ef334b7798f9aaa9e1ede929 --- /dev/null +++ b/data/developers/Yash21.json @@ -0,0 +1,19 @@ +{ + "developer": "Yash21", + "models": [ + { + "id": "Yash21/TinyYi-7B-Test", + "name": "TinyYi-7B-Test", + "developer": "Yash21", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1856, + "hfopenllm_v2/BBH": 0.291, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3364, + "hfopenllm_v2/MMLU-PRO": 0.1091 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Youlln.json b/data/developers/Youlln.json new file mode 100644 index 0000000000000000000000000000000000000000..b5a516e1321358bc2316ecb79a0a15334b9c7292 --- /dev/null +++ b/data/developers/Youlln.json @@ -0,0 +1,271 @@ +{ + "developer": "Youlln", + "models": [ + { + "id": "Youlln/1PARAMMYL-8B-ModelStock", + "name": "1PARAMMYL-8B-ModelStock", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5371, + "hfopenllm_v2/BBH": 0.5216, + "hfopenllm_v2/MATH Level 5": 0.1488, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4409, + "hfopenllm_v2/MMLU-PRO": 0.4 + } + }, + { + "id": "Youlln/2PRYMMAL-Yi1.5-6B-SLERP", + "name": "2PRYMMAL-Yi1.5-6B-SLERP", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2826, + "hfopenllm_v2/BBH": 0.4665, + "hfopenllm_v2/MATH Level 5": 0.1133, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4756, + "hfopenllm_v2/MMLU-PRO": 0.317 + } + }, + { + "id": "Youlln/3PRYMMAL-PHI3-3B-SLERP", + "name": "3PRYMMAL-PHI3-3B-SLERP", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3656, + "hfopenllm_v2/BBH": 0.5422, + "hfopenllm_v2/MATH Level 5": 0.1715, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.4648, + "hfopenllm_v2/MMLU-PRO": 0.4002 + } + }, + { + "id": "Youlln/4PRYMMAL-GEMMA2-9B-SLERP", + "name": "4PRYMMAL-GEMMA2-9B-SLERP", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2714, + "hfopenllm_v2/BBH": 0.5923, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.3305, + "hfopenllm_v2/MUSR": 0.4672, + "hfopenllm_v2/MMLU-PRO": 0.421 + } + }, + { + "id": "Youlln/ECE-MIRAGE-1-12B", + "name": "ECE-MIRAGE-1-12B", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.207, + "hfopenllm_v2/BBH": 0.3011, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3219, + "hfopenllm_v2/MMLU-PRO": 0.111 + } + }, + { + "id": "Youlln/ECE-MIRAGE-1-15B", + "name": "ECE-MIRAGE-1-15B", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.207, + "hfopenllm_v2/BBH": 0.3011, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3219, + "hfopenllm_v2/MMLU-PRO": 0.111 + } + }, + { + "id": "Youlln/ECE-PRYMMAL-0.5B-FT-V3", + "name": "ECE-PRYMMAL-0.5B-FT-V3", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1642, + "hfopenllm_v2/BBH": 0.3093, + "hfopenllm_v2/MATH Level 5": 0.003, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3644, + "hfopenllm_v2/MMLU-PRO": 0.1161 + } + }, + { + "id": "Youlln/ECE-PRYMMAL-0.5B-FT-V3-MUSR", + "name": "ECE-PRYMMAL-0.5B-FT-V3-MUSR", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1533, + "hfopenllm_v2/BBH": 0.3041, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.366, + "hfopenllm_v2/MMLU-PRO": 0.1645 + } + }, + { + "id": "Youlln/ECE-PRYMMAL-0.5B-FT-V4-MUSR", + "name": "ECE-PRYMMAL-0.5B-FT-V4-MUSR", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1138, + "hfopenllm_v2/BBH": 0.3038, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3529, + "hfopenllm_v2/MMLU-PRO": 0.1321 + } + }, + { + "id": "Youlln/ECE-PRYMMAL-0.5B-SLERP-V2", + "name": "ECE-PRYMMAL-0.5B-SLERP-V2", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1612, + "hfopenllm_v2/BBH": 0.2935, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3831, + "hfopenllm_v2/MMLU-PRO": 0.1095 + } + }, + { + "id": "Youlln/ECE-PRYMMAL-0.5B-SLERP-V3", + "name": "ECE-PRYMMAL-0.5B-SLERP-V3", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.167, + "hfopenllm_v2/BBH": 0.2938, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3541, + "hfopenllm_v2/MMLU-PRO": 0.1087 + } + }, + { + "id": "Youlln/ECE-PRYMMAL-YL-1B-SLERP-V1", + "name": "ECE-PRYMMAL-YL-1B-SLERP-V1", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3251, + "hfopenllm_v2/BBH": 0.4209, + "hfopenllm_v2/MATH Level 5": 0.1073, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4266, + "hfopenllm_v2/MMLU-PRO": 0.2936 + } + }, + { + "id": "Youlln/ECE-PRYMMAL-YL-1B-SLERP-V2", + "name": "ECE-PRYMMAL-YL-1B-SLERP-V2", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3251, + "hfopenllm_v2/BBH": 0.4209, + "hfopenllm_v2/MATH Level 5": 0.1073, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4266, + "hfopenllm_v2/MMLU-PRO": 0.2936 + } + }, + { + "id": "Youlln/ECE-PRYMMAL-YL-7B-SLERP-V4", + "name": "ECE-PRYMMAL-YL-7B-SLERP-V4", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.251, + "hfopenllm_v2/BBH": 0.377, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3745, + "hfopenllm_v2/MMLU-PRO": 0.2132 + } + }, + { + "id": "Youlln/ECE-PRYMMAL0.5-FT", + "name": "ECE-PRYMMAL0.5-FT", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1851, + "hfopenllm_v2/BBH": 0.3132, + "hfopenllm_v2/MATH Level 5": 0.0234, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3301, + "hfopenllm_v2/MMLU-PRO": 0.1477 + } + }, + { + "id": "Youlln/ECE-PRYMMAL0.5B-Youri", + "name": "ECE-PRYMMAL0.5B-Youri", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1446, + "hfopenllm_v2/BBH": 0.2817, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2433, + "hfopenllm_v2/MUSR": 0.3697, + "hfopenllm_v2/MMLU-PRO": 0.1095 + } + }, + { + "id": "Youlln/ECE-PRYMMAL1B-FT-V1", + "name": "ECE-PRYMMAL1B-FT-V1", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2144, + "hfopenllm_v2/BBH": 0.4033, + "hfopenllm_v2/MATH Level 5": 0.0642, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.3417, + "hfopenllm_v2/MMLU-PRO": 0.2743 + } + }, + { + "id": "Youlln/ECE-Qwen0.5B-FT-V2", + "name": "ECE-Qwen0.5B-FT-V2", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2526, + "hfopenllm_v2/BBH": 0.329, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3063, + "hfopenllm_v2/MMLU-PRO": 0.1666 + } + }, + { + "id": "Youlln/ECE.EIFFEIL.ia-0.5B-SLERP", + "name": "ECE.EIFFEIL.ia-0.5B-SLERP", + "developer": "Youlln", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2561, + "hfopenllm_v2/BBH": 0.3306, + "hfopenllm_v2/MATH Level 5": 0.0597, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3102, + "hfopenllm_v2/MMLU-PRO": 0.1903 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/YoungPanda.json b/data/developers/YoungPanda.json new file mode 100644 index 0000000000000000000000000000000000000000..a2cbd319801c160fef029841a7e5ed06a606a524 --- /dev/null +++ b/data/developers/YoungPanda.json @@ -0,0 +1,19 @@ +{ + "developer": "YoungPanda", + "models": [ + { + "id": "YoungPanda/qwenqwen", + "name": "qwenqwen", + "developer": "YoungPanda", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1264, + "hfopenllm_v2/BBH": 0.3379, + "hfopenllm_v2/MATH Level 5": 0.0355, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3434, + "hfopenllm_v2/MMLU-PRO": 0.1168 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Yuma42.json b/data/developers/Yuma42.json new file mode 100644 index 0000000000000000000000000000000000000000..86fa973f880ed9c54d4e33501ca47dd3bc9324bc --- /dev/null +++ b/data/developers/Yuma42.json @@ -0,0 +1,47 @@ +{ + "developer": "Yuma42", + "models": [ + { + "id": "Yuma42/KangalKhan-RawRuby-7B", + "name": "KangalKhan-RawRuby-7B", + "developer": "Yuma42", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5477, + "hfopenllm_v2/BBH": 0.4755, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.395, + "hfopenllm_v2/MMLU-PRO": 0.3023 + } + }, + { + "id": "Yuma42/Llama3.1-IgneousIguana-8B", + "name": "Llama3.1-IgneousIguana-8B", + "developer": "Yuma42", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8133, + "hfopenllm_v2/BBH": 0.5191, + "hfopenllm_v2/MATH Level 5": 0.2198, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4203, + "hfopenllm_v2/MMLU-PRO": 0.3974 + } + }, + { + "id": "Yuma42/Llama3.1-SuperHawk-8B", + "name": "Llama3.1-SuperHawk-8B", + "developer": "Yuma42", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7986, + "hfopenllm_v2/BBH": 0.52, + "hfopenllm_v2/MATH Level 5": 0.2349, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4084, + "hfopenllm_v2/MMLU-PRO": 0.3945 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Z-AI.json b/data/developers/Z-AI.json new file mode 100644 index 0000000000000000000000000000000000000000..d4ada9599b117d1b4893fef821933bb44c4fb1d9 --- /dev/null +++ b/data/developers/Z-AI.json @@ -0,0 +1,23 @@ +{ + "developer": "Z-AI", + "models": [ + { + "id": "zhipu-ai/glm-4.7", + "name": "GLM 4.7", + "developer": "Z-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 33.3 + } + }, + { + "id": "zhipu-ai/glm-5", + "name": "GLM 5", + "developer": "Z-AI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 52.4 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Z.AI.json b/data/developers/Z.AI.json new file mode 100644 index 0000000000000000000000000000000000000000..ef7b8b09a0447d151ec048fad8785a407220b327 --- /dev/null +++ b/data/developers/Z.AI.json @@ -0,0 +1,16 @@ +{ + "developer": "Z.AI", + "models": [ + { + "id": "z-ai/glm-4.5", + "name": "z-ai/glm-4.5", + "developer": "Z.AI", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.028169014084507043, + "livecodebenchpro/Easy Problems": 0.1267605633802817 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Z.ai.json b/data/developers/Z.ai.json new file mode 100644 index 0000000000000000000000000000000000000000..57c7f480b675e9b58ece7b3513ffc5f50a1b139e --- /dev/null +++ b/data/developers/Z.ai.json @@ -0,0 +1,14 @@ +{ + "developer": "Z.ai", + "models": [ + { + "id": "zhipu-ai/glm-4.6", + "name": "GLM 4.6", + "developer": "Z.ai", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 24.5 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/Z1-Coder.json b/data/developers/Z1-Coder.json new file mode 100644 index 0000000000000000000000000000000000000000..0ddc003961050c2c441db68bf3a0a10aa617937d --- /dev/null +++ b/data/developers/Z1-Coder.json @@ -0,0 +1,19 @@ +{ + "developer": "Z1-Coder", + "models": [ + { + "id": "Z1-Coder/Z1-Coder-7B", + "name": "Z1-Coder-7B", + "developer": "Z1-Coder", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3215, + "hfopenllm_v2/BBH": 0.4842, + "hfopenllm_v2/MATH Level 5": 0.3248, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3622, + "hfopenllm_v2/MMLU-PRO": 0.3759 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ZHLiu627.json b/data/developers/ZHLiu627.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd999c370e89411f80f7ac2b53269ebd8da056f --- /dev/null +++ b/data/developers/ZHLiu627.json @@ -0,0 +1,33 @@ +{ + "developer": "ZHLiu627", + "models": [ + { + "id": "ZHLiu627/zephyr-7b-gemma-dpo-avg", + "name": "zephyr-7b-gemma-dpo-avg", + "developer": "ZHLiu627", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.309, + "hfopenllm_v2/BBH": 0.4149, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.4107, + "hfopenllm_v2/MMLU-PRO": 0.2851 + } + }, + { + "id": "ZHLiu627/zephyr-7b-gemma-rpo-avg", + "name": "zephyr-7b-gemma-rpo-avg", + "developer": "ZHLiu627", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3006, + "hfopenllm_v2/BBH": 0.4183, + "hfopenllm_v2/MATH Level 5": 0.0498, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.4081, + "hfopenllm_v2/MMLU-PRO": 0.2831 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ZeroXClem.json b/data/developers/ZeroXClem.json new file mode 100644 index 0000000000000000000000000000000000000000..3da31c4394d2fcde462d5ea16ed3a93f4dfc601d --- /dev/null +++ b/data/developers/ZeroXClem.json @@ -0,0 +1,159 @@ +{ + "developer": "ZeroXClem", + "models": [ + { + "id": "ZeroXClem/L3-Aspire-Heart-Matrix-8B", + "name": "L3-Aspire-Heart-Matrix-8B", + "developer": "ZeroXClem", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4834, + "hfopenllm_v2/BBH": 0.5384, + "hfopenllm_v2/MATH Level 5": 0.1828, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4187, + "hfopenllm_v2/MMLU-PRO": 0.3785 + } + }, + { + "id": "ZeroXClem/Llama-3.1-8B-AthenaSky-MegaMix", + "name": "Llama-3.1-8B-AthenaSky-MegaMix", + "developer": "ZeroXClem", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6301, + "hfopenllm_v2/BBH": 0.5163, + "hfopenllm_v2/MATH Level 5": 0.2795, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3538, + "hfopenllm_v2/MMLU-PRO": 0.3504 + } + }, + { + "id": "ZeroXClem/Llama-3.1-8B-RainbowLight-EtherealMix", + "name": "Llama-3.1-8B-RainbowLight-EtherealMix", + "developer": "ZeroXClem", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4973, + "hfopenllm_v2/BBH": 0.5155, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.3947, + "hfopenllm_v2/MMLU-PRO": 0.363 + } + }, + { + "id": "ZeroXClem/Llama-3.1-8B-SpecialTitanFusion", + "name": "Llama-3.1-8B-SpecialTitanFusion", + "developer": "ZeroXClem", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7402, + "hfopenllm_v2/BBH": 0.5439, + "hfopenllm_v2/MATH Level 5": 0.2334, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.3874, + "hfopenllm_v2/MMLU-PRO": 0.3621 + } + }, + { + "id": "ZeroXClem/Llama-3.1-8B-SuperNova-EtherealHermes", + "name": "Llama-3.1-8B-SuperNova-EtherealHermes", + "developer": "ZeroXClem", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7339, + "hfopenllm_v2/BBH": 0.5244, + "hfopenllm_v2/MATH Level 5": 0.1745, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4066, + "hfopenllm_v2/MMLU-PRO": 0.3745 + } + }, + { + "id": "ZeroXClem/Llama-3.1-8B-SuperTulu-LexiNova", + "name": "Llama-3.1-8B-SuperTulu-LexiNova", + "developer": "ZeroXClem", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4165, + "hfopenllm_v2/BBH": 0.5079, + "hfopenllm_v2/MATH Level 5": 0.253, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.3971, + "hfopenllm_v2/MMLU-PRO": 0.3368 + } + }, + { + "id": "ZeroXClem/Qwen-2.5-Aether-SlerpFusion-7B", + "name": "Qwen-2.5-Aether-SlerpFusion-7B", + "developer": "ZeroXClem", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6262, + "hfopenllm_v2/BBH": 0.5462, + "hfopenllm_v2/MATH Level 5": 0.2734, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4178, + "hfopenllm_v2/MMLU-PRO": 0.4327 + } + }, + { + "id": "ZeroXClem/Qwen2.5-7B-CelestialHarmony-1M", + "name": "Qwen2.5-7B-CelestialHarmony-1M", + "developer": "ZeroXClem", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5944, + "hfopenllm_v2/BBH": 0.5431, + "hfopenllm_v2/MATH Level 5": 0.3474, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4595, + "hfopenllm_v2/MMLU-PRO": 0.4387 + } + }, + { + "id": "ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix", + "name": "Qwen2.5-7B-HomerAnvita-NerdMix", + "developer": "ZeroXClem", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7708, + "hfopenllm_v2/BBH": 0.5541, + "hfopenllm_v2/MATH Level 5": 0.3837, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4391, + "hfopenllm_v2/MMLU-PRO": 0.4432 + } + }, + { + "id": "ZeroXClem/Qwen2.5-7B-HomerCreative-Mix", + "name": "Qwen2.5-7B-HomerCreative-Mix", + "developer": "ZeroXClem", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7835, + "hfopenllm_v2/BBH": 0.5548, + "hfopenllm_v2/MATH Level 5": 0.3565, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.435, + "hfopenllm_v2/MMLU-PRO": 0.4447 + } + }, + { + "id": "ZeroXClem/Qwen2.5-7B-Qandora-CySec", + "name": "Qwen2.5-7B-Qandora-CySec", + "developer": "ZeroXClem", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6773, + "hfopenllm_v2/BBH": 0.549, + "hfopenllm_v2/MATH Level 5": 0.2931, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4286, + "hfopenllm_v2/MMLU-PRO": 0.4485 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ZeusLabs.json b/data/developers/ZeusLabs.json new file mode 100644 index 0000000000000000000000000000000000000000..83dccb02e142f371a1ba7a31d170ad01689e5601 --- /dev/null +++ b/data/developers/ZeusLabs.json @@ -0,0 +1,19 @@ +{ + "developer": "ZeusLabs", + "models": [ + { + "id": "ZeusLabs/L3-Aethora-15B-V2", + "name": "L3-Aethora-15B-V2", + "developer": "ZeusLabs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7208, + "hfopenllm_v2/BBH": 0.5011, + "hfopenllm_v2/MATH Level 5": 0.0808, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3871, + "hfopenllm_v2/MMLU-PRO": 0.35 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ZhangShenao.json b/data/developers/ZhangShenao.json new file mode 100644 index 0000000000000000000000000000000000000000..182ed81b639b78ebf31540c427577135f11c8ab9 --- /dev/null +++ b/data/developers/ZhangShenao.json @@ -0,0 +1,19 @@ +{ + "developer": "ZhangShenao", + "models": [ + { + "id": "ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3", + "name": "SELM-Llama-3-8B-Instruct-iter-3", + "developer": "ZhangShenao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6903, + "hfopenllm_v2/BBH": 0.5046, + "hfopenllm_v2/MATH Level 5": 0.0861, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3845, + "hfopenllm_v2/MMLU-PRO": 0.3783 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ZiyiYe.json b/data/developers/ZiyiYe.json new file mode 100644 index 0000000000000000000000000000000000000000..1b84dc0cda7c9fc81602b625fc2bd93f040b0dcd --- /dev/null +++ b/data/developers/ZiyiYe.json @@ -0,0 +1,18 @@ +{ + "developer": "ZiyiYe", + "models": [ + { + "id": "ZiyiYe/Con-J-Qwen2-7B", + "name": "ZiyiYe/Con-J-Qwen2-7B", + "developer": "ZiyiYe", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8712, + "reward-bench/Chat": 0.919, + "reward-bench/Chat Hard": 0.8026, + "reward-bench/Safety": 0.8824, + "reward-bench/Reasoning": 0.8808 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/aaditya.json b/data/developers/aaditya.json new file mode 100644 index 0000000000000000000000000000000000000000..0c8f7b6e99b4d65bac2e439e583dab6c4397f20f --- /dev/null +++ b/data/developers/aaditya.json @@ -0,0 +1,19 @@ +{ + "developer": "aaditya", + "models": [ + { + "id": "aaditya/Llama3-OpenBioLLM-70B", + "name": "Llama3-OpenBioLLM-70B", + "developer": "aaditya", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7597, + "hfopenllm_v2/BBH": 0.6399, + "hfopenllm_v2/MATH Level 5": 0.1971, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4417, + "hfopenllm_v2/MMLU-PRO": 0.4867 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/abacusai.json b/data/developers/abacusai.json new file mode 100644 index 0000000000000000000000000000000000000000..57d664aba0074dcb1cc5c63b825bd23d57d67763 --- /dev/null +++ b/data/developers/abacusai.json @@ -0,0 +1,145 @@ +{ + "developer": "abacusai", + "models": [ + { + "id": "abacusai/Dracarys-72B-Instruct", + "name": "Dracarys-72B-Instruct", + "developer": "abacusai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7856, + "hfopenllm_v2/BBH": 0.6944, + "hfopenllm_v2/MATH Level 5": 0.3965, + "hfopenllm_v2/GPQA": 0.3909, + "hfopenllm_v2/MUSR": 0.4558, + "hfopenllm_v2/MMLU-PRO": 0.5456 + } + }, + { + "id": "abacusai/Liberated-Qwen1.5-14B", + "name": "Liberated-Qwen1.5-14B", + "developer": "abacusai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3631, + "hfopenllm_v2/BBH": 0.4948, + "hfopenllm_v2/MATH Level 5": 0.1601, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4175, + "hfopenllm_v2/MMLU-PRO": 0.3512 + } + }, + { + "id": "abacusai/Llama-3-Smaug-8B", + "name": "Llama-3-Smaug-8B", + "developer": "abacusai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4867, + "hfopenllm_v2/BBH": 0.4931, + "hfopenllm_v2/MATH Level 5": 0.0853, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.3622, + "hfopenllm_v2/MMLU-PRO": 0.3185 + } + }, + { + "id": "abacusai/Smaug-34B-v0.1", + "name": "Smaug-34B-v0.1", + "developer": "abacusai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5016, + "hfopenllm_v2/BBH": 0.5358, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.3297, + "hfopenllm_v2/MUSR": 0.3979, + "hfopenllm_v2/MMLU-PRO": 0.4543 + } + }, + { + "id": "abacusai/Smaug-72B-v0.1", + "name": "Smaug-72B-v0.1", + "developer": "abacusai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5167, + "hfopenllm_v2/BBH": 0.5996, + "hfopenllm_v2/MATH Level 5": 0.1911, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4473, + "hfopenllm_v2/MMLU-PRO": 0.4624 + } + }, + { + "id": "abacusai/Smaug-Llama-3-70B-Instruct-32K", + "name": "Smaug-Llama-3-70B-Instruct-32K", + "developer": "abacusai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7761, + "hfopenllm_v2/BBH": 0.6493, + "hfopenllm_v2/MATH Level 5": 0.2749, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4208, + "hfopenllm_v2/MMLU-PRO": 0.4765 + } + }, + { + "id": "abacusai/Smaug-Mixtral-v0.1", + "name": "Smaug-Mixtral-v0.1", + "developer": "abacusai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5554, + "hfopenllm_v2/BBH": 0.5162, + "hfopenllm_v2/MATH Level 5": 0.0952, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4298, + "hfopenllm_v2/MMLU-PRO": 0.3352 + } + }, + { + "id": "abacusai/Smaug-Qwen2-72B-Instruct", + "name": "Smaug-Qwen2-72B-Instruct", + "developer": "abacusai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7825, + "hfopenllm_v2/BBH": 0.691, + "hfopenllm_v2/MATH Level 5": 0.4131, + "hfopenllm_v2/GPQA": 0.3616, + "hfopenllm_v2/MUSR": 0.4401, + "hfopenllm_v2/MMLU-PRO": 0.519 + } + }, + { + "id": "abacusai/bigstral-12b-32k", + "name": "bigstral-12b-32k", + "developer": "abacusai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4194, + "hfopenllm_v2/BBH": 0.47, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.456, + "hfopenllm_v2/MMLU-PRO": 0.2641 + } + }, + { + "id": "abacusai/bigyi-15b", + "name": "bigyi-15b", + "developer": "abacusai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2094, + "hfopenllm_v2/BBH": 0.4345, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.3538, + "hfopenllm_v2/MMLU-PRO": 0.3003 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/abhishek.json b/data/developers/abhishek.json new file mode 100644 index 0000000000000000000000000000000000000000..4a990668962b400e2663f06544d4986dc1420d7b --- /dev/null +++ b/data/developers/abhishek.json @@ -0,0 +1,75 @@ +{ + "developer": "abhishek", + "models": [ + { + "id": "abhishek/autotrain-0tmgq-5tpbg", + "name": "autotrain-0tmgq-5tpbg", + "developer": "abhishek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1952, + "hfopenllm_v2/BBH": 0.3127, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3584, + "hfopenllm_v2/MMLU-PRO": 0.1144 + } + }, + { + "id": "abhishek/autotrain-llama3-70b-orpo-v1", + "name": "autotrain-llama3-70b-orpo-v1", + "developer": "abhishek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4233, + "hfopenllm_v2/BBH": 0.5998, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2441, + "hfopenllm_v2/MUSR": 0.3579, + "hfopenllm_v2/MMLU-PRO": 0.1122 + } + }, + { + "id": "abhishek/autotrain-llama3-70b-orpo-v2", + "name": "autotrain-llama3-70b-orpo-v2", + "developer": "abhishek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5406, + "hfopenllm_v2/BBH": 0.5899, + "hfopenllm_v2/MATH Level 5": 0.2107, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4113, + "hfopenllm_v2/MMLU-PRO": 0.4818 + } + }, + { + "id": "abhishek/autotrain-llama3-orpo-v2", + "name": "autotrain-llama3-orpo-v2", + "developer": "abhishek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4372, + "hfopenllm_v2/BBH": 0.3159, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3792, + "hfopenllm_v2/MMLU-PRO": 0.2218 + } + }, + { + "id": "abhishek/autotrain-vr4a1-e5mms", + "name": "autotrain-vr4a1-e5mms", + "developer": "abhishek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2142, + "hfopenllm_v2/BBH": 0.5001, + "hfopenllm_v2/MATH Level 5": 0.1412, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.3891, + "hfopenllm_v2/MMLU-PRO": 0.3667 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/abideen.json b/data/developers/abideen.json new file mode 100644 index 0000000000000000000000000000000000000000..2dc881b02a2a8f6f895b89ec57324cd2908d459f --- /dev/null +++ b/data/developers/abideen.json @@ -0,0 +1,19 @@ +{ + "developer": "abideen", + "models": [ + { + "id": "abideen/MedPhi-4-14B-v1", + "name": "MedPhi-4-14B-v1", + "developer": "abideen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6277, + "hfopenllm_v2/BBH": 0.6897, + "hfopenllm_v2/MATH Level 5": 0.2931, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.4155, + "hfopenllm_v2/MMLU-PRO": 0.5338 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/adamo1139.json b/data/developers/adamo1139.json new file mode 100644 index 0000000000000000000000000000000000000000..762c7a68f0531c4d31557e4e771d8ce5da0d6b9d --- /dev/null +++ b/data/developers/adamo1139.json @@ -0,0 +1,19 @@ +{ + "developer": "adamo1139", + "models": [ + { + "id": "adamo1139/Yi-34B-200K-AEZAKMI-v2", + "name": "Yi-34B-200K-AEZAKMI-v2", + "developer": "adamo1139", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4555, + "hfopenllm_v2/BBH": 0.5384, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.3886, + "hfopenllm_v2/MMLU-PRO": 0.4513 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/adriszmar.json b/data/developers/adriszmar.json new file mode 100644 index 0000000000000000000000000000000000000000..1f1d39916960942963a9c3c265196aea3657be38 --- /dev/null +++ b/data/developers/adriszmar.json @@ -0,0 +1,19 @@ +{ + "developer": "adriszmar", + "models": [ + { + "id": "adriszmar/QAIMath-Qwen2.5-7B-TIES", + "name": "QAIMath-Qwen2.5-7B-TIES", + "developer": "adriszmar", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1746, + "hfopenllm_v2/BBH": 0.3126, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.245, + "hfopenllm_v2/MUSR": 0.4096, + "hfopenllm_v2/MMLU-PRO": 0.1087 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/aevalone.json b/data/developers/aevalone.json new file mode 100644 index 0000000000000000000000000000000000000000..55082929513d655b314911f3243f2292ef947361 --- /dev/null +++ b/data/developers/aevalone.json @@ -0,0 +1,19 @@ +{ + "developer": "aevalone", + "models": [ + { + "id": "aevalone/distill_qw_test", + "name": "distill_qw_test", + "developer": "aevalone", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7409, + "hfopenllm_v2/BBH": 0.5246, + "hfopenllm_v2/MATH Level 5": 0.4781, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.386, + "hfopenllm_v2/MMLU-PRO": 0.4092 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/agentlans.json b/data/developers/agentlans.json new file mode 100644 index 0000000000000000000000000000000000000000..98f9740882541b697b43a695b6b4d1ee3bdf493a --- /dev/null +++ b/data/developers/agentlans.json @@ -0,0 +1,131 @@ +{ + "developer": "agentlans", + "models": [ + { + "id": "agentlans/Gemma2-9B-AdvancedFuse", + "name": "Gemma2-9B-AdvancedFuse", + "developer": "agentlans", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1543, + "hfopenllm_v2/BBH": 0.5859, + "hfopenllm_v2/MATH Level 5": 0.1005, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4231, + "hfopenllm_v2/MMLU-PRO": 0.4 + } + }, + { + "id": "agentlans/Llama-3.2-1B-Instruct-CrashCourse12K", + "name": "Llama-3.2-1B-Instruct-CrashCourse12K", + "developer": "agentlans", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5395, + "hfopenllm_v2/BBH": 0.3548, + "hfopenllm_v2/MATH Level 5": 0.071, + "hfopenllm_v2/GPQA": 0.2408, + "hfopenllm_v2/MUSR": 0.321, + "hfopenllm_v2/MMLU-PRO": 0.1809 + } + }, + { + "id": "agentlans/Llama3.1-8B-drill", + "name": "Llama3.1-8B-drill", + "developer": "agentlans", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7652, + "hfopenllm_v2/BBH": 0.5016, + "hfopenllm_v2/MATH Level 5": 0.1715, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3672, + "hfopenllm_v2/MMLU-PRO": 0.3776 + } + }, + { + "id": "agentlans/Llama3.1-Daredevilish", + "name": "Llama3.1-Daredevilish", + "developer": "agentlans", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6292, + "hfopenllm_v2/BBH": 0.5013, + "hfopenllm_v2/MATH Level 5": 0.1292, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4091, + "hfopenllm_v2/MMLU-PRO": 0.3697 + } + }, + { + "id": "agentlans/Llama3.1-Daredevilish-Instruct", + "name": "Llama3.1-Daredevilish-Instruct", + "developer": "agentlans", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7926, + "hfopenllm_v2/BBH": 0.5235, + "hfopenllm_v2/MATH Level 5": 0.1722, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.3911, + "hfopenllm_v2/MMLU-PRO": 0.3877 + } + }, + { + "id": "agentlans/Llama3.1-LexiHermes-SuperStorm", + "name": "Llama3.1-LexiHermes-SuperStorm", + "developer": "agentlans", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7835, + "hfopenllm_v2/BBH": 0.5266, + "hfopenllm_v2/MATH Level 5": 0.1616, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.3963, + "hfopenllm_v2/MMLU-PRO": 0.3844 + } + }, + { + "id": "agentlans/Llama3.1-SuperDeepFuse", + "name": "Llama3.1-SuperDeepFuse", + "developer": "agentlans", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7762, + "hfopenllm_v2/BBH": 0.5049, + "hfopenllm_v2/MATH Level 5": 0.1828, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3699, + "hfopenllm_v2/MMLU-PRO": 0.3775 + } + }, + { + "id": "agentlans/Llama3.1-SuperDeepFuse-CrashCourse12K", + "name": "Llama3.1-SuperDeepFuse-CrashCourse12K", + "developer": "agentlans", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7187, + "hfopenllm_v2/BBH": 0.5216, + "hfopenllm_v2/MATH Level 5": 0.1805, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4026, + "hfopenllm_v2/MMLU-PRO": 0.3631 + } + }, + { + "id": "agentlans/Qwen2.5-0.5B-Instruct-CrashCourse-dropout", + "name": "Qwen2.5-0.5B-Instruct-CrashCourse-dropout", + "developer": "agentlans", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2949, + "hfopenllm_v2/BBH": 0.3312, + "hfopenllm_v2/MATH Level 5": 0.0423, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3342, + "hfopenllm_v2/MMLU-PRO": 0.1608 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ahmeda335.json b/data/developers/ahmeda335.json new file mode 100644 index 0000000000000000000000000000000000000000..b2e9d41dc695bef0c1a41cfdf0f2550bf69de595 --- /dev/null +++ b/data/developers/ahmeda335.json @@ -0,0 +1,19 @@ +{ + "developer": "ahmeda335", + "models": [ + { + "id": "ahmeda335/13_outOf_32_pruned_layers_llama3.1-8b", + "name": "13_outOf_32_pruned_layers_llama3.1-8b", + "developer": "ahmeda335", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1748, + "hfopenllm_v2/BBH": 0.2883, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3803, + "hfopenllm_v2/MMLU-PRO": 0.1129 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ai2.json b/data/developers/ai2.json new file mode 100644 index 0000000000000000000000000000000000000000..7e5f541744bdd2f63b0bbf1d278aa5db277042bc --- /dev/null +++ b/data/developers/ai2.json @@ -0,0 +1,89 @@ +{ + "developer": "ai2", + "models": [ + { + "id": "ai2/llama-2-chat-7b-nectar-3.8m.json", + "name": "ai2/llama-2-chat-7b-nectar-3.8m.json", + "developer": "ai2", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5843, + "reward-bench/Chat": 0.8631, + "reward-bench/Chat Hard": 0.2654, + "reward-bench/Safety": 0.6243 + } + }, + { + "id": "ai2/llama-2-chat-nectar-180k.json", + "name": "ai2/llama-2-chat-nectar-180k.json", + "developer": "ai2", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5235, + "reward-bench/Chat": 0.8827, + "reward-bench/Chat Hard": 0.2851, + "reward-bench/Safety": 0.4027 + } + }, + { + "id": "ai2/llama-2-chat-ultrafeedback-60k.jsonl", + "name": "ai2/llama-2-chat-ultrafeedback-60k.jsonl", + "developer": "ai2", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.644, + "reward-bench/Chat": 0.9441, + "reward-bench/Chat Hard": 0.4539, + "reward-bench/Safety": 0.5338 + } + }, + { + "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", + "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", + "developer": "ai2", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6895, + "reward-bench/Chat": 0.9385, + "reward-bench/Chat Hard": 0.3706, + "reward-bench/Safety": 0.7595 + } + }, + { + "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json", + "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json", + "developer": "ai2", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7127, + "reward-bench/Chat": 0.9358, + "reward-bench/Chat Hard": 0.4079, + "reward-bench/Safety": 0.7946 + } + }, + { + "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized.json", + "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized.json", + "developer": "ai2", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6756, + "reward-bench/Chat": 0.9134, + "reward-bench/Chat Hard": 0.3904, + "reward-bench/Safety": 0.723 + } + }, + { + "id": "ai2/tulu-2-7b-rm-v0.json", + "name": "ai2/tulu-2-7b-rm-v0.json", + "developer": "ai2", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6655, + "reward-bench/Chat": 0.933, + "reward-bench/Chat Hard": 0.4539, + "reward-bench/Safety": 0.6095 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ai21.json b/data/developers/ai21.json new file mode 100644 index 0000000000000000000000000000000000000000..f0f196efc80ae830a22f83787b0924437fdf3520 --- /dev/null +++ b/data/developers/ai21.json @@ -0,0 +1,364 @@ +{ + "developer": "ai21", + "models": [ + { + "id": "ai21/J1-Grande-v1-17B", + "name": "J1-Grande v1 17B", + "developer": "ai21", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.433, + "helm_classic/MMLU": 0.27, + "helm_classic/BoolQ": 0.722, + "helm_classic/NarrativeQA": 0.672, + "helm_classic/NaturalQuestions (open-book)": 0.578, + "helm_classic/QuAC": 0.362, + "helm_classic/HellaSwag": 0.739, + "helm_classic/OpenbookQA": 0.52, + "helm_classic/TruthfulQA": 0.193, + "helm_classic/MS MARCO (TREC)": 0.341, + "helm_classic/CNN/DailyMail": 0.143, + "helm_classic/XSUM": 0.122, + "helm_classic/IMDB": 0.953, + "helm_classic/CivilComments": 0.529, + "helm_classic/RAFT": 0.658 + } + }, + { + "id": "ai21/J1-Grande-v2-beta-17B", + "name": "J1-Grande v2 beta 17B", + "developer": "ai21", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.706, + "helm_classic/MMLU": 0.445, + "helm_classic/BoolQ": 0.812, + "helm_classic/NarrativeQA": 0.725, + "helm_classic/NaturalQuestions (open-book)": 0.625, + "helm_classic/QuAC": 0.392, + "helm_classic/HellaSwag": 0.764, + "helm_classic/OpenbookQA": 0.56, + "helm_classic/TruthfulQA": 0.306, + "helm_classic/MS MARCO (TREC)": 0.46, + "helm_classic/CNN/DailyMail": 0.146, + "helm_classic/XSUM": 0.152, + "helm_classic/IMDB": 0.957, + "helm_classic/CivilComments": 0.546, + "helm_classic/RAFT": 0.679 + } + }, + { + "id": "ai21/J1-Jumbo-v1-178B", + "name": "J1-Jumbo v1 178B", + "developer": "ai21", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.517, + "helm_classic/MMLU": 0.259, + "helm_classic/BoolQ": 0.776, + "helm_classic/NarrativeQA": 0.695, + "helm_classic/NaturalQuestions (open-book)": 0.595, + "helm_classic/QuAC": 0.358, + "helm_classic/HellaSwag": 0.765, + "helm_classic/OpenbookQA": 0.534, + "helm_classic/TruthfulQA": 0.175, + "helm_classic/MS MARCO (TREC)": 0.363, + "helm_classic/CNN/DailyMail": 0.144, + "helm_classic/XSUM": 0.129, + "helm_classic/IMDB": 0.943, + "helm_classic/CivilComments": 0.553, + "helm_classic/RAFT": 0.681 + } + }, + { + "id": "ai21/J1-Large-v1-7.5B", + "name": "J1-Large v1 7.5B", + "developer": "ai21", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.285, + "helm_classic/MMLU": 0.241, + "helm_classic/BoolQ": 0.683, + "helm_classic/NarrativeQA": 0.623, + "helm_classic/NaturalQuestions (open-book)": 0.532, + "helm_classic/QuAC": 0.328, + "helm_classic/HellaSwag": 0.7, + "helm_classic/OpenbookQA": 0.514, + "helm_classic/TruthfulQA": 0.197, + "helm_classic/MS MARCO (TREC)": 0.292, + "helm_classic/CNN/DailyMail": 0.134, + "helm_classic/XSUM": 0.102, + "helm_classic/IMDB": 0.956, + "helm_classic/CivilComments": 0.532, + "helm_classic/RAFT": 0.545 + } + }, + { + "id": "ai21/Jurassic-2-Grande-17B", + "name": "Jurassic-2 Grande 17B", + "developer": "ai21", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.743, + "helm_classic/MMLU": 0.475, + "helm_classic/BoolQ": 0.826, + "helm_classic/NarrativeQA": 0.737, + "helm_classic/NaturalQuestions (open-book)": 0.639, + "helm_classic/QuAC": 0.418, + "helm_classic/HellaSwag": 0.781, + "helm_classic/OpenbookQA": 0.542, + "helm_classic/TruthfulQA": 0.348, + "helm_classic/MS MARCO (TREC)": 0.514, + "helm_classic/CNN/DailyMail": 0.144, + "helm_classic/XSUM": 0.167, + "helm_classic/IMDB": 0.938, + "helm_classic/CivilComments": 0.547, + "helm_classic/RAFT": 0.712 + } + }, + { + "id": "ai21/Jurassic-2-Jumbo-178B", + "name": "Jurassic-2 Jumbo 178B", + "developer": "ai21", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.824, + "helm_classic/MMLU": 0.48, + "helm_classic/BoolQ": 0.829, + "helm_classic/NarrativeQA": 0.733, + "helm_classic/NaturalQuestions (open-book)": 0.669, + "helm_classic/QuAC": 0.435, + "helm_classic/HellaSwag": 0.788, + "helm_classic/OpenbookQA": 0.558, + "helm_classic/TruthfulQA": 0.437, + "helm_classic/MS MARCO (TREC)": 0.661, + "helm_classic/CNN/DailyMail": 0.149, + "helm_classic/XSUM": 0.182, + "helm_classic/IMDB": 0.938, + "helm_classic/CivilComments": 0.57, + "helm_classic/RAFT": 0.746 + } + }, + { + "id": "ai21/Jurassic-2-Large-7.5B", + "name": "Jurassic-2 Large 7.5B", + "developer": "ai21", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.553, + "helm_classic/MMLU": 0.339, + "helm_classic/BoolQ": 0.742, + "helm_classic/NarrativeQA": -1.0, + "helm_classic/NaturalQuestions (open-book)": 0.589, + "helm_classic/QuAC": -1.0, + "helm_classic/HellaSwag": 0.729, + "helm_classic/OpenbookQA": 0.53, + "helm_classic/TruthfulQA": 0.245, + "helm_classic/MS MARCO (TREC)": 0.464, + "helm_classic/CNN/DailyMail": 0.136, + "helm_classic/XSUM": 0.142, + "helm_classic/IMDB": 0.956, + "helm_classic/CivilComments": 0.57, + "helm_classic/RAFT": 0.622 + } + }, + { + "id": "ai21/j2-grande", + "name": "Jurassic-2 Grande 17B", + "developer": "ai21", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.172, + "helm_lite/NarrativeQA": 0.744, + "helm_lite/NaturalQuestions (closed-book)": 0.35, + "helm_lite/OpenbookQA": 0.614, + "helm_lite/MMLU": 0.471, + "helm_lite/MATH": 0.064, + "helm_lite/GSM8K": 0.159, + "helm_lite/LegalBench": 0.468, + "helm_lite/MedQA": 0.39, + "helm_lite/WMT 2014": 0.102 + } + }, + { + "id": "ai21/j2-jumbo", + "name": "Jurassic-2 Jumbo 178B", + "developer": "ai21", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.215, + "helm_lite/NarrativeQA": 0.728, + "helm_lite/NaturalQuestions (closed-book)": 0.385, + "helm_lite/OpenbookQA": 0.688, + "helm_lite/MMLU": 0.483, + "helm_lite/MATH": 0.103, + "helm_lite/GSM8K": 0.239, + "helm_lite/LegalBench": 0.533, + "helm_lite/MedQA": 0.431, + "helm_lite/WMT 2014": 0.114 + } + }, + { + "id": "ai21/jamba-1.5-large", + "name": "Jamba 1.5 Large", + "developer": "ai21", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.637, + "helm_lite/NarrativeQA": 0.664, + "helm_lite/NaturalQuestions (closed-book)": 0.394, + "helm_lite/OpenbookQA": 0.948, + "helm_lite/MMLU": 0.683, + "helm_lite/MATH": 0.692, + "helm_lite/GSM8K": 0.846, + "helm_lite/LegalBench": 0.675, + "helm_lite/MedQA": 0.698, + "helm_lite/WMT 2014": 0.203, + "helm_mmlu/MMLU All Subjects": 0.782, + "helm_mmlu/Abstract Algebra": 0.53, + "helm_mmlu/Anatomy": 0.793, + "helm_mmlu/College Physics": 0.51, + "helm_mmlu/Computer Security": 0.8, + "helm_mmlu/Econometrics": 0.614, + "helm_mmlu/Global Facts": 0.54, + "helm_mmlu/Jurisprudence": 0.87, + "helm_mmlu/Philosophy": 0.849, + "helm_mmlu/Professional Psychology": 0.842, + "helm_mmlu/Us Foreign Policy": 0.92, + "helm_mmlu/Astronomy": 0.882, + "helm_mmlu/Business Ethics": 0.77, + "helm_mmlu/Clinical Knowledge": 0.849, + "helm_mmlu/Conceptual Physics": 0.779, + "helm_mmlu/Electrical Engineering": 0.793, + "helm_mmlu/Elementary Mathematics": 0.656, + "helm_mmlu/Formal Logic": 0.619, + "helm_mmlu/High School World History": 0.911, + "helm_mmlu/Human Sexuality": 0.832, + "helm_mmlu/International Law": 0.884, + "helm_mmlu/Logical Fallacies": 0.859, + "helm_mmlu/Machine Learning": 0.688, + "helm_mmlu/Management": 0.864, + "helm_mmlu/Marketing": 0.94, + "helm_mmlu/Medical Genetics": 0.89, + "helm_mmlu/Miscellaneous": 0.931, + "helm_mmlu/Moral Scenarios": 0.686, + "helm_mmlu/Nutrition": 0.869, + "helm_mmlu/Prehistory": 0.892, + "helm_mmlu/Public Relations": 0.755, + "helm_mmlu/Security Studies": 0.771, + "helm_mmlu/Sociology": 0.93, + "helm_mmlu/Virology": 0.554, + "helm_mmlu/World Religions": 0.865, + "helm_mmlu/Mean win rate": 0.147 + } + }, + { + "id": "ai21/jamba-1.5-mini", + "name": "Jamba 1.5 Mini", + "developer": "ai21", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.414, + "helm_lite/NarrativeQA": 0.746, + "helm_lite/NaturalQuestions (closed-book)": 0.388, + "helm_lite/OpenbookQA": 0.89, + "helm_lite/MMLU": 0.582, + "helm_lite/MATH": 0.318, + "helm_lite/GSM8K": 0.691, + "helm_lite/LegalBench": 0.503, + "helm_lite/MedQA": 0.632, + "helm_lite/WMT 2014": 0.179, + "helm_mmlu/MMLU All Subjects": 0.699, + "helm_mmlu/Abstract Algebra": 0.33, + "helm_mmlu/Anatomy": 0.711, + "helm_mmlu/College Physics": 0.48, + "helm_mmlu/Computer Security": 0.73, + "helm_mmlu/Econometrics": 0.491, + "helm_mmlu/Global Facts": 0.43, + "helm_mmlu/Jurisprudence": 0.88, + "helm_mmlu/Philosophy": 0.752, + "helm_mmlu/Professional Psychology": 0.76, + "helm_mmlu/Us Foreign Policy": 0.9, + "helm_mmlu/Astronomy": 0.822, + "helm_mmlu/Business Ethics": 0.76, + "helm_mmlu/Clinical Knowledge": 0.74, + "helm_mmlu/Conceptual Physics": 0.677, + "helm_mmlu/Electrical Engineering": 0.683, + "helm_mmlu/Elementary Mathematics": 0.553, + "helm_mmlu/Formal Logic": 0.452, + "helm_mmlu/High School World History": 0.84, + "helm_mmlu/Human Sexuality": 0.809, + "helm_mmlu/International Law": 0.893, + "helm_mmlu/Logical Fallacies": 0.81, + "helm_mmlu/Machine Learning": 0.509, + "helm_mmlu/Management": 0.825, + "helm_mmlu/Marketing": 0.915, + "helm_mmlu/Medical Genetics": 0.69, + "helm_mmlu/Miscellaneous": 0.902, + "helm_mmlu/Moral Scenarios": 0.269, + "helm_mmlu/Nutrition": 0.801, + "helm_mmlu/Prehistory": 0.824, + "helm_mmlu/Public Relations": 0.727, + "helm_mmlu/Security Studies": 0.755, + "helm_mmlu/Sociology": 0.876, + "helm_mmlu/Virology": 0.578, + "helm_mmlu/World Religions": 0.842, + "helm_mmlu/Mean win rate": 0.206 + } + }, + { + "id": "ai21/jamba-instruct", + "name": "Jamba Instruct", + "developer": "ai21", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.287, + "helm_lite/NarrativeQA": 0.658, + "helm_lite/NaturalQuestions (closed-book)": 0.384, + "helm_lite/OpenbookQA": 0.796, + "helm_lite/MMLU": 0.582, + "helm_lite/MATH": 0.38, + "helm_lite/GSM8K": 0.67, + "helm_lite/LegalBench": 0.54, + "helm_lite/MedQA": 0.519, + "helm_lite/WMT 2014": 0.164, + "helm_mmlu/MMLU All Subjects": 0.659, + "helm_mmlu/Abstract Algebra": 0.36, + "helm_mmlu/Anatomy": 0.615, + "helm_mmlu/College Physics": 0.422, + "helm_mmlu/Computer Security": 0.76, + "helm_mmlu/Econometrics": 0.439, + "helm_mmlu/Global Facts": 0.4, + "helm_mmlu/Jurisprudence": 0.796, + "helm_mmlu/Philosophy": 0.749, + "helm_mmlu/Professional Psychology": 0.716, + "helm_mmlu/Us Foreign Policy": 0.91, + "helm_mmlu/Astronomy": 0.73, + "helm_mmlu/Business Ethics": 0.6, + "helm_mmlu/Clinical Knowledge": 0.702, + "helm_mmlu/Conceptual Physics": 0.677, + "helm_mmlu/Electrical Engineering": 0.621, + "helm_mmlu/Elementary Mathematics": 0.497, + "helm_mmlu/Formal Logic": 0.444, + "helm_mmlu/High School World History": 0.797, + "helm_mmlu/Human Sexuality": 0.794, + "helm_mmlu/International Law": 0.835, + "helm_mmlu/Logical Fallacies": 0.706, + "helm_mmlu/Machine Learning": 0.536, + "helm_mmlu/Management": 0.786, + "helm_mmlu/Marketing": 0.885, + "helm_mmlu/Medical Genetics": 0.67, + "helm_mmlu/Miscellaneous": 0.865, + "helm_mmlu/Moral Scenarios": 0.465, + "helm_mmlu/Nutrition": 0.745, + "helm_mmlu/Prehistory": 0.796, + "helm_mmlu/Public Relations": 0.682, + "helm_mmlu/Security Studies": 0.743, + "helm_mmlu/Sociology": 0.891, + "helm_mmlu/Virology": 0.53, + "helm_mmlu/World Religions": 0.813, + "helm_mmlu/Mean win rate": 0.887 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ai21labs.json b/data/developers/ai21labs.json new file mode 100644 index 0000000000000000000000000000000000000000..a4677a41b6989649ba4cf65097296ceb40de8f1b --- /dev/null +++ b/data/developers/ai21labs.json @@ -0,0 +1,19 @@ +{ + "developer": "ai21labs", + "models": [ + { + "id": "ai21labs/Jamba-v0.1", + "name": "Jamba-v0.1", + "developer": "ai21labs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2026, + "hfopenllm_v2/BBH": 0.3602, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.359, + "hfopenllm_v2/MMLU-PRO": 0.2492 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ai4bharat.json b/data/developers/ai4bharat.json new file mode 100644 index 0000000000000000000000000000000000000000..b71d1440943c4bf07cedce605684de3dd4f777d7 --- /dev/null +++ b/data/developers/ai4bharat.json @@ -0,0 +1,19 @@ +{ + "developer": "ai4bharat", + "models": [ + { + "id": "ai4bharat/Airavata", + "name": "Airavata", + "developer": "ai4bharat", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0559, + "hfopenllm_v2/BBH": 0.3628, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3763, + "hfopenllm_v2/MMLU-PRO": 0.1635 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/aixonlab.json b/data/developers/aixonlab.json new file mode 100644 index 0000000000000000000000000000000000000000..030865828370dbc407c177eed1de281d34ad8d0d --- /dev/null +++ b/data/developers/aixonlab.json @@ -0,0 +1,47 @@ +{ + "developer": "aixonlab", + "models": [ + { + "id": "aixonlab/Aether-12b", + "name": "Aether-12b", + "developer": "aixonlab", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2347, + "hfopenllm_v2/BBH": 0.5179, + "hfopenllm_v2/MATH Level 5": 0.1065, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.3829, + "hfopenllm_v2/MMLU-PRO": 0.341 + } + }, + { + "id": "aixonlab/Grey-12b", + "name": "Grey-12b", + "developer": "aixonlab", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3968, + "hfopenllm_v2/BBH": 0.5699, + "hfopenllm_v2/MATH Level 5": 0.0982, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4516, + "hfopenllm_v2/MMLU-PRO": 0.3779 + } + }, + { + "id": "aixonlab/Zara-14b-v1.2", + "name": "Zara-14b-v1.2", + "developer": "aixonlab", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6197, + "hfopenllm_v2/BBH": 0.6405, + "hfopenllm_v2/MATH Level 5": 0.3535, + "hfopenllm_v2/GPQA": 0.3817, + "hfopenllm_v2/MUSR": 0.4675, + "hfopenllm_v2/MMLU-PRO": 0.5263 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/akhadangi.json b/data/developers/akhadangi.json new file mode 100644 index 0000000000000000000000000000000000000000..a45a05965f0b7c3c570fed151d6ee3e5cc9b7be0 --- /dev/null +++ b/data/developers/akhadangi.json @@ -0,0 +1,75 @@ +{ + "developer": "akhadangi", + "models": [ + { + "id": "akhadangi/Llama3.2.1B.0.01-First", + "name": "Llama3.2.1B.0.01-First", + "developer": "akhadangi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0814, + "hfopenllm_v2/BBH": 0.3189, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.3194, + "hfopenllm_v2/MMLU-PRO": 0.1197 + } + }, + { + "id": "akhadangi/Llama3.2.1B.0.01-Last", + "name": "Llama3.2.1B.0.01-Last", + "developer": "akhadangi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0917, + "hfopenllm_v2/BBH": 0.3159, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2433, + "hfopenllm_v2/MUSR": 0.3206, + "hfopenllm_v2/MMLU-PRO": 0.1227 + } + }, + { + "id": "akhadangi/Llama3.2.1B.0.1-First", + "name": "Llama3.2.1B.0.1-First", + "developer": "akhadangi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1001, + "hfopenllm_v2/BBH": 0.312, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.245, + "hfopenllm_v2/MUSR": 0.3301, + "hfopenllm_v2/MMLU-PRO": 0.1169 + } + }, + { + "id": "akhadangi/Llama3.2.1B.0.1-Last", + "name": "Llama3.2.1B.0.1-Last", + "developer": "akhadangi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.095, + "hfopenllm_v2/BBH": 0.3164, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.2383, + "hfopenllm_v2/MUSR": 0.3341, + "hfopenllm_v2/MMLU-PRO": 0.1178 + } + }, + { + "id": "akhadangi/Llama3.2.1B.BaseFiT", + "name": "Llama3.2.1B.BaseFiT", + "developer": "akhadangi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0883, + "hfopenllm_v2/BBH": 0.3175, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3221, + "hfopenllm_v2/MMLU-PRO": 0.1172 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/akjindal53244.json b/data/developers/akjindal53244.json new file mode 100644 index 0000000000000000000000000000000000000000..237ea0357d953fdc2d416f7c27241c406836e723 --- /dev/null +++ b/data/developers/akjindal53244.json @@ -0,0 +1,19 @@ +{ + "developer": "akjindal53244", + "models": [ + { + "id": "akjindal53244/Llama-3.1-Storm-8B", + "name": "Llama-3.1-Storm-8B", + "developer": "akjindal53244", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8051, + "hfopenllm_v2/BBH": 0.5189, + "hfopenllm_v2/MATH Level 5": 0.1722, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.4028, + "hfopenllm_v2/MMLU-PRO": 0.3803 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/alcholjung.json b/data/developers/alcholjung.json new file mode 100644 index 0000000000000000000000000000000000000000..6e1e3376cbc24e2a296860982d2dd301a89de9d8 --- /dev/null +++ b/data/developers/alcholjung.json @@ -0,0 +1,19 @@ +{ + "developer": "alcholjung", + "models": [ + { + "id": "alcholjung/llama3_medical_tuned", + "name": "llama3_medical_tuned", + "developer": "alcholjung", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0106, + "hfopenllm_v2/BBH": 0.4513, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.466, + "hfopenllm_v2/MMLU-PRO": 0.2946 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/aleph-alpha.json b/data/developers/aleph-alpha.json new file mode 100644 index 0000000000000000000000000000000000000000..5431a53fbdc57b70164fd78532e9e2a2758e28be --- /dev/null +++ b/data/developers/aleph-alpha.json @@ -0,0 +1,74 @@ +{ + "developer": "aleph-alpha", + "models": [ + { + "id": "aleph-alpha/Luminous-Base-13B", + "name": "Luminous Base 13B", + "developer": "aleph-alpha", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.315, + "helm_classic/MMLU": 0.27, + "helm_classic/BoolQ": 0.719, + "helm_classic/NarrativeQA": 0.605, + "helm_classic/NaturalQuestions (open-book)": 0.568, + "helm_classic/QuAC": 0.334, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.182, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": 0.11, + "helm_classic/XSUM": 0.105, + "helm_classic/IMDB": 0.939, + "helm_classic/CivilComments": 0.544, + "helm_classic/RAFT": 0.473 + } + }, + { + "id": "aleph-alpha/Luminous-Extended-30B", + "name": "Luminous Extended 30B", + "developer": "aleph-alpha", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.485, + "helm_classic/MMLU": 0.321, + "helm_classic/BoolQ": 0.767, + "helm_classic/NarrativeQA": 0.665, + "helm_classic/NaturalQuestions (open-book)": 0.609, + "helm_classic/QuAC": 0.349, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.221, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": 0.139, + "helm_classic/XSUM": 0.124, + "helm_classic/IMDB": 0.947, + "helm_classic/CivilComments": 0.524, + "helm_classic/RAFT": 0.523 + } + }, + { + "id": "aleph-alpha/Luminous-Supreme-70B", + "name": "Luminous Supreme 70B", + "developer": "aleph-alpha", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.662, + "helm_classic/MMLU": 0.38, + "helm_classic/BoolQ": 0.775, + "helm_classic/NarrativeQA": 0.711, + "helm_classic/NaturalQuestions (open-book)": 0.649, + "helm_classic/QuAC": 0.37, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.222, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": 0.15, + "helm_classic/XSUM": 0.136, + "helm_classic/IMDB": 0.959, + "helm_classic/CivilComments": 0.562, + "helm_classic/RAFT": 0.653 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/alibaba.json b/data/developers/alibaba.json new file mode 100644 index 0000000000000000000000000000000000000000..3464fe4959367b9df63ad6a26271766d68bbb5a6 --- /dev/null +++ b/data/developers/alibaba.json @@ -0,0 +1,32 @@ +{ + "developer": "alibaba", + "models": [ + { + "id": "alibaba/qwen3-235b-a22b-instruct-2507", + "name": "qwen3-235b-a22b-instruct-2507", + "developer": "alibaba", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.8798, + "global-mmlu-lite/Culturally Sensitive": 0.8522, + "global-mmlu-lite/Culturally Agnostic": 0.9075, + "global-mmlu-lite/Arabic": 0.88, + "global-mmlu-lite/English": 0.89, + "global-mmlu-lite/Bengali": 0.8875, + "global-mmlu-lite/German": 0.885, + "global-mmlu-lite/French": 0.88, + "global-mmlu-lite/Hindi": 0.8775, + "global-mmlu-lite/Indonesian": 0.88, + "global-mmlu-lite/Italian": 0.88, + "global-mmlu-lite/Japanese": 0.88, + "global-mmlu-lite/Korean": 0.875, + "global-mmlu-lite/Portuguese": 0.8875, + "global-mmlu-lite/Spanish": 0.875, + "global-mmlu-lite/Swahili": 0.87, + "global-mmlu-lite/Yoruba": 0.8725, + "global-mmlu-lite/Chinese": 0.8775, + "global-mmlu-lite/Burmese": 0.88 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/aliyun.json b/data/developers/aliyun.json new file mode 100644 index 0000000000000000000000000000000000000000..07bfc64c9c2fb6834fa3708114878f5fe38b0cb1 --- /dev/null +++ b/data/developers/aliyun.json @@ -0,0 +1,16 @@ +{ + "developer": "aliyun", + "models": [ + { + "id": "aliyun/qwen3-next-80b-a3b-thinking", + "name": "qwen3-next-80b-a3b-thinking", + "developer": "aliyun", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.0704, + "livecodebenchpro/Easy Problems": 0.6901 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/allenai.json b/data/developers/allenai.json new file mode 100644 index 0000000000000000000000000000000000000000..d86875077c7b6298c92ac8f5bdb15a0edeaef46e --- /dev/null +++ b/data/developers/allenai.json @@ -0,0 +1,2537 @@ +{ + "developer": "allenai", + "models": [ + { + "id": "allenai/Llama-3.1-70B-Instruct-RM-RB2", + "name": "allenai/Llama-3.1-70B-Instruct-RM-RB2", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9021, + "reward-bench/Factuality": 0.8126, + "reward-bench/Precise IF": 0.4188, + "reward-bench/Math": 0.6995, + "reward-bench/Safety": 0.9095, + "reward-bench/Focus": 0.8646, + "reward-bench/Ties": 0.8835, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.8355, + "reward-bench/Reasoning": 0.8969, + "reward-bench/Prior Sets (0.5 weight)": 0.0 + } + }, + { + "id": "allenai/Llama-3.1-8B-Base-RM-RB2", + "name": "allenai/Llama-3.1-8B-Base-RM-RB2", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.649, + "reward-bench/Chat": 0.933, + "reward-bench/Chat Hard": 0.7785, + "reward-bench/Safety": 0.8267, + "reward-bench/Reasoning": 0.7886, + "reward-bench/Prior Sets (0.5 weight)": 0.0, + "reward-bench/Factuality": 0.72, + "reward-bench/Precise IF": 0.3625, + "reward-bench/Math": 0.612, + "reward-bench/Focus": 0.8323, + "reward-bench/Ties": 0.5406 + } + }, + { + "id": "allenai/Llama-3.1-8B-Instruct-RM-RB2", + "name": "allenai/Llama-3.1-8B-Instruct-RM-RB2", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8885, + "reward-bench/Factuality": 0.7432, + "reward-bench/Precise IF": 0.4437, + "reward-bench/Math": 0.6175, + "reward-bench/Safety": 0.8932, + "reward-bench/Focus": 0.9071, + "reward-bench/Ties": 0.7638, + "reward-bench/Chat": 0.9581, + "reward-bench/Chat Hard": 0.8158, + "reward-bench/Reasoning": 0.887, + "reward-bench/Prior Sets (0.5 weight)": 0.0 + } + }, + { + "id": "allenai/Llama-3.1-Tulu-3-70B", + "name": "Llama-3.1-Tulu-3-70B", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8291, + "hfopenllm_v2/BBH": 0.6164, + "hfopenllm_v2/MATH Level 5": 0.4502, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.4948, + "hfopenllm_v2/MMLU-PRO": 0.4645 + } + }, + { + "id": "allenai/Llama-3.1-Tulu-3-70B-DPO", + "name": "Llama-3.1-Tulu-3-70B-DPO", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8282, + "hfopenllm_v2/BBH": 0.6146, + "hfopenllm_v2/MATH Level 5": 0.4494, + "hfopenllm_v2/GPQA": 0.3758, + "hfopenllm_v2/MUSR": 0.4923, + "hfopenllm_v2/MMLU-PRO": 0.4633 + } + }, + { + "id": "allenai/Llama-3.1-Tulu-3-70B-SFT", + "name": "Llama-3.1-Tulu-3-70B-SFT", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8051, + "hfopenllm_v2/BBH": 0.5951, + "hfopenllm_v2/MATH Level 5": 0.3316, + "hfopenllm_v2/GPQA": 0.3448, + "hfopenllm_v2/MUSR": 0.5026, + "hfopenllm_v2/MMLU-PRO": 0.4624 + } + }, + { + "id": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", + "name": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8892, + "reward-bench/Factuality": 0.8084, + "reward-bench/Precise IF": 0.3688, + "reward-bench/Math": 0.6776, + "reward-bench/Safety": 0.9027, + "reward-bench/Focus": 0.7778, + "reward-bench/Ties": 0.8308, + "reward-bench/Chat": 0.9693, + "reward-bench/Chat Hard": 0.8268, + "reward-bench/Reasoning": 0.8583, + "reward-bench/Prior Sets (0.5 weight)": 0.0 + } + }, + { + "id": "allenai/Llama-3.1-Tulu-3-8B", + "name": "Llama-3.1-Tulu-3-8B", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8255, + "hfopenllm_v2/BBH": 0.4061, + "hfopenllm_v2/MATH Level 5": 0.2115, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4175, + "hfopenllm_v2/MMLU-PRO": 0.2821 + } + }, + { + "id": "allenai/Llama-3.1-Tulu-3-8B-DPO", + "name": "Llama-3.1-Tulu-3-8B-DPO", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8029, + "hfopenllm_v2/BBH": 0.4079, + "hfopenllm_v2/MATH Level 5": 0.2364, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4161, + "hfopenllm_v2/MMLU-PRO": 0.2898 + } + }, + { + "id": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", + "name": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.687, + "reward-bench/Chat": 0.9553, + "reward-bench/Chat Hard": 0.761, + "reward-bench/Safety": 0.86, + "reward-bench/Reasoning": 0.7898, + "reward-bench/Prior Sets (0.5 weight)": 0.0, + "reward-bench/Factuality": 0.7516, + "reward-bench/Precise IF": 0.3875, + "reward-bench/Math": 0.6284, + "reward-bench/Focus": 0.8545, + "reward-bench/Ties": 0.6397 + } + }, + { + "id": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", + "name": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6871, + "reward-bench/Chat": 0.9469, + "reward-bench/Chat Hard": 0.7588, + "reward-bench/Safety": 0.8644, + "reward-bench/Reasoning": 0.7715, + "reward-bench/Prior Sets (0.5 weight)": 0.0, + "reward-bench/Factuality": 0.7642, + "reward-bench/Precise IF": 0.4, + "reward-bench/Math": 0.6175, + "reward-bench/Focus": 0.8485, + "reward-bench/Ties": 0.6281 + } + }, + { + "id": "allenai/Llama-3.1-Tulu-3-8B-RM", + "name": "Llama-3.1-Tulu-3-8B-RM", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.167, + "hfopenllm_v2/BBH": 0.295, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3764, + "hfopenllm_v2/MMLU-PRO": 0.1082, + "reward-bench/Score": 0.59, + "reward-bench/Factuality": 0.7453, + "reward-bench/Precise IF": 0.3469, + "reward-bench/Math": 0.6448, + "reward-bench/Safety": 0.7422, + "reward-bench/Focus": 0.5364, + "reward-bench/Ties": 0.5243 + } + }, + { + "id": "allenai/Llama-3.1-Tulu-3-8B-SFT", + "name": "Llama-3.1-Tulu-3-8B-SFT", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7403, + "hfopenllm_v2/BBH": 0.3872, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4268, + "hfopenllm_v2/MMLU-PRO": 0.2812 + } + }, + { + "id": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", + "name": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8551, + "reward-bench/Factuality": 0.7326, + "reward-bench/Precise IF": 0.3875, + "reward-bench/Math": 0.5792, + "reward-bench/Safety": 0.8784, + "reward-bench/Focus": 0.8889, + "reward-bench/Ties": 0.6063, + "reward-bench/Chat": 0.9497, + "reward-bench/Chat Hard": 0.7917, + "reward-bench/Reasoning": 0.8005, + "reward-bench/Prior Sets (0.5 weight)": 0.0 + } + }, + { + "id": "allenai/OLMo-1.7-7B-hf", + "name": "OLMo-1.7-7B-hf", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1569, + "hfopenllm_v2/BBH": 0.3014, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.255, + "hfopenllm_v2/MUSR": 0.3475, + "hfopenllm_v2/MMLU-PRO": 0.1124 + } + }, + { + "id": "allenai/OLMo-1B-hf", + "name": "OLMo-1B-hf", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2182, + "hfopenllm_v2/BBH": 0.3052, + "hfopenllm_v2/MATH Level 5": 0.0174, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.4098, + "hfopenllm_v2/MMLU-PRO": 0.1174 + } + }, + { + "id": "allenai/OLMo-2-1124-7B-Instruct", + "name": "OLMo-2-1124-7B-Instruct", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7244, + "hfopenllm_v2/BBH": 0.4022, + "hfopenllm_v2/MATH Level 5": 0.1488, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.3508, + "hfopenllm_v2/MMLU-PRO": 0.2672 + } + }, + { + "id": "allenai/OLMo-7B-Instruct", + "name": "allenai/OLMo-7B-Instruct", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6727, + "reward-bench/Chat": 0.8966, + "reward-bench/Chat Hard": 0.5066, + "reward-bench/Safety": 0.6486, + "reward-bench/Reasoning": 0.7168, + "reward-bench/Prior Sets (0.5 weight)": 0.5173 + } + }, + { + "id": "allenai/OLMo-7B-Instruct-hf", + "name": "OLMo-7B-Instruct-hf", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3473, + "hfopenllm_v2/BBH": 0.3706, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3765, + "hfopenllm_v2/MMLU-PRO": 0.1785 + } + }, + { + "id": "allenai/OLMo-7B-hf", + "name": "OLMo-7B-hf", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2719, + "hfopenllm_v2/BBH": 0.3279, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3487, + "hfopenllm_v2/MMLU-PRO": 0.1173 + } + }, + { + "id": "allenai/OLMoE-1B-7B-0125-Instruct", + "name": "OLMoE-1B-7B-0125-Instruct", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6757, + "hfopenllm_v2/BBH": 0.3825, + "hfopenllm_v2/MATH Level 5": 0.0899, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3636, + "hfopenllm_v2/MMLU-PRO": 0.1915 + } + }, + { + "id": "allenai/OLMoE-1B-7B-0924", + "name": "OLMoE-1B-7B-0924", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2185, + "hfopenllm_v2/BBH": 0.3393, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2475, + "hfopenllm_v2/MUSR": 0.3488, + "hfopenllm_v2/MMLU-PRO": 0.174 + } + }, + { + "id": "allenai/OLMoE-1B-7B-0924-Instruct", + "name": "OLMoE-1B-7B-0924-Instruct", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4667, + "hfopenllm_v2/BBH": 0.3902, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3848, + "hfopenllm_v2/MMLU-PRO": 0.1876 + } + }, + { + "id": "allenai/llama-3-tulu-2-70b-uf-mean-rm", + "name": "allenai/llama-3-tulu-2-70b-uf-mean-rm", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7019, + "reward-bench/Chat": 0.8631, + "reward-bench/Chat Hard": 0.5614, + "reward-bench/Safety": 0.6095, + "reward-bench/Reasoning": 0.8268, + "reward-bench/Prior Sets (0.5 weight)": 0.5957 + } + }, + { + "id": "allenai/llama-3-tulu-2-8b-uf-mean-rm", + "name": "allenai/llama-3-tulu-2-8b-uf-mean-rm", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7342, + "reward-bench/Chat": 0.9525, + "reward-bench/Chat Hard": 0.5921, + "reward-bench/Safety": 0.6162, + "reward-bench/Reasoning": 0.8212, + "reward-bench/Prior Sets (0.5 weight)": 0.6434 + } + }, + { + "id": "allenai/llama-3-tulu-2-dpo-70b", + "name": "allenai/llama-3-tulu-2-dpo-70b", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7496, + "reward-bench/Chat": 0.9637, + "reward-bench/Chat Hard": 0.5746, + "reward-bench/Safety": 0.7486, + "reward-bench/Reasoning": 0.802, + "reward-bench/Prior Sets (0.5 weight)": 0.5687 + } + }, + { + "id": "allenai/llama-3-tulu-2-dpo-8b", + "name": "allenai/llama-3-tulu-2-dpo-8b", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7275, + "reward-bench/Chat": 0.9525, + "reward-bench/Chat Hard": 0.5351, + "reward-bench/Safety": 0.6649, + "reward-bench/Reasoning": 0.8663, + "reward-bench/Prior Sets (0.5 weight)": 0.5097 + } + }, + { + "id": "allenai/olmo-1.7-7b", + "name": "OLMo 1.7 7B", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_mmlu/MMLU All Subjects": 0.538, + "helm_mmlu/Abstract Algebra": 0.33, + "helm_mmlu/Anatomy": 0.496, + "helm_mmlu/College Physics": 0.333, + "helm_mmlu/Computer Security": 0.65, + "helm_mmlu/Econometrics": 0.404, + "helm_mmlu/Global Facts": 0.34, + "helm_mmlu/Jurisprudence": 0.565, + "helm_mmlu/Philosophy": 0.592, + "helm_mmlu/Professional Psychology": 0.526, + "helm_mmlu/Us Foreign Policy": 0.76, + "helm_mmlu/Astronomy": 0.526, + "helm_mmlu/Business Ethics": 0.59, + "helm_mmlu/Clinical Knowledge": 0.57, + "helm_mmlu/Conceptual Physics": 0.434, + "helm_mmlu/Electrical Engineering": 0.517, + "helm_mmlu/Elementary Mathematics": 0.307, + "helm_mmlu/Formal Logic": 0.325, + "helm_mmlu/High School World History": 0.713, + "helm_mmlu/Human Sexuality": 0.595, + "helm_mmlu/International Law": 0.612, + "helm_mmlu/Logical Fallacies": 0.607, + "helm_mmlu/Machine Learning": 0.375, + "helm_mmlu/Management": 0.689, + "helm_mmlu/Marketing": 0.769, + "helm_mmlu/Medical Genetics": 0.56, + "helm_mmlu/Miscellaneous": 0.734, + "helm_mmlu/Moral Scenarios": 0.335, + "helm_mmlu/Nutrition": 0.608, + "helm_mmlu/Prehistory": 0.593, + "helm_mmlu/Public Relations": 0.6, + "helm_mmlu/Security Studies": 0.522, + "helm_mmlu/Sociology": 0.751, + "helm_mmlu/Virology": 0.452, + "helm_mmlu/World Religions": 0.731, + "helm_mmlu/Mean win rate": 0.196 + } + }, + { + "id": "allenai/olmo-2-0325-32b-instruct", + "name": "OLMo 2 32B Instruct March 2025", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.475, + "helm_capabilities/MMLU-Pro": 0.414, + "helm_capabilities/GPQA": 0.287, + "helm_capabilities/IFEval": 0.78, + "helm_capabilities/WildBench": 0.734, + "helm_capabilities/Omni-MATH": 0.161 + } + }, + { + "id": "allenai/olmo-2-1124-13b-instruct", + "name": "OLMo 2 13B Instruct November 2024", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.44, + "helm_capabilities/MMLU-Pro": 0.31, + "helm_capabilities/GPQA": 0.316, + "helm_capabilities/IFEval": 0.73, + "helm_capabilities/WildBench": 0.689, + "helm_capabilities/Omni-MATH": 0.156 + } + }, + { + "id": "allenai/olmo-2-1124-7b-instruct", + "name": "OLMo 2 7B Instruct November 2024", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.405, + "helm_capabilities/MMLU-Pro": 0.292, + "helm_capabilities/GPQA": 0.296, + "helm_capabilities/IFEval": 0.693, + "helm_capabilities/WildBench": 0.628, + "helm_capabilities/Omni-MATH": 0.116 + } + }, + { + "id": "allenai/olmo-7b", + "name": "OLMo 7B", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.052, + "helm_lite/NarrativeQA": 0.597, + "helm_lite/NaturalQuestions (closed-book)": 0.259, + "helm_lite/OpenbookQA": 0.222, + "helm_lite/MMLU": 0.305, + "helm_lite/MATH": 0.029, + "helm_lite/GSM8K": 0.044, + "helm_lite/LegalBench": 0.341, + "helm_lite/MedQA": 0.229, + "helm_lite/WMT 2014": 0.097, + "helm_mmlu/MMLU All Subjects": 0.295, + "helm_mmlu/Abstract Algebra": 0.26, + "helm_mmlu/Anatomy": 0.222, + "helm_mmlu/College Physics": 0.294, + "helm_mmlu/Computer Security": 0.3, + "helm_mmlu/Econometrics": 0.325, + "helm_mmlu/Global Facts": 0.32, + "helm_mmlu/Jurisprudence": 0.25, + "helm_mmlu/Philosophy": 0.325, + "helm_mmlu/Professional Psychology": 0.232, + "helm_mmlu/Us Foreign Policy": 0.26, + "helm_mmlu/Astronomy": 0.342, + "helm_mmlu/Business Ethics": 0.24, + "helm_mmlu/Clinical Knowledge": 0.26, + "helm_mmlu/Conceptual Physics": 0.319, + "helm_mmlu/Electrical Engineering": 0.29, + "helm_mmlu/Elementary Mathematics": 0.254, + "helm_mmlu/Formal Logic": 0.278, + "helm_mmlu/High School World History": 0.253, + "helm_mmlu/Human Sexuality": 0.267, + "helm_mmlu/International Law": 0.306, + "helm_mmlu/Logical Fallacies": 0.264, + "helm_mmlu/Machine Learning": 0.286, + "helm_mmlu/Management": 0.272, + "helm_mmlu/Marketing": 0.269, + "helm_mmlu/Medical Genetics": 0.28, + "helm_mmlu/Miscellaneous": 0.292, + "helm_mmlu/Moral Scenarios": 0.265, + "helm_mmlu/Nutrition": 0.34, + "helm_mmlu/Prehistory": 0.318, + "helm_mmlu/Public Relations": 0.345, + "helm_mmlu/Security Studies": 0.408, + "helm_mmlu/Sociology": 0.383, + "helm_mmlu/Virology": 0.416, + "helm_mmlu/World Religions": 0.234, + "helm_mmlu/Mean win rate": 0.68 + } + }, + { + "id": "allenai/olmoe-1b-7b-0125-instruct", + "name": "OLMoE 1B-7B Instruct January 2025", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.332, + "helm_capabilities/MMLU-Pro": 0.169, + "helm_capabilities/GPQA": 0.22, + "helm_capabilities/IFEval": 0.628, + "helm_capabilities/WildBench": 0.551, + "helm_capabilities/Omni-MATH": 0.093 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1739590997", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739590997", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6004, + "reward-bench/Factuality": 0.7032, + "reward-bench/Precise IF": 0.375, + "reward-bench/Math": 0.623, + "reward-bench/Safety": 0.7867, + "reward-bench/Focus": 0.598, + "reward-bench/Ties": 0.5165 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1739871066", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739871066", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6012, + "reward-bench/Factuality": 0.6989, + "reward-bench/Precise IF": 0.425, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.7978, + "reward-bench/Focus": 0.604, + "reward-bench/Ties": 0.4527 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1739925892", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739925892", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6345, + "reward-bench/Factuality": 0.7432, + "reward-bench/Precise IF": 0.3563, + "reward-bench/Math": 0.623, + "reward-bench/Safety": 0.8111, + "reward-bench/Focus": 0.7131, + "reward-bench/Ties": 0.5606 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1739943850", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739943850", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.4978, + "reward-bench/Factuality": 0.5726, + "reward-bench/Precise IF": 0.3125, + "reward-bench/Math": 0.5191, + "reward-bench/Safety": 0.6489, + "reward-bench/Focus": 0.6222, + "reward-bench/Ties": 0.3114 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1739943881", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739943881", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5998, + "reward-bench/Factuality": 0.7032, + "reward-bench/Precise IF": 0.3187, + "reward-bench/Math": 0.5792, + "reward-bench/Safety": 0.8222, + "reward-bench/Focus": 0.6727, + "reward-bench/Ties": 0.5025 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1739943972", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739943972", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5289, + "reward-bench/Factuality": 0.6168, + "reward-bench/Precise IF": 0.375, + "reward-bench/Math": 0.5738, + "reward-bench/Safety": 0.6844, + "reward-bench/Focus": 0.5657, + "reward-bench/Ties": 0.3577 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1739957701", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739957701", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6194, + "reward-bench/Factuality": 0.6779, + "reward-bench/Precise IF": 0.3563, + "reward-bench/Math": 0.6011, + "reward-bench/Safety": 0.8022, + "reward-bench/Focus": 0.697, + "reward-bench/Ties": 0.5822 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1739971507", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739971507", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5717, + "reward-bench/Factuality": 0.68, + "reward-bench/Precise IF": 0.375, + "reward-bench/Math": 0.6066, + "reward-bench/Safety": 0.7667, + "reward-bench/Focus": 0.5475, + "reward-bench/Ties": 0.4545 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1739971529", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739971529", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5564, + "reward-bench/Factuality": 0.6568, + "reward-bench/Precise IF": 0.3563, + "reward-bench/Math": 0.5956, + "reward-bench/Safety": 0.7533, + "reward-bench/Focus": 0.5737, + "reward-bench/Ties": 0.4027 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1739998765", + "name": "allenai/open_instruct_dev-reward_modeling__1__1739998765", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6008, + "reward-bench/Factuality": 0.7095, + "reward-bench/Precise IF": 0.4125, + "reward-bench/Math": 0.6066, + "reward-bench/Safety": 0.8022, + "reward-bench/Focus": 0.5859, + "reward-bench/Ties": 0.4883 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1740005072", + "name": "allenai/open_instruct_dev-reward_modeling__1__1740005072", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6097, + "reward-bench/Factuality": 0.7137, + "reward-bench/Precise IF": 0.3937, + "reward-bench/Math": 0.6339, + "reward-bench/Safety": 0.7778, + "reward-bench/Focus": 0.6343, + "reward-bench/Ties": 0.5047 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1740129284", + "name": "allenai/open_instruct_dev-reward_modeling__1__1740129284", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6129, + "reward-bench/Factuality": 0.7116, + "reward-bench/Precise IF": 0.4437, + "reward-bench/Math": 0.6448, + "reward-bench/Safety": 0.8022, + "reward-bench/Focus": 0.6101, + "reward-bench/Ties": 0.4652 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1741286813", + "name": "allenai/open_instruct_dev-reward_modeling__1__1741286813", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6557, + "reward-bench/Factuality": 0.6295, + "reward-bench/Precise IF": 0.4188, + "reward-bench/Math": 0.612, + "reward-bench/Safety": 0.9111, + "reward-bench/Focus": 0.8263, + "reward-bench/Ties": 0.5365 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1741287363", + "name": "allenai/open_instruct_dev-reward_modeling__1__1741287363", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6672, + "reward-bench/Factuality": 0.6295, + "reward-bench/Precise IF": 0.375, + "reward-bench/Math": 0.6066, + "reward-bench/Safety": 0.88, + "reward-bench/Focus": 0.9374, + "reward-bench/Ties": 0.5748 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1741292911", + "name": "allenai/open_instruct_dev-reward_modeling__1__1741292911", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6607, + "reward-bench/Factuality": 0.6589, + "reward-bench/Precise IF": 0.4, + "reward-bench/Math": 0.6066, + "reward-bench/Safety": 0.9089, + "reward-bench/Focus": 0.8869, + "reward-bench/Ties": 0.5028 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1742338142", + "name": "allenai/open_instruct_dev-reward_modeling__1__1742338142", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6344, + "reward-bench/Factuality": 0.7326, + "reward-bench/Precise IF": 0.3812, + "reward-bench/Math": 0.7049, + "reward-bench/Safety": 0.88, + "reward-bench/Focus": 0.6323, + "reward-bench/Ties": 0.475 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1742519610", + "name": "allenai/open_instruct_dev-reward_modeling__1__1742519610", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6361, + "reward-bench/Factuality": 0.7074, + "reward-bench/Precise IF": 0.3812, + "reward-bench/Math": 0.6721, + "reward-bench/Safety": 0.82, + "reward-bench/Focus": 0.6444, + "reward-bench/Ties": 0.5915 + } + }, + { + "id": "allenai/open_instruct_dev-reward_modeling__1__1742519628", + "name": "allenai/open_instruct_dev-reward_modeling__1__1742519628", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5609, + "reward-bench/Factuality": 0.5179, + "reward-bench/Precise IF": 0.3563, + "reward-bench/Math": 0.623, + "reward-bench/Safety": 0.8356, + "reward-bench/Focus": 0.5071, + "reward-bench/Ties": 0.5254 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455", + "name": "allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.0576, + "reward-bench/Factuality": 0.04, + "reward-bench/Precise IF": 0.1313, + "reward-bench/Math": 0.0546, + "reward-bench/Safety": 0.0489, + "reward-bench/Focus": 0.0808, + "reward-bench/Ties": -0.01 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511", + "name": "allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5499, + "reward-bench/Factuality": 0.6821, + "reward-bench/Precise IF": 0.3937, + "reward-bench/Math": 0.5956, + "reward-bench/Safety": 0.7356, + "reward-bench/Focus": 0.5212, + "reward-bench/Ties": 0.3711 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406", + "name": "allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5054, + "reward-bench/Factuality": 0.6358, + "reward-bench/Precise IF": 0.3688, + "reward-bench/Math": 0.6066, + "reward-bench/Safety": 0.6867, + "reward-bench/Focus": 0.4424, + "reward-bench/Ties": 0.2922 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136", + "name": "allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.478, + "reward-bench/Factuality": 0.6442, + "reward-bench/Precise IF": 0.3563, + "reward-bench/Math": 0.612, + "reward-bench/Safety": 0.6356, + "reward-bench/Focus": 0.2707, + "reward-bench/Ties": 0.3496 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398", + "name": "allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.219, + "reward-bench/Factuality": 0.2484, + "reward-bench/Precise IF": 0.2812, + "reward-bench/Math": 0.2623, + "reward-bench/Safety": 0.3422, + "reward-bench/Focus": 0.1717, + "reward-bench/Ties": 0.008 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535", + "name": "allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5625, + "reward-bench/Factuality": 0.6821, + "reward-bench/Precise IF": 0.4062, + "reward-bench/Math": 0.6011, + "reward-bench/Safety": 0.7511, + "reward-bench/Focus": 0.5313, + "reward-bench/Ties": 0.403 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054", + "name": "allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5759, + "reward-bench/Factuality": 0.7074, + "reward-bench/Precise IF": 0.375, + "reward-bench/Math": 0.623, + "reward-bench/Safety": 0.7578, + "reward-bench/Focus": 0.5333, + "reward-bench/Ties": 0.459 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271", + "name": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6057, + "reward-bench/Factuality": 0.5053, + "reward-bench/Precise IF": 0.375, + "reward-bench/Math": 0.5902, + "reward-bench/Safety": 0.8422, + "reward-bench/Focus": 0.7798, + "reward-bench/Ties": 0.5419 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181", + "name": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6535, + "reward-bench/Factuality": 0.7137, + "reward-bench/Precise IF": 0.3812, + "reward-bench/Math": 0.6175, + "reward-bench/Safety": 0.8244, + "reward-bench/Focus": 0.7737, + "reward-bench/Ties": 0.6101 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221", + "name": "allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5799, + "reward-bench/Factuality": 0.7116, + "reward-bench/Precise IF": 0.3812, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.76, + "reward-bench/Focus": 0.5374, + "reward-bench/Ties": 0.461 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262", + "name": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5903, + "reward-bench/Factuality": 0.4863, + "reward-bench/Precise IF": 0.3625, + "reward-bench/Math": 0.5738, + "reward-bench/Safety": 0.8489, + "reward-bench/Focus": 0.7778, + "reward-bench/Ties": 0.4926 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523", + "name": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6483, + "reward-bench/Factuality": 0.7074, + "reward-bench/Precise IF": 0.3625, + "reward-bench/Math": 0.6175, + "reward-bench/Safety": 0.8222, + "reward-bench/Focus": 0.7758, + "reward-bench/Ties": 0.6044 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750", + "name": "allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5157, + "reward-bench/Factuality": 0.6084, + "reward-bench/Precise IF": 0.3688, + "reward-bench/Math": 0.6066, + "reward-bench/Safety": 0.7089, + "reward-bench/Focus": 0.4222, + "reward-bench/Ties": 0.3791 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427", + "name": "allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6009, + "reward-bench/Factuality": 0.7263, + "reward-bench/Precise IF": 0.375, + "reward-bench/Math": 0.5902, + "reward-bench/Safety": 0.7933, + "reward-bench/Focus": 0.7273, + "reward-bench/Ties": 0.3931 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446", + "name": "allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5716, + "reward-bench/Factuality": 0.6779, + "reward-bench/Precise IF": 0.3937, + "reward-bench/Math": 0.5464, + "reward-bench/Safety": 0.7533, + "reward-bench/Focus": 0.7051, + "reward-bench/Ties": 0.3534 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094", + "name": "allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5151, + "reward-bench/Factuality": 0.6484, + "reward-bench/Precise IF": 0.3312, + "reward-bench/Math": 0.5574, + "reward-bench/Safety": 0.7289, + "reward-bench/Focus": 0.4889, + "reward-bench/Ties": 0.3357 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636", + "name": "allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6119, + "reward-bench/Factuality": 0.72, + "reward-bench/Precise IF": 0.4062, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.8067, + "reward-bench/Focus": 0.6889, + "reward-bench/Ties": 0.421 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325", + "name": "allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6008, + "reward-bench/Factuality": 0.7179, + "reward-bench/Precise IF": 0.35, + "reward-bench/Math": 0.5956, + "reward-bench/Safety": 0.8, + "reward-bench/Focus": 0.6707, + "reward-bench/Ties": 0.4707 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238", + "name": "allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5965, + "reward-bench/Factuality": 0.7095, + "reward-bench/Precise IF": 0.3438, + "reward-bench/Math": 0.612, + "reward-bench/Safety": 0.8044, + "reward-bench/Focus": 0.6566, + "reward-bench/Ties": 0.453 + } + }, + { + "id": "allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906", + "name": "allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5574, + "reward-bench/Factuality": 0.6526, + "reward-bench/Precise IF": 0.3937, + "reward-bench/Math": 0.6011, + "reward-bench/Safety": 0.7711, + "reward-bench/Focus": 0.5051, + "reward-bench/Ties": 0.4208 + } + }, + { + "id": "allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529", + "name": "allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.0719, + "reward-bench/Factuality": 0.0421, + "reward-bench/Precise IF": 0.2062, + "reward-bench/Math": 0.0601, + "reward-bench/Safety": 0.0378, + "reward-bench/Focus": 0.0949, + "reward-bench/Ties": -0.01 + } + }, + { + "id": "allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305", + "name": "allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.553, + "reward-bench/Factuality": 0.6674, + "reward-bench/Precise IF": 0.3563, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.6733, + "reward-bench/Focus": 0.5697, + "reward-bench/Ties": 0.4227 + } + }, + { + "id": "allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778", + "name": "allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.4955, + "reward-bench/Factuality": 0.6189, + "reward-bench/Precise IF": 0.325, + "reward-bench/Math": 0.5792, + "reward-bench/Safety": 0.6378, + "reward-bench/Focus": 0.5657, + "reward-bench/Ties": 0.2466 + } + }, + { + "id": "allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459", + "name": "allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.4198, + "reward-bench/Factuality": 0.5747, + "reward-bench/Precise IF": 0.3375, + "reward-bench/Math": 0.5464, + "reward-bench/Safety": 0.4933, + "reward-bench/Focus": 0.3596, + "reward-bench/Ties": 0.2073 + } + }, + { + "id": "allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747", + "name": "allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5465, + "reward-bench/Factuality": 0.6821, + "reward-bench/Precise IF": 0.375, + "reward-bench/Math": 0.612, + "reward-bench/Safety": 0.7333, + "reward-bench/Focus": 0.5051, + "reward-bench/Ties": 0.3713 + } + }, + { + "id": "allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935", + "name": "allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5197, + "reward-bench/Factuality": 0.6126, + "reward-bench/Precise IF": 0.3375, + "reward-bench/Math": 0.5847, + "reward-bench/Safety": 0.7333, + "reward-bench/Focus": 0.4646, + "reward-bench/Ties": 0.3855 + } + }, + { + "id": "allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360", + "name": "allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.4555, + "reward-bench/Factuality": 0.5495, + "reward-bench/Precise IF": 0.3063, + "reward-bench/Math": 0.4262, + "reward-bench/Safety": 0.5711, + "reward-bench/Focus": 0.6101, + "reward-bench/Ties": 0.2696 + } + }, + { + "id": "allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366", + "name": "allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.4422, + "reward-bench/Factuality": 0.5053, + "reward-bench/Precise IF": 0.3375, + "reward-bench/Math": 0.4044, + "reward-bench/Safety": 0.5422, + "reward-bench/Focus": 0.6646, + "reward-bench/Ties": 0.1991 + } + }, + { + "id": "allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352", + "name": "allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.341, + "reward-bench/Factuality": 0.4674, + "reward-bench/Precise IF": 0.2875, + "reward-bench/Math": 0.3333, + "reward-bench/Safety": 0.3711, + "reward-bench/Focus": 0.3919, + "reward-bench/Ties": 0.195 + } + }, + { + "id": "allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634", + "name": "allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.4698, + "reward-bench/Factuality": 0.5853, + "reward-bench/Precise IF": 0.2562, + "reward-bench/Math": 0.5027, + "reward-bench/Safety": 0.6489, + "reward-bench/Focus": 0.5697, + "reward-bench/Ties": 0.2562 + } + }, + { + "id": "allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988", + "name": "allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.4791, + "reward-bench/Factuality": 0.6421, + "reward-bench/Precise IF": 0.3125, + "reward-bench/Math": 0.541, + "reward-bench/Safety": 0.6911, + "reward-bench/Focus": 0.4182, + "reward-bench/Ties": 0.27 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103", + "name": "allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.0607, + "reward-bench/Factuality": 0.0274, + "reward-bench/Precise IF": 0.1625, + "reward-bench/Math": 0.0656, + "reward-bench/Safety": 0.04, + "reward-bench/Focus": 0.0788, + "reward-bench/Ties": -0.01 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835", + "name": "allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6089, + "reward-bench/Factuality": 0.7284, + "reward-bench/Precise IF": 0.4375, + "reward-bench/Math": 0.612, + "reward-bench/Safety": 0.7622, + "reward-bench/Focus": 0.6444, + "reward-bench/Ties": 0.4686 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221", + "name": "allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6032, + "reward-bench/Factuality": 0.7158, + "reward-bench/Precise IF": 0.4062, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.7778, + "reward-bench/Focus": 0.5859, + "reward-bench/Ties": 0.5051 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826", + "name": "allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5831, + "reward-bench/Factuality": 0.6947, + "reward-bench/Precise IF": 0.4188, + "reward-bench/Math": 0.623, + "reward-bench/Safety": 0.74, + "reward-bench/Focus": 0.5758, + "reward-bench/Ties": 0.4465 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363", + "name": "allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5268, + "reward-bench/Factuality": 0.68, + "reward-bench/Precise IF": 0.3688, + "reward-bench/Math": 0.5792, + "reward-bench/Safety": 0.7178, + "reward-bench/Focus": 0.4343, + "reward-bench/Ties": 0.3809 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498", + "name": "allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6093, + "reward-bench/Factuality": 0.7326, + "reward-bench/Precise IF": 0.4313, + "reward-bench/Math": 0.6339, + "reward-bench/Safety": 0.7578, + "reward-bench/Focus": 0.5859, + "reward-bench/Ties": 0.5143 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1__2__1743897475", + "name": "allenai/open_instruct_dev-rm_3e-6_1__2__1743897475", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6122, + "reward-bench/Factuality": 0.7368, + "reward-bench/Precise IF": 0.4, + "reward-bench/Math": 0.623, + "reward-bench/Safety": 0.8044, + "reward-bench/Focus": 0.602, + "reward-bench/Ties": 0.5071 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1__3__1744311421", + "name": "allenai/open_instruct_dev-rm_3e-6_1__3__1744311421", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5995, + "reward-bench/Factuality": 0.7179, + "reward-bench/Precise IF": 0.3375, + "reward-bench/Math": 0.6066, + "reward-bench/Safety": 0.8, + "reward-bench/Focus": 0.6323, + "reward-bench/Ties": 0.503 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903", + "name": "allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6154, + "reward-bench/Factuality": 0.7326, + "reward-bench/Precise IF": 0.4375, + "reward-bench/Math": 0.6339, + "reward-bench/Safety": 0.7778, + "reward-bench/Focus": 0.6061, + "reward-bench/Ties": 0.5043 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368", + "name": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6604, + "reward-bench/Factuality": 0.6316, + "reward-bench/Precise IF": 0.3937, + "reward-bench/Math": 0.5792, + "reward-bench/Safety": 0.9044, + "reward-bench/Focus": 0.8929, + "reward-bench/Ties": 0.5604 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182", + "name": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6783, + "reward-bench/Factuality": 0.7705, + "reward-bench/Precise IF": 0.4, + "reward-bench/Math": 0.6066, + "reward-bench/Safety": 0.84, + "reward-bench/Focus": 0.8101, + "reward-bench/Ties": 0.6427 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012", + "name": "allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5911, + "reward-bench/Factuality": 0.7347, + "reward-bench/Precise IF": 0.4, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.74, + "reward-bench/Focus": 0.604, + "reward-bench/Ties": 0.4392 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765", + "name": "allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5926, + "reward-bench/Factuality": 0.7263, + "reward-bench/Precise IF": 0.3563, + "reward-bench/Math": 0.623, + "reward-bench/Safety": 0.7889, + "reward-bench/Focus": 0.5879, + "reward-bench/Ties": 0.4733 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527", + "name": "allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6126, + "reward-bench/Factuality": 0.7411, + "reward-bench/Precise IF": 0.425, + "reward-bench/Math": 0.623, + "reward-bench/Safety": 0.7822, + "reward-bench/Focus": 0.5939, + "reward-bench/Ties": 0.5104 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236", + "name": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6525, + "reward-bench/Factuality": 0.6021, + "reward-bench/Precise IF": 0.3875, + "reward-bench/Math": 0.5792, + "reward-bench/Safety": 0.8933, + "reward-bench/Focus": 0.8626, + "reward-bench/Ties": 0.59 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530", + "name": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6849, + "reward-bench/Factuality": 0.7453, + "reward-bench/Precise IF": 0.3812, + "reward-bench/Math": 0.612, + "reward-bench/Safety": 0.8422, + "reward-bench/Focus": 0.8404, + "reward-bench/Ties": 0.6885 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417", + "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.586, + "reward-bench/Factuality": 0.6632, + "reward-bench/Precise IF": 0.425, + "reward-bench/Math": 0.6557, + "reward-bench/Safety": 0.7778, + "reward-bench/Focus": 0.5172, + "reward-bench/Ties": 0.477 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486", + "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6773, + "reward-bench/Factuality": 0.7432, + "reward-bench/Precise IF": 0.4, + "reward-bench/Math": 0.612, + "reward-bench/Safety": 0.8422, + "reward-bench/Focus": 0.804, + "reward-bench/Ties": 0.6626 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745", + "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6793, + "reward-bench/Factuality": 0.7558, + "reward-bench/Precise IF": 0.4062, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.8311, + "reward-bench/Focus": 0.8061, + "reward-bench/Ties": 0.6485 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661", + "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6611, + "reward-bench/Factuality": 0.72, + "reward-bench/Precise IF": 0.3563, + "reward-bench/Math": 0.6393, + "reward-bench/Safety": 0.8444, + "reward-bench/Focus": 0.7636, + "reward-bench/Ties": 0.6428 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472", + "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5778, + "reward-bench/Factuality": 0.6674, + "reward-bench/Precise IF": 0.3875, + "reward-bench/Math": 0.6011, + "reward-bench/Safety": 0.7933, + "reward-bench/Focus": 0.5172, + "reward-bench/Ties": 0.5003 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267", + "name": "allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5746, + "reward-bench/Factuality": 0.6505, + "reward-bench/Precise IF": 0.35, + "reward-bench/Math": 0.5082, + "reward-bench/Safety": 0.7844, + "reward-bench/Focus": 0.7414, + "reward-bench/Ties": 0.4128 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759", + "name": "allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6065, + "reward-bench/Factuality": 0.7116, + "reward-bench/Precise IF": 0.35, + "reward-bench/Math": 0.5792, + "reward-bench/Safety": 0.8178, + "reward-bench/Focus": 0.7152, + "reward-bench/Ties": 0.465 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905", + "name": "allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5305, + "reward-bench/Factuality": 0.5832, + "reward-bench/Precise IF": 0.3312, + "reward-bench/Math": 0.459, + "reward-bench/Safety": 0.7178, + "reward-bench/Focus": 0.7071, + "reward-bench/Ties": 0.3849 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363", + "name": "allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.4436, + "reward-bench/Factuality": 0.5411, + "reward-bench/Precise IF": 0.3312, + "reward-bench/Math": 0.3115, + "reward-bench/Safety": 0.6267, + "reward-bench/Focus": 0.5414, + "reward-bench/Ties": 0.31 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505", + "name": "allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5925, + "reward-bench/Factuality": 0.68, + "reward-bench/Precise IF": 0.3688, + "reward-bench/Math": 0.5519, + "reward-bench/Safety": 0.78, + "reward-bench/Focus": 0.7434, + "reward-bench/Ties": 0.431 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180", + "name": "allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6198, + "reward-bench/Factuality": 0.7263, + "reward-bench/Precise IF": 0.3312, + "reward-bench/Math": 0.6339, + "reward-bench/Safety": 0.8133, + "reward-bench/Focus": 0.7232, + "reward-bench/Ties": 0.4908 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187", + "name": "allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6763, + "reward-bench/Factuality": 0.7411, + "reward-bench/Precise IF": 0.375, + "reward-bench/Math": 0.612, + "reward-bench/Safety": 0.8844, + "reward-bench/Focus": 0.8545, + "reward-bench/Ties": 0.5908 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509", + "name": "allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6245, + "reward-bench/Factuality": 0.7242, + "reward-bench/Precise IF": 0.35, + "reward-bench/Math": 0.6175, + "reward-bench/Safety": 0.8178, + "reward-bench/Focus": 0.7253, + "reward-bench/Ties": 0.5124 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498", + "name": "allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6673, + "reward-bench/Factuality": 0.7326, + "reward-bench/Precise IF": 0.3438, + "reward-bench/Math": 0.6175, + "reward-bench/Safety": 0.8622, + "reward-bench/Focus": 0.8566, + "reward-bench/Ties": 0.5911 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926", + "name": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5863, + "reward-bench/Factuality": 0.6674, + "reward-bench/Precise IF": 0.3937, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.8, + "reward-bench/Focus": 0.5515, + "reward-bench/Ties": 0.4768 + } + }, + { + "id": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661", + "name": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.589, + "reward-bench/Factuality": 0.6842, + "reward-bench/Precise IF": 0.3688, + "reward-bench/Math": 0.6393, + "reward-bench/Safety": 0.7867, + "reward-bench/Focus": 0.6081, + "reward-bench/Ties": 0.447 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598", + "name": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7306, + "reward-bench/Factuality": 0.7474, + "reward-bench/Precise IF": 0.375, + "reward-bench/Math": 0.694, + "reward-bench/Safety": 0.8622, + "reward-bench/Focus": 0.8061, + "reward-bench/Ties": 0.8992 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923", + "name": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7573, + "reward-bench/Factuality": 0.8168, + "reward-bench/Precise IF": 0.4125, + "reward-bench/Math": 0.7049, + "reward-bench/Safety": 0.8733, + "reward-bench/Focus": 0.8545, + "reward-bench/Ties": 0.8814 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628", + "name": "allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6637, + "reward-bench/Factuality": 0.6947, + "reward-bench/Precise IF": 0.4062, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.8422, + "reward-bench/Focus": 0.7273, + "reward-bench/Ties": 0.6834 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999", + "name": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6665, + "reward-bench/Factuality": 0.5979, + "reward-bench/Precise IF": 0.3688, + "reward-bench/Math": 0.6339, + "reward-bench/Safety": 0.8956, + "reward-bench/Focus": 0.8606, + "reward-bench/Ties": 0.6422 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777", + "name": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7038, + "reward-bench/Factuality": 0.6947, + "reward-bench/Precise IF": 0.3937, + "reward-bench/Math": 0.6557, + "reward-bench/Safety": 0.8867, + "reward-bench/Focus": 0.8586, + "reward-bench/Ties": 0.7331 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638", + "name": "allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6754, + "reward-bench/Factuality": 0.6716, + "reward-bench/Precise IF": 0.4, + "reward-bench/Math": 0.6339, + "reward-bench/Safety": 0.8756, + "reward-bench/Focus": 0.7737, + "reward-bench/Ties": 0.6976 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938", + "name": "allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7241, + "reward-bench/Factuality": 0.7305, + "reward-bench/Precise IF": 0.4, + "reward-bench/Math": 0.6667, + "reward-bench/Safety": 0.9422, + "reward-bench/Focus": 0.9414, + "reward-bench/Ties": 0.6635 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885", + "name": "allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6716, + "reward-bench/Factuality": 0.6632, + "reward-bench/Precise IF": 0.3688, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.82, + "reward-bench/Focus": 0.8303, + "reward-bench/Ties": 0.719 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773", + "name": "allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6207, + "reward-bench/Factuality": 0.6358, + "reward-bench/Precise IF": 0.375, + "reward-bench/Math": 0.5902, + "reward-bench/Safety": 0.8267, + "reward-bench/Focus": 0.802, + "reward-bench/Ties": 0.4948 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867", + "name": "allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.719, + "reward-bench/Factuality": 0.7263, + "reward-bench/Precise IF": 0.3875, + "reward-bench/Math": 0.6393, + "reward-bench/Safety": 0.8956, + "reward-bench/Focus": 0.9273, + "reward-bench/Ties": 0.738 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6572, + "reward-bench/Factuality": 0.7305, + "reward-bench/Precise IF": 0.3688, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.8289, + "reward-bench/Focus": 0.703, + "reward-bench/Ties": 0.6837 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6938, + "reward-bench/Factuality": 0.7537, + "reward-bench/Precise IF": 0.45, + "reward-bench/Math": 0.6393, + "reward-bench/Safety": 0.8667, + "reward-bench/Focus": 0.7616, + "reward-bench/Ties": 0.6913 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6754, + "reward-bench/Factuality": 0.7242, + "reward-bench/Precise IF": 0.4062, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.8422, + "reward-bench/Focus": 0.7535, + "reward-bench/Ties": 0.6976 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7045, + "reward-bench/Factuality": 0.6253, + "reward-bench/Precise IF": 0.3812, + "reward-bench/Math": 0.6667, + "reward-bench/Safety": 0.92, + "reward-bench/Focus": 0.9232, + "reward-bench/Ties": 0.7109 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7189, + "reward-bench/Factuality": 0.7305, + "reward-bench/Precise IF": 0.3937, + "reward-bench/Math": 0.6066, + "reward-bench/Safety": 0.8978, + "reward-bench/Focus": 0.9374, + "reward-bench/Ties": 0.7475 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7172, + "reward-bench/Factuality": 0.7242, + "reward-bench/Precise IF": 0.4313, + "reward-bench/Math": 0.6175, + "reward-bench/Safety": 0.8778, + "reward-bench/Focus": 0.897, + "reward-bench/Ties": 0.7555 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6813, + "reward-bench/Factuality": 0.7137, + "reward-bench/Precise IF": 0.4437, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.8644, + "reward-bench/Focus": 0.7596, + "reward-bench/Ties": 0.6781 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713", + "name": "allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7209, + "reward-bench/Factuality": 0.7116, + "reward-bench/Precise IF": 0.3875, + "reward-bench/Math": 0.6612, + "reward-bench/Safety": 0.9067, + "reward-bench/Focus": 0.9172, + "reward-bench/Ties": 0.7414 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911", + "name": "allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7266, + "reward-bench/Factuality": 0.7347, + "reward-bench/Precise IF": 0.4313, + "reward-bench/Math": 0.6339, + "reward-bench/Safety": 0.8933, + "reward-bench/Focus": 0.897, + "reward-bench/Ties": 0.7697 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412", + "name": "allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5342, + "reward-bench/Factuality": 0.6042, + "reward-bench/Precise IF": 0.275, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.7222, + "reward-bench/Focus": 0.5818, + "reward-bench/Ties": 0.3935 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922", + "name": "allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6111, + "reward-bench/Factuality": 0.6884, + "reward-bench/Precise IF": 0.3063, + "reward-bench/Math": 0.623, + "reward-bench/Safety": 0.8289, + "reward-bench/Focus": 0.7576, + "reward-bench/Ties": 0.4628 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495", + "name": "allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5825, + "reward-bench/Factuality": 0.6379, + "reward-bench/Precise IF": 0.325, + "reward-bench/Math": 0.5355, + "reward-bench/Safety": 0.8222, + "reward-bench/Focus": 0.7051, + "reward-bench/Ties": 0.4691 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507", + "name": "allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5598, + "reward-bench/Factuality": 0.5495, + "reward-bench/Precise IF": 0.3563, + "reward-bench/Math": 0.5902, + "reward-bench/Safety": 0.76, + "reward-bench/Focus": 0.7273, + "reward-bench/Ties": 0.3754 + } + }, + { + "id": "allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507", + "name": "allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6101, + "reward-bench/Factuality": 0.6632, + "reward-bench/Precise IF": 0.35, + "reward-bench/Math": 0.6175, + "reward-bench/Safety": 0.7778, + "reward-bench/Focus": 0.7111, + "reward-bench/Ties": 0.5408 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917", + "name": "allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7185, + "reward-bench/Factuality": 0.7305, + "reward-bench/Precise IF": 0.4125, + "reward-bench/Math": 0.7158, + "reward-bench/Safety": 0.7933, + "reward-bench/Focus": 0.8545, + "reward-bench/Ties": 0.804 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961", + "name": "allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7325, + "reward-bench/Factuality": 0.7474, + "reward-bench/Precise IF": 0.4437, + "reward-bench/Math": 0.7158, + "reward-bench/Safety": 0.7978, + "reward-bench/Focus": 0.8141, + "reward-bench/Ties": 0.8763 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830", + "name": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6022, + "reward-bench/Factuality": 0.5284, + "reward-bench/Precise IF": 0.325, + "reward-bench/Math": 0.694, + "reward-bench/Safety": 0.7556, + "reward-bench/Focus": 0.7616, + "reward-bench/Ties": 0.5486 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024", + "name": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5948, + "reward-bench/Factuality": 0.5579, + "reward-bench/Precise IF": 0.2875, + "reward-bench/Math": 0.6776, + "reward-bench/Safety": 0.72, + "reward-bench/Focus": 0.7394, + "reward-bench/Ties": 0.5863 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914", + "name": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6492, + "reward-bench/Factuality": 0.6084, + "reward-bench/Precise IF": 0.35, + "reward-bench/Math": 0.6776, + "reward-bench/Safety": 0.76, + "reward-bench/Focus": 0.8, + "reward-bench/Ties": 0.699 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091", + "name": "allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6764, + "reward-bench/Factuality": 0.7074, + "reward-bench/Precise IF": 0.3, + "reward-bench/Math": 0.6885, + "reward-bench/Safety": 0.8622, + "reward-bench/Focus": 0.802, + "reward-bench/Ties": 0.6984 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829", + "name": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6408, + "reward-bench/Factuality": 0.6337, + "reward-bench/Precise IF": 0.3063, + "reward-bench/Math": 0.6831, + "reward-bench/Safety": 0.8467, + "reward-bench/Focus": 0.8222, + "reward-bench/Ties": 0.5529 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050", + "name": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6452, + "reward-bench/Factuality": 0.6063, + "reward-bench/Precise IF": 0.3187, + "reward-bench/Math": 0.7158, + "reward-bench/Safety": 0.8356, + "reward-bench/Focus": 0.8343, + "reward-bench/Ties": 0.5603 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916", + "name": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7013, + "reward-bench/Factuality": 0.7263, + "reward-bench/Precise IF": 0.3438, + "reward-bench/Math": 0.6995, + "reward-bench/Safety": 0.8222, + "reward-bench/Focus": 0.8444, + "reward-bench/Ties": 0.7714 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576", + "name": "allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6369, + "reward-bench/Factuality": 0.6905, + "reward-bench/Precise IF": 0.3187, + "reward-bench/Math": 0.6448, + "reward-bench/Safety": 0.7844, + "reward-bench/Focus": 0.7596, + "reward-bench/Ties": 0.6236 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619", + "name": "allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6221, + "reward-bench/Factuality": 0.6674, + "reward-bench/Precise IF": 0.325, + "reward-bench/Math": 0.612, + "reward-bench/Safety": 0.7978, + "reward-bench/Focus": 0.7455, + "reward-bench/Ties": 0.5852 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583", + "name": "allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5735, + "reward-bench/Factuality": 0.5895, + "reward-bench/Precise IF": 0.2625, + "reward-bench/Math": 0.6448, + "reward-bench/Safety": 0.6889, + "reward-bench/Focus": 0.6727, + "reward-bench/Ties": 0.5823 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604", + "name": "allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6336, + "reward-bench/Factuality": 0.6337, + "reward-bench/Precise IF": 0.3063, + "reward-bench/Math": 0.6885, + "reward-bench/Safety": 0.7244, + "reward-bench/Focus": 0.802, + "reward-bench/Ties": 0.6465 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738", + "name": "allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6824, + "reward-bench/Factuality": 0.6989, + "reward-bench/Precise IF": 0.3625, + "reward-bench/Math": 0.6831, + "reward-bench/Safety": 0.8311, + "reward-bench/Focus": 0.8081, + "reward-bench/Ties": 0.7107 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191", + "name": "allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6392, + "reward-bench/Factuality": 0.6589, + "reward-bench/Precise IF": 0.3312, + "reward-bench/Math": 0.6995, + "reward-bench/Safety": 0.7933, + "reward-bench/Focus": 0.7717, + "reward-bench/Ties": 0.5804 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737", + "name": "allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.664, + "reward-bench/Factuality": 0.6821, + "reward-bench/Precise IF": 0.3312, + "reward-bench/Math": 0.6448, + "reward-bench/Safety": 0.8133, + "reward-bench/Focus": 0.8061, + "reward-bench/Ties": 0.7066 + } + }, + { + "id": "allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138", + "name": "allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6678, + "reward-bench/Factuality": 0.6505, + "reward-bench/Precise IF": 0.3312, + "reward-bench/Math": 0.6831, + "reward-bench/Safety": 0.7978, + "reward-bench/Focus": 0.8808, + "reward-bench/Ties": 0.6632 + } + }, + { + "id": "allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455", + "name": "allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6618, + "reward-bench/Factuality": 0.7958, + "reward-bench/Precise IF": 0.325, + "reward-bench/Math": 0.6557, + "reward-bench/Safety": 0.8311, + "reward-bench/Focus": 0.6323, + "reward-bench/Ties": 0.7311 + } + }, + { + "id": "allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964", + "name": "allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6605, + "reward-bench/Factuality": 0.7789, + "reward-bench/Precise IF": 0.3688, + "reward-bench/Math": 0.6448, + "reward-bench/Safety": 0.8844, + "reward-bench/Focus": 0.6667, + "reward-bench/Ties": 0.6195 + } + }, + { + "id": "allenai/tulu-2-dpo-13b", + "name": "allenai/tulu-2-dpo-13b", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7368, + "reward-bench/Chat": 0.9581, + "reward-bench/Chat Hard": 0.5833, + "reward-bench/Safety": 0.7946, + "reward-bench/Reasoning": 0.7323, + "reward-bench/Prior Sets (0.5 weight)": 0.4947 + } + }, + { + "id": "allenai/tulu-2-dpo-70b", + "name": "allenai/tulu-2-dpo-70b", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7621, + "reward-bench/Chat": 0.9749, + "reward-bench/Chat Hard": 0.6053, + "reward-bench/Safety": 0.8446, + "reward-bench/Reasoning": 0.7407, + "reward-bench/Prior Sets (0.5 weight)": 0.5278 + } + }, + { + "id": "allenai/tulu-2-dpo-7b", + "name": "allenai/tulu-2-dpo-7b", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7212, + "reward-bench/Chat": 0.9749, + "reward-bench/Chat Hard": 0.5614, + "reward-bench/Safety": 0.7527, + "reward-bench/Reasoning": 0.7176, + "reward-bench/Prior Sets (0.5 weight)": 0.4774 + } + }, + { + "id": "allenai/tulu-v2.5-13b-preference-mix-rm", + "name": "allenai/tulu-v2.5-13b-preference-mix-rm", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8027, + "reward-bench/Chat": 0.9358, + "reward-bench/Chat Hard": 0.682, + "reward-bench/Safety": 0.773, + "reward-bench/Reasoning": 0.885, + "reward-bench/Prior Sets (0.5 weight)": 0.6724 + } + }, + { + "id": "allenai/tulu-v2.5-13b-uf-rm", + "name": "allenai/tulu-v2.5-13b-uf-rm", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.4806, + "reward-bench/Chat": 0.3939, + "reward-bench/Chat Hard": 0.4232, + "reward-bench/Safety": 0.5554, + "reward-bench/Reasoning": 0.4737, + "reward-bench/Prior Sets (0.5 weight)": 0.6326 + } + }, + { + "id": "allenai/tulu-v2.5-70b-preference-mix-rm", + "name": "allenai/tulu-v2.5-70b-preference-mix-rm", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6516, + "reward-bench/Chat": 0.7737, + "reward-bench/Chat Hard": 0.5921, + "reward-bench/Safety": 0.8486, + "reward-bench/Reasoning": 0.4138, + "reward-bench/Prior Sets (0.5 weight)": 0.6079 + } + }, + { + "id": "allenai/tulu-v2.5-70b-uf-rm", + "name": "allenai/tulu-v2.5-70b-uf-rm", + "developer": "allenai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7398, + "reward-bench/Chat": 0.8659, + "reward-bench/Chat Hard": 0.7171, + "reward-bench/Safety": 0.7014, + "reward-bench/Reasoning": 0.757, + "reward-bench/Prior Sets (0.5 weight)": 0.5757 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/allknowingroger.json b/data/developers/allknowingroger.json new file mode 100644 index 0000000000000000000000000000000000000000..a89aed44fcee0652d7b7083dcfb44ac0b7220ad9 --- /dev/null +++ b/data/developers/allknowingroger.json @@ -0,0 +1,1237 @@ +{ + "developer": "allknowingroger", + "models": [ + { + "id": "allknowingroger/Chocolatine-24B", + "name": "Chocolatine-24B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1958, + "hfopenllm_v2/BBH": 0.6191, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.4323, + "hfopenllm_v2/MMLU-PRO": 0.4566 + } + }, + { + "id": "allknowingroger/Gemma2Slerp1-2.6B", + "name": "Gemma2Slerp1-2.6B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5354, + "hfopenllm_v2/BBH": 0.4343, + "hfopenllm_v2/MATH Level 5": 0.1065, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4562, + "hfopenllm_v2/MMLU-PRO": 0.2689 + } + }, + { + "id": "allknowingroger/Gemma2Slerp1-27B", + "name": "Gemma2Slerp1-27B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7186, + "hfopenllm_v2/BBH": 0.6399, + "hfopenllm_v2/MATH Level 5": 0.2583, + "hfopenllm_v2/GPQA": 0.3641, + "hfopenllm_v2/MUSR": 0.4767, + "hfopenllm_v2/MMLU-PRO": 0.4456 + } + }, + { + "id": "allknowingroger/Gemma2Slerp2-2.6B", + "name": "Gemma2Slerp2-2.6B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5747, + "hfopenllm_v2/BBH": 0.4308, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4468, + "hfopenllm_v2/MMLU-PRO": 0.2696 + } + }, + { + "id": "allknowingroger/Gemma2Slerp2-27B", + "name": "Gemma2Slerp2-27B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7546, + "hfopenllm_v2/BBH": 0.6557, + "hfopenllm_v2/MATH Level 5": 0.2787, + "hfopenllm_v2/GPQA": 0.37, + "hfopenllm_v2/MUSR": 0.4621, + "hfopenllm_v2/MMLU-PRO": 0.4623 + } + }, + { + "id": "allknowingroger/Gemma2Slerp3-27B", + "name": "Gemma2Slerp3-27B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7426, + "hfopenllm_v2/BBH": 0.65, + "hfopenllm_v2/MATH Level 5": 0.2742, + "hfopenllm_v2/GPQA": 0.3549, + "hfopenllm_v2/MUSR": 0.474, + "hfopenllm_v2/MMLU-PRO": 0.4641 + } + }, + { + "id": "allknowingroger/Gemma2Slerp4-27B", + "name": "Gemma2Slerp4-27B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7497, + "hfopenllm_v2/BBH": 0.653, + "hfopenllm_v2/MATH Level 5": 0.2719, + "hfopenllm_v2/GPQA": 0.3666, + "hfopenllm_v2/MUSR": 0.4502, + "hfopenllm_v2/MMLU-PRO": 0.4649 + } + }, + { + "id": "allknowingroger/GemmaSlerp-9B", + "name": "GemmaSlerp-9B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7043, + "hfopenllm_v2/BBH": 0.5921, + "hfopenllm_v2/MATH Level 5": 0.216, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.4673, + "hfopenllm_v2/MMLU-PRO": 0.4161 + } + }, + { + "id": "allknowingroger/GemmaSlerp2-9B", + "name": "GemmaSlerp2-9B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7281, + "hfopenllm_v2/BBH": 0.5983, + "hfopenllm_v2/MATH Level 5": 0.2107, + "hfopenllm_v2/GPQA": 0.3523, + "hfopenllm_v2/MUSR": 0.4767, + "hfopenllm_v2/MMLU-PRO": 0.4239 + } + }, + { + "id": "allknowingroger/GemmaSlerp4-10B", + "name": "GemmaSlerp4-10B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7326, + "hfopenllm_v2/BBH": 0.6028, + "hfopenllm_v2/MATH Level 5": 0.2243, + "hfopenllm_v2/GPQA": 0.3532, + "hfopenllm_v2/MUSR": 0.454, + "hfopenllm_v2/MMLU-PRO": 0.425 + } + }, + { + "id": "allknowingroger/GemmaSlerp5-10B", + "name": "GemmaSlerp5-10B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7353, + "hfopenllm_v2/BBH": 0.6054, + "hfopenllm_v2/MATH Level 5": 0.2183, + "hfopenllm_v2/GPQA": 0.3523, + "hfopenllm_v2/MUSR": 0.4608, + "hfopenllm_v2/MMLU-PRO": 0.4328 + } + }, + { + "id": "allknowingroger/GemmaStock1-27B", + "name": "GemmaStock1-27B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7509, + "hfopenllm_v2/BBH": 0.6566, + "hfopenllm_v2/MATH Level 5": 0.2636, + "hfopenllm_v2/GPQA": 0.3641, + "hfopenllm_v2/MUSR": 0.4527, + "hfopenllm_v2/MMLU-PRO": 0.473 + } + }, + { + "id": "allknowingroger/HomerSlerp1-7B", + "name": "HomerSlerp1-7B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4621, + "hfopenllm_v2/BBH": 0.5518, + "hfopenllm_v2/MATH Level 5": 0.2719, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.4359, + "hfopenllm_v2/MMLU-PRO": 0.4504 + } + }, + { + "id": "allknowingroger/HomerSlerp2-7B", + "name": "HomerSlerp2-7B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4487, + "hfopenllm_v2/BBH": 0.5649, + "hfopenllm_v2/MATH Level 5": 0.2968, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4356, + "hfopenllm_v2/MMLU-PRO": 0.4515 + } + }, + { + "id": "allknowingroger/HomerSlerp3-7B", + "name": "HomerSlerp3-7B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4363, + "hfopenllm_v2/BBH": 0.5598, + "hfopenllm_v2/MATH Level 5": 0.3021, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4462, + "hfopenllm_v2/MMLU-PRO": 0.4535 + } + }, + { + "id": "allknowingroger/HomerSlerp4-7B", + "name": "HomerSlerp4-7B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4374, + "hfopenllm_v2/BBH": 0.5571, + "hfopenllm_v2/MATH Level 5": 0.327, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4408, + "hfopenllm_v2/MMLU-PRO": 0.4472 + } + }, + { + "id": "allknowingroger/LimyQstar-7B-slerp", + "name": "LimyQstar-7B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3491, + "hfopenllm_v2/BBH": 0.5024, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4146, + "hfopenllm_v2/MMLU-PRO": 0.3103 + } + }, + { + "id": "allknowingroger/Llama3.1-60B", + "name": "Llama3.1-60B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1815, + "hfopenllm_v2/BBH": 0.3242, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3596, + "hfopenllm_v2/MMLU-PRO": 0.331 + } + }, + { + "id": "allknowingroger/Marco-01-slerp1-7B", + "name": "Marco-01-slerp1-7B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4681, + "hfopenllm_v2/BBH": 0.5541, + "hfopenllm_v2/MATH Level 5": 0.3157, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4452, + "hfopenllm_v2/MMLU-PRO": 0.4483 + } + }, + { + "id": "allknowingroger/Meme-7B-slerp", + "name": "Meme-7B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5164, + "hfopenllm_v2/BBH": 0.4661, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4223, + "hfopenllm_v2/MMLU-PRO": 0.281 + } + }, + { + "id": "allknowingroger/Ministral-8B-slerp", + "name": "Ministral-8B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1961, + "hfopenllm_v2/BBH": 0.4686, + "hfopenllm_v2/MATH Level 5": 0.0038, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4285, + "hfopenllm_v2/MMLU-PRO": 0.3119 + } + }, + { + "id": "allknowingroger/MistralPhi3-11B", + "name": "MistralPhi3-11B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1943, + "hfopenllm_v2/BBH": 0.6234, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.4267, + "hfopenllm_v2/MMLU-PRO": 0.4688 + } + }, + { + "id": "allknowingroger/Mistralmash1-7B-s", + "name": "Mistralmash1-7B-s", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3961, + "hfopenllm_v2/BBH": 0.5277, + "hfopenllm_v2/MATH Level 5": 0.0921, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4267, + "hfopenllm_v2/MMLU-PRO": 0.3293 + } + }, + { + "id": "allknowingroger/Mistralmash2-7B-s", + "name": "Mistralmash2-7B-s", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4102, + "hfopenllm_v2/BBH": 0.5305, + "hfopenllm_v2/MATH Level 5": 0.0793, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4372, + "hfopenllm_v2/MMLU-PRO": 0.3345 + } + }, + { + "id": "allknowingroger/MixTAO-19B-pass", + "name": "MixTAO-19B-pass", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3814, + "hfopenllm_v2/BBH": 0.5128, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4783, + "hfopenllm_v2/MMLU-PRO": 0.3105 + } + }, + { + "id": "allknowingroger/MixTaoTruthful-13B-slerp", + "name": "MixTaoTruthful-13B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4139, + "hfopenllm_v2/BBH": 0.5207, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4292, + "hfopenllm_v2/MMLU-PRO": 0.31 + } + }, + { + "id": "allknowingroger/MultiCalm-7B-slerp", + "name": "MultiCalm-7B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3927, + "hfopenllm_v2/BBH": 0.5122, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4319, + "hfopenllm_v2/MMLU-PRO": 0.3033 + } + }, + { + "id": "allknowingroger/MultiMash-12B-slerp", + "name": "MultiMash-12B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3974, + "hfopenllm_v2/BBH": 0.5142, + "hfopenllm_v2/MATH Level 5": 0.0808, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.4438, + "hfopenllm_v2/MMLU-PRO": 0.3068 + } + }, + { + "id": "allknowingroger/MultiMash10-13B-slerp", + "name": "MultiMash10-13B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4163, + "hfopenllm_v2/BBH": 0.5186, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4318, + "hfopenllm_v2/MMLU-PRO": 0.3117 + } + }, + { + "id": "allknowingroger/MultiMash11-13B-slerp", + "name": "MultiMash11-13B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4251, + "hfopenllm_v2/BBH": 0.5194, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4373, + "hfopenllm_v2/MMLU-PRO": 0.3085 + } + }, + { + "id": "allknowingroger/MultiMash2-12B-slerp", + "name": "MultiMash2-12B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4261, + "hfopenllm_v2/BBH": 0.5134, + "hfopenllm_v2/MATH Level 5": 0.0642, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.4228, + "hfopenllm_v2/MMLU-PRO": 0.3043 + } + }, + { + "id": "allknowingroger/MultiMash5-12B-slerp", + "name": "MultiMash5-12B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4142, + "hfopenllm_v2/BBH": 0.5145, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4203, + "hfopenllm_v2/MMLU-PRO": 0.3028 + } + }, + { + "id": "allknowingroger/MultiMash6-12B-slerp", + "name": "MultiMash6-12B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.43, + "hfopenllm_v2/BBH": 0.5196, + "hfopenllm_v2/MATH Level 5": 0.0725, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.4306, + "hfopenllm_v2/MMLU-PRO": 0.3091 + } + }, + { + "id": "allknowingroger/MultiMash7-12B-slerp", + "name": "MultiMash7-12B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4213, + "hfopenllm_v2/BBH": 0.5111, + "hfopenllm_v2/MATH Level 5": 0.0695, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.4279, + "hfopenllm_v2/MMLU-PRO": 0.3029 + } + }, + { + "id": "allknowingroger/MultiMash8-13B-slerp", + "name": "MultiMash8-13B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4321, + "hfopenllm_v2/BBH": 0.5178, + "hfopenllm_v2/MATH Level 5": 0.077, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4424, + "hfopenllm_v2/MMLU-PRO": 0.3126 + } + }, + { + "id": "allknowingroger/MultiMash9-13B-slerp", + "name": "MultiMash9-13B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4188, + "hfopenllm_v2/BBH": 0.5194, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.4398, + "hfopenllm_v2/MMLU-PRO": 0.31 + } + }, + { + "id": "allknowingroger/MultiMerge-7B-slerp", + "name": "MultiMerge-7B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3948, + "hfopenllm_v2/BBH": 0.514, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.428, + "hfopenllm_v2/MMLU-PRO": 0.3037 + } + }, + { + "id": "allknowingroger/Multimash3-12B-slerp", + "name": "Multimash3-12B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4437, + "hfopenllm_v2/BBH": 0.5177, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.4344, + "hfopenllm_v2/MMLU-PRO": 0.3068 + } + }, + { + "id": "allknowingroger/Multimerge-19B-pass", + "name": "Multimerge-19B-pass", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1773, + "hfopenllm_v2/BBH": 0.2892, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.343, + "hfopenllm_v2/MMLU-PRO": 0.1169 + } + }, + { + "id": "allknowingroger/MultiverseEx26-7B-slerp", + "name": "MultiverseEx26-7B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3939, + "hfopenllm_v2/BBH": 0.5134, + "hfopenllm_v2/MATH Level 5": 0.0755, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4293, + "hfopenllm_v2/MMLU-PRO": 0.3035 + } + }, + { + "id": "allknowingroger/NeuralWestSeverus-7B-slerp", + "name": "NeuralWestSeverus-7B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4136, + "hfopenllm_v2/BBH": 0.5244, + "hfopenllm_v2/MATH Level 5": 0.0733, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.4529, + "hfopenllm_v2/MMLU-PRO": 0.3137 + } + }, + { + "id": "allknowingroger/Neuralcoven-7B-slerp", + "name": "Neuralcoven-7B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3859, + "hfopenllm_v2/BBH": 0.5303, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.429, + "hfopenllm_v2/MMLU-PRO": 0.3294 + } + }, + { + "id": "allknowingroger/Neuralmultiverse-7B-slerp", + "name": "Neuralmultiverse-7B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3769, + "hfopenllm_v2/BBH": 0.5166, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.428, + "hfopenllm_v2/MMLU-PRO": 0.3042 + } + }, + { + "id": "allknowingroger/Ph3della5-14B", + "name": "Ph3della5-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4799, + "hfopenllm_v2/BBH": 0.6332, + "hfopenllm_v2/MATH Level 5": 0.1767, + "hfopenllm_v2/GPQA": 0.3423, + "hfopenllm_v2/MUSR": 0.4386, + "hfopenllm_v2/MMLU-PRO": 0.4787 + } + }, + { + "id": "allknowingroger/Ph3merge-14B", + "name": "Ph3merge-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2701, + "hfopenllm_v2/BBH": 0.6381, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4334, + "hfopenllm_v2/MMLU-PRO": 0.4611 + } + }, + { + "id": "allknowingroger/Ph3merge2-14B", + "name": "Ph3merge2-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1706, + "hfopenllm_v2/BBH": 0.3607, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.3911, + "hfopenllm_v2/MMLU-PRO": 0.1723 + } + }, + { + "id": "allknowingroger/Ph3merge3-14B", + "name": "Ph3merge3-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1645, + "hfopenllm_v2/BBH": 0.3597, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4082, + "hfopenllm_v2/MMLU-PRO": 0.1647 + } + }, + { + "id": "allknowingroger/Ph3task1-14B", + "name": "Ph3task1-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4695, + "hfopenllm_v2/BBH": 0.6318, + "hfopenllm_v2/MATH Level 5": 0.1669, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4508, + "hfopenllm_v2/MMLU-PRO": 0.4734 + } + }, + { + "id": "allknowingroger/Ph3task2-14B", + "name": "Ph3task2-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4713, + "hfopenllm_v2/BBH": 0.6098, + "hfopenllm_v2/MATH Level 5": 0.1465, + "hfopenllm_v2/GPQA": 0.3305, + "hfopenllm_v2/MUSR": 0.4535, + "hfopenllm_v2/MMLU-PRO": 0.446 + } + }, + { + "id": "allknowingroger/Ph3task3-14B", + "name": "Ph3task3-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4962, + "hfopenllm_v2/BBH": 0.6298, + "hfopenllm_v2/MATH Level 5": 0.176, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.4426, + "hfopenllm_v2/MMLU-PRO": 0.4771 + } + }, + { + "id": "allknowingroger/Ph3unsloth-3B-slerp", + "name": "Ph3unsloth-3B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1894, + "hfopenllm_v2/BBH": 0.5468, + "hfopenllm_v2/MATH Level 5": 0.1012, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4528, + "hfopenllm_v2/MMLU-PRO": 0.3701 + } + }, + { + "id": "allknowingroger/Phi3mash1-17B-pass", + "name": "Phi3mash1-17B-pass", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1884, + "hfopenllm_v2/BBH": 0.6129, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4451, + "hfopenllm_v2/MMLU-PRO": 0.4589 + } + }, + { + "id": "allknowingroger/Quen2-65B", + "name": "Quen2-65B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1758, + "hfopenllm_v2/BBH": 0.2757, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2357, + "hfopenllm_v2/MUSR": 0.3209, + "hfopenllm_v2/MMLU-PRO": 0.1114 + } + }, + { + "id": "allknowingroger/Qwen2.5-42B-AGI", + "name": "Qwen2.5-42B-AGI", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1913, + "hfopenllm_v2/BBH": 0.2942, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.362, + "hfopenllm_v2/MMLU-PRO": 0.1168 + } + }, + { + "id": "allknowingroger/Qwen2.5-7B-task2", + "name": "Qwen2.5-7B-task2", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4527, + "hfopenllm_v2/BBH": 0.5626, + "hfopenllm_v2/MATH Level 5": 0.355, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.437, + "hfopenllm_v2/MMLU-PRO": 0.4517 + } + }, + { + "id": "allknowingroger/Qwen2.5-7B-task3", + "name": "Qwen2.5-7B-task3", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5129, + "hfopenllm_v2/BBH": 0.5398, + "hfopenllm_v2/MATH Level 5": 0.2606, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4356, + "hfopenllm_v2/MMLU-PRO": 0.4501 + } + }, + { + "id": "allknowingroger/Qwen2.5-7B-task4", + "name": "Qwen2.5-7B-task4", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5005, + "hfopenllm_v2/BBH": 0.5583, + "hfopenllm_v2/MATH Level 5": 0.3112, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4395, + "hfopenllm_v2/MMLU-PRO": 0.4561 + } + }, + { + "id": "allknowingroger/Qwen2.5-7B-task7", + "name": "Qwen2.5-7B-task7", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4284, + "hfopenllm_v2/BBH": 0.5552, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4326, + "hfopenllm_v2/MMLU-PRO": 0.4133 + } + }, + { + "id": "allknowingroger/Qwen2.5-7B-task8", + "name": "Qwen2.5-7B-task8", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4645, + "hfopenllm_v2/BBH": 0.5525, + "hfopenllm_v2/MATH Level 5": 0.3527, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4514, + "hfopenllm_v2/MMLU-PRO": 0.4433 + } + }, + { + "id": "allknowingroger/Qwen2.5-slerp-14B", + "name": "Qwen2.5-slerp-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4928, + "hfopenllm_v2/BBH": 0.6512, + "hfopenllm_v2/MATH Level 5": 0.4622, + "hfopenllm_v2/GPQA": 0.3674, + "hfopenllm_v2/MUSR": 0.4744, + "hfopenllm_v2/MMLU-PRO": 0.5379 + } + }, + { + "id": "allknowingroger/QwenSlerp12-7B", + "name": "QwenSlerp12-7B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5076, + "hfopenllm_v2/BBH": 0.5556, + "hfopenllm_v2/MATH Level 5": 0.2946, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.4595, + "hfopenllm_v2/MMLU-PRO": 0.4461 + } + }, + { + "id": "allknowingroger/QwenSlerp4-14B", + "name": "QwenSlerp4-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6328, + "hfopenllm_v2/BBH": 0.6483, + "hfopenllm_v2/MATH Level 5": 0.3693, + "hfopenllm_v2/GPQA": 0.3725, + "hfopenllm_v2/MUSR": 0.465, + "hfopenllm_v2/MMLU-PRO": 0.5436 + } + }, + { + "id": "allknowingroger/QwenSlerp5-14B", + "name": "QwenSlerp5-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7119, + "hfopenllm_v2/BBH": 0.6357, + "hfopenllm_v2/MATH Level 5": 0.3565, + "hfopenllm_v2/GPQA": 0.3649, + "hfopenllm_v2/MUSR": 0.4675, + "hfopenllm_v2/MMLU-PRO": 0.5391 + } + }, + { + "id": "allknowingroger/QwenSlerp6-14B", + "name": "QwenSlerp6-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6867, + "hfopenllm_v2/BBH": 0.6384, + "hfopenllm_v2/MATH Level 5": 0.3724, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.469, + "hfopenllm_v2/MMLU-PRO": 0.5406 + } + }, + { + "id": "allknowingroger/QwenStock1-14B", + "name": "QwenStock1-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5634, + "hfopenllm_v2/BBH": 0.6528, + "hfopenllm_v2/MATH Level 5": 0.3769, + "hfopenllm_v2/GPQA": 0.3767, + "hfopenllm_v2/MUSR": 0.473, + "hfopenllm_v2/MMLU-PRO": 0.5418 + } + }, + { + "id": "allknowingroger/QwenStock2-14B", + "name": "QwenStock2-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5563, + "hfopenllm_v2/BBH": 0.6569, + "hfopenllm_v2/MATH Level 5": 0.3882, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.4756, + "hfopenllm_v2/MMLU-PRO": 0.5406 + } + }, + { + "id": "allknowingroger/QwenStock3-14B", + "name": "QwenStock3-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5615, + "hfopenllm_v2/BBH": 0.6565, + "hfopenllm_v2/MATH Level 5": 0.3776, + "hfopenllm_v2/GPQA": 0.3784, + "hfopenllm_v2/MUSR": 0.4756, + "hfopenllm_v2/MMLU-PRO": 0.5428 + } + }, + { + "id": "allknowingroger/Qwenslerp2-14B", + "name": "Qwenslerp2-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5007, + "hfopenllm_v2/BBH": 0.6555, + "hfopenllm_v2/MATH Level 5": 0.4456, + "hfopenllm_v2/GPQA": 0.3683, + "hfopenllm_v2/MUSR": 0.4729, + "hfopenllm_v2/MMLU-PRO": 0.5403 + } + }, + { + "id": "allknowingroger/Qwenslerp2-7B", + "name": "Qwenslerp2-7B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5294, + "hfopenllm_v2/BBH": 0.5609, + "hfopenllm_v2/MATH Level 5": 0.3421, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4356, + "hfopenllm_v2/MMLU-PRO": 0.4515 + } + }, + { + "id": "allknowingroger/Qwenslerp3-14B", + "name": "Qwenslerp3-14B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5052, + "hfopenllm_v2/BBH": 0.6521, + "hfopenllm_v2/MATH Level 5": 0.4464, + "hfopenllm_v2/GPQA": 0.375, + "hfopenllm_v2/MUSR": 0.4676, + "hfopenllm_v2/MMLU-PRO": 0.5395 + } + }, + { + "id": "allknowingroger/Qwenslerp3-7B", + "name": "Qwenslerp3-7B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5018, + "hfopenllm_v2/BBH": 0.558, + "hfopenllm_v2/MATH Level 5": 0.3218, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4515, + "hfopenllm_v2/MMLU-PRO": 0.4542 + } + }, + { + "id": "allknowingroger/ROGERphi-7B-slerp", + "name": "ROGERphi-7B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3861, + "hfopenllm_v2/BBH": 0.5196, + "hfopenllm_v2/MATH Level 5": 0.0733, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4685, + "hfopenllm_v2/MMLU-PRO": 0.3053 + } + }, + { + "id": "allknowingroger/RogerMerge-7B-slerp", + "name": "RogerMerge-7B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3933, + "hfopenllm_v2/BBH": 0.516, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.432, + "hfopenllm_v2/MMLU-PRO": 0.303 + } + }, + { + "id": "allknowingroger/Rombos-LLM-V2.5-Qwen-42b", + "name": "Rombos-LLM-V2.5-Qwen-42b", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1879, + "hfopenllm_v2/BBH": 0.2969, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3633, + "hfopenllm_v2/MMLU-PRO": 0.1168 + } + }, + { + "id": "allknowingroger/Strangecoven-7B-slerp", + "name": "Strangecoven-7B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3746, + "hfopenllm_v2/BBH": 0.5368, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4199, + "hfopenllm_v2/MMLU-PRO": 0.3364 + } + }, + { + "id": "allknowingroger/Weirdslerp2-25B", + "name": "Weirdslerp2-25B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1754, + "hfopenllm_v2/BBH": 0.2874, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.3524, + "hfopenllm_v2/MMLU-PRO": 0.1128 + } + }, + { + "id": "allknowingroger/WestlakeMaziyar-7B-slerp", + "name": "WestlakeMaziyar-7B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4838, + "hfopenllm_v2/BBH": 0.5245, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4474, + "hfopenllm_v2/MMLU-PRO": 0.3078 + } + }, + { + "id": "allknowingroger/YamMaths-7B-slerp", + "name": "YamMaths-7B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4148, + "hfopenllm_v2/BBH": 0.5156, + "hfopenllm_v2/MATH Level 5": 0.0853, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.4384, + "hfopenllm_v2/MMLU-PRO": 0.3131 + } + }, + { + "id": "allknowingroger/Yi-1.5-34B", + "name": "Yi-1.5-34B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1639, + "hfopenllm_v2/BBH": 0.2827, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3857, + "hfopenllm_v2/MMLU-PRO": 0.1095 + } + }, + { + "id": "allknowingroger/Yi-blossom-40B", + "name": "Yi-blossom-40B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2009, + "hfopenllm_v2/BBH": 0.3215, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3843, + "hfopenllm_v2/MMLU-PRO": 0.108 + } + }, + { + "id": "allknowingroger/Yibuddy-35B", + "name": "Yibuddy-35B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4235, + "hfopenllm_v2/BBH": 0.5916, + "hfopenllm_v2/MATH Level 5": 0.1571, + "hfopenllm_v2/GPQA": 0.3557, + "hfopenllm_v2/MUSR": 0.4505, + "hfopenllm_v2/MMLU-PRO": 0.4489 + } + }, + { + "id": "allknowingroger/Yillama-40B", + "name": "Yillama-40B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1697, + "hfopenllm_v2/BBH": 0.4063, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3501, + "hfopenllm_v2/MMLU-PRO": 0.1981 + } + }, + { + "id": "allknowingroger/Yislerp-34B", + "name": "Yislerp-34B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3692, + "hfopenllm_v2/BBH": 0.6159, + "hfopenllm_v2/MATH Level 5": 0.216, + "hfopenllm_v2/GPQA": 0.3582, + "hfopenllm_v2/MUSR": 0.4566, + "hfopenllm_v2/MMLU-PRO": 0.4751 + } + }, + { + "id": "allknowingroger/Yislerp2-34B", + "name": "Yislerp2-34B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3999, + "hfopenllm_v2/BBH": 0.6246, + "hfopenllm_v2/MATH Level 5": 0.2296, + "hfopenllm_v2/GPQA": 0.3641, + "hfopenllm_v2/MUSR": 0.453, + "hfopenllm_v2/MMLU-PRO": 0.4724 + } + }, + { + "id": "allknowingroger/Yunconglong-13B-slerp", + "name": "Yunconglong-13B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4242, + "hfopenllm_v2/BBH": 0.5166, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4161, + "hfopenllm_v2/MMLU-PRO": 0.3036 + } + }, + { + "id": "allknowingroger/limyClown-7B-slerp", + "name": "limyClown-7B-slerp", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4017, + "hfopenllm_v2/BBH": 0.5148, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4293, + "hfopenllm_v2/MMLU-PRO": 0.3038 + } + }, + { + "id": "allknowingroger/llama3-Jallabi-40B-s", + "name": "llama3-Jallabi-40B-s", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1921, + "hfopenllm_v2/BBH": 0.3252, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2374, + "hfopenllm_v2/MUSR": 0.375, + "hfopenllm_v2/MMLU-PRO": 0.1088 + } + }, + { + "id": "allknowingroger/llama3AnFeng-40B", + "name": "llama3AnFeng-40B", + "developer": "allknowingroger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1742, + "hfopenllm_v2/BBH": 0.3794, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.394, + "hfopenllm_v2/MMLU-PRO": 0.198 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/allura-org.json b/data/developers/allura-org.json new file mode 100644 index 0000000000000000000000000000000000000000..4a536e62cdaf18013d06f154d3fd500645bb6528 --- /dev/null +++ b/data/developers/allura-org.json @@ -0,0 +1,131 @@ +{ + "developer": "allura-org", + "models": [ + { + "id": "allura-org/L3.1-8b-RP-Ink", + "name": "L3.1-8b-RP-Ink", + "developer": "allura-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7811, + "hfopenllm_v2/BBH": 0.4828, + "hfopenllm_v2/MATH Level 5": 0.148, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3608, + "hfopenllm_v2/MMLU-PRO": 0.3428 + } + }, + { + "id": "allura-org/MN-12b-RP-Ink", + "name": "MN-12b-RP-Ink", + "developer": "allura-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7186, + "hfopenllm_v2/BBH": 0.4834, + "hfopenllm_v2/MATH Level 5": 0.1186, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.3818, + "hfopenllm_v2/MMLU-PRO": 0.3514 + } + }, + { + "id": "allura-org/MS-Meadowlark-22B", + "name": "MS-Meadowlark-22B", + "developer": "allura-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6697, + "hfopenllm_v2/BBH": 0.5163, + "hfopenllm_v2/MATH Level 5": 0.1835, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.3843, + "hfopenllm_v2/MMLU-PRO": 0.3823 + } + }, + { + "id": "allura-org/Mistral-Small-24b-Sertraline-0304", + "name": "Mistral-Small-24b-Sertraline-0304", + "developer": "allura-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.68, + "hfopenllm_v2/BBH": 0.6525, + "hfopenllm_v2/MATH Level 5": 0.2228, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4395, + "hfopenllm_v2/MMLU-PRO": 0.5106 + } + }, + { + "id": "allura-org/Mistral-Small-Sisyphus-24b-2503", + "name": "Mistral-Small-Sisyphus-24b-2503", + "developer": "allura-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6848, + "hfopenllm_v2/BBH": 0.627, + "hfopenllm_v2/MATH Level 5": 0.25, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3977, + "hfopenllm_v2/MMLU-PRO": 0.5127 + } + }, + { + "id": "allura-org/MoE-Girl-1BA-7BT", + "name": "MoE-Girl-1BA-7BT", + "developer": "allura-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2705, + "hfopenllm_v2/BBH": 0.3139, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3436, + "hfopenllm_v2/MMLU-PRO": 0.1218 + } + }, + { + "id": "allura-org/TQ2.5-14B-Aletheia-v1", + "name": "TQ2.5-14B-Aletheia-v1", + "developer": "allura-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.753, + "hfopenllm_v2/BBH": 0.6585, + "hfopenllm_v2/MATH Level 5": 0.3399, + "hfopenllm_v2/GPQA": 0.3624, + "hfopenllm_v2/MUSR": 0.4452, + "hfopenllm_v2/MMLU-PRO": 0.5241 + } + }, + { + "id": "allura-org/TQ2.5-14B-Neon-v1", + "name": "TQ2.5-14B-Neon-v1", + "developer": "allura-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6754, + "hfopenllm_v2/BBH": 0.6553, + "hfopenllm_v2/MATH Level 5": 0.3603, + "hfopenllm_v2/GPQA": 0.3716, + "hfopenllm_v2/MUSR": 0.461, + "hfopenllm_v2/MMLU-PRO": 0.5253 + } + }, + { + "id": "allura-org/Teleut-7b", + "name": "Teleut-7b", + "developer": "allura-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6379, + "hfopenllm_v2/BBH": 0.5141, + "hfopenllm_v2/MATH Level 5": 0.2409, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.464, + "hfopenllm_v2/MMLU-PRO": 0.4131 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/aloobun.json b/data/developers/aloobun.json new file mode 100644 index 0000000000000000000000000000000000000000..00f88d918d09895e7465dbaa629e187163b18545 --- /dev/null +++ b/data/developers/aloobun.json @@ -0,0 +1,33 @@ +{ + "developer": "aloobun", + "models": [ + { + "id": "aloobun/Meta-Llama-3-7B-28Layers", + "name": "Meta-Llama-3-7B-28Layers", + "developer": "aloobun", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1964, + "hfopenllm_v2/BBH": 0.4437, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3589, + "hfopenllm_v2/MMLU-PRO": 0.316 + } + }, + { + "id": "aloobun/d-SmolLM2-360M", + "name": "d-SmolLM2-360M", + "developer": "aloobun", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2097, + "hfopenllm_v2/BBH": 0.3196, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3981, + "hfopenllm_v2/MMLU-PRO": 0.1169 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/alpindale.json b/data/developers/alpindale.json new file mode 100644 index 0000000000000000000000000000000000000000..cbaa57e3df5bda765b762feba44e77c4c6ff1f7f --- /dev/null +++ b/data/developers/alpindale.json @@ -0,0 +1,33 @@ +{ + "developer": "alpindale", + "models": [ + { + "id": "alpindale/WizardLM-2-8x22B", + "name": "WizardLM-2-8x22B", + "developer": "alpindale", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5272, + "hfopenllm_v2/BBH": 0.6377, + "hfopenllm_v2/MATH Level 5": 0.25, + "hfopenllm_v2/GPQA": 0.3817, + "hfopenllm_v2/MUSR": 0.4387, + "hfopenllm_v2/MMLU-PRO": 0.4596 + } + }, + { + "id": "alpindale/magnum-72b-v1", + "name": "magnum-72b-v1", + "developer": "alpindale", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7606, + "hfopenllm_v2/BBH": 0.6982, + "hfopenllm_v2/MATH Level 5": 0.398, + "hfopenllm_v2/GPQA": 0.3909, + "hfopenllm_v2/MUSR": 0.4489, + "hfopenllm_v2/MMLU-PRO": 0.5468 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/altomek.json b/data/developers/altomek.json new file mode 100644 index 0000000000000000000000000000000000000000..a03cfa0efb1312f43ad7d237a6113f60efb95585 --- /dev/null +++ b/data/developers/altomek.json @@ -0,0 +1,19 @@ +{ + "developer": "altomek", + "models": [ + { + "id": "altomek/YiSM-34B-0rn", + "name": "YiSM-34B-0rn", + "developer": "altomek", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4284, + "hfopenllm_v2/BBH": 0.614, + "hfopenllm_v2/MATH Level 5": 0.2281, + "hfopenllm_v2/GPQA": 0.3716, + "hfopenllm_v2/MUSR": 0.445, + "hfopenllm_v2/MMLU-PRO": 0.4696 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/amazon.json b/data/developers/amazon.json new file mode 100644 index 0000000000000000000000000000000000000000..b5b07ecb5b482acd911d7aede4ad53d324acd538 --- /dev/null +++ b/data/developers/amazon.json @@ -0,0 +1,213 @@ +{ + "developer": "amazon", + "models": [ + { + "id": "amazon/MegaBeam-Mistral-7B-300k", + "name": "MegaBeam-Mistral-7B-300k", + "developer": "amazon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5203, + "hfopenllm_v2/BBH": 0.4228, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.398, + "hfopenllm_v2/MMLU-PRO": 0.2549 + } + }, + { + "id": "amazon/nova-lite-v1:0", + "name": "Amazon Nova Lite", + "developer": "amazon", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.551, + "helm_capabilities/MMLU-Pro": 0.6, + "helm_capabilities/GPQA": 0.397, + "helm_capabilities/IFEval": 0.776, + "helm_capabilities/WildBench": 0.75, + "helm_capabilities/Omni-MATH": 0.233, + "helm_lite/Mean win rate": 0.708, + "helm_lite/NarrativeQA": 0.768, + "helm_lite/NaturalQuestions (closed-book)": 0.352, + "helm_lite/OpenbookQA": 0.928, + "helm_lite/MMLU": 0.693, + "helm_lite/MATH": 0.779, + "helm_lite/GSM8K": 0.829, + "helm_lite/LegalBench": 0.659, + "helm_lite/MedQA": 0.696, + "helm_lite/WMT 2014": 0.204, + "helm_mmlu/MMLU All Subjects": 0.77, + "helm_mmlu/Abstract Algebra": 0.52, + "helm_mmlu/Anatomy": 0.719, + "helm_mmlu/College Physics": 0.608, + "helm_mmlu/Computer Security": 0.79, + "helm_mmlu/Econometrics": 0.675, + "helm_mmlu/Global Facts": 0.55, + "helm_mmlu/Jurisprudence": 0.852, + "helm_mmlu/Philosophy": 0.817, + "helm_mmlu/Professional Psychology": 0.812, + "helm_mmlu/Us Foreign Policy": 0.92, + "helm_mmlu/Astronomy": 0.862, + "helm_mmlu/Business Ethics": 0.73, + "helm_mmlu/Clinical Knowledge": 0.8, + "helm_mmlu/Conceptual Physics": 0.796, + "helm_mmlu/Electrical Engineering": 0.779, + "helm_mmlu/Elementary Mathematics": 0.757, + "helm_mmlu/Formal Logic": 0.643, + "helm_mmlu/High School World History": 0.886, + "helm_mmlu/Human Sexuality": 0.84, + "helm_mmlu/International Law": 0.843, + "helm_mmlu/Logical Fallacies": 0.81, + "helm_mmlu/Machine Learning": 0.509, + "helm_mmlu/Management": 0.864, + "helm_mmlu/Marketing": 0.889, + "helm_mmlu/Medical Genetics": 0.9, + "helm_mmlu/Miscellaneous": 0.872, + "helm_mmlu/Moral Scenarios": 0.694, + "helm_mmlu/Nutrition": 0.788, + "helm_mmlu/Prehistory": 0.849, + "helm_mmlu/Public Relations": 0.682, + "helm_mmlu/Security Studies": 0.788, + "helm_mmlu/Sociology": 0.896, + "helm_mmlu/Virology": 0.542, + "helm_mmlu/World Religions": 0.871, + "helm_mmlu/Mean win rate": 0.987 + } + }, + { + "id": "amazon/nova-micro-v1:0", + "name": "Amazon Nova Micro", + "developer": "amazon", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.522, + "helm_capabilities/MMLU-Pro": 0.511, + "helm_capabilities/GPQA": 0.383, + "helm_capabilities/IFEval": 0.76, + "helm_capabilities/WildBench": 0.743, + "helm_capabilities/Omni-MATH": 0.214, + "helm_lite/Mean win rate": 0.524, + "helm_lite/NarrativeQA": 0.744, + "helm_lite/NaturalQuestions (closed-book)": 0.285, + "helm_lite/OpenbookQA": 0.888, + "helm_lite/MMLU": 0.64, + "helm_lite/MATH": 0.76, + "helm_lite/GSM8K": 0.794, + "helm_lite/LegalBench": 0.615, + "helm_lite/MedQA": 0.608, + "helm_lite/WMT 2014": 0.192, + "helm_mmlu/MMLU All Subjects": 0.708, + "helm_mmlu/Abstract Algebra": 0.42, + "helm_mmlu/Anatomy": 0.726, + "helm_mmlu/College Physics": 0.5, + "helm_mmlu/Computer Security": 0.77, + "helm_mmlu/Econometrics": 0.57, + "helm_mmlu/Global Facts": 0.44, + "helm_mmlu/Jurisprudence": 0.815, + "helm_mmlu/Philosophy": 0.733, + "helm_mmlu/Professional Psychology": 0.739, + "helm_mmlu/Us Foreign Policy": 0.9, + "helm_mmlu/Astronomy": 0.822, + "helm_mmlu/Business Ethics": 0.71, + "helm_mmlu/Clinical Knowledge": 0.751, + "helm_mmlu/Conceptual Physics": 0.706, + "helm_mmlu/Electrical Engineering": 0.683, + "helm_mmlu/Elementary Mathematics": 0.55, + "helm_mmlu/Formal Logic": 0.508, + "helm_mmlu/High School World History": 0.84, + "helm_mmlu/Human Sexuality": 0.824, + "helm_mmlu/International Law": 0.843, + "helm_mmlu/Logical Fallacies": 0.798, + "helm_mmlu/Machine Learning": 0.562, + "helm_mmlu/Management": 0.816, + "helm_mmlu/Marketing": 0.91, + "helm_mmlu/Medical Genetics": 0.82, + "helm_mmlu/Miscellaneous": 0.83, + "helm_mmlu/Moral Scenarios": 0.464, + "helm_mmlu/Nutrition": 0.778, + "helm_mmlu/Prehistory": 0.787, + "helm_mmlu/Public Relations": 0.673, + "helm_mmlu/Security Studies": 0.718, + "helm_mmlu/Sociology": 0.846, + "helm_mmlu/Virology": 0.524, + "helm_mmlu/World Religions": 0.825, + "helm_mmlu/Mean win rate": 1.0 + } + }, + { + "id": "amazon/nova-premier-v1:0", + "name": "Amazon Nova Premier", + "developer": "amazon", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.637, + "helm_capabilities/MMLU-Pro": 0.726, + "helm_capabilities/GPQA": 0.518, + "helm_capabilities/IFEval": 0.803, + "helm_capabilities/WildBench": 0.788, + "helm_capabilities/Omni-MATH": 0.35 + } + }, + { + "id": "amazon/nova-pro-v1:0", + "name": "Amazon Nova Pro", + "developer": "amazon", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.591, + "helm_capabilities/MMLU-Pro": 0.673, + "helm_capabilities/GPQA": 0.446, + "helm_capabilities/IFEval": 0.815, + "helm_capabilities/WildBench": 0.777, + "helm_capabilities/Omni-MATH": 0.242, + "helm_lite/Mean win rate": 0.885, + "helm_lite/NarrativeQA": 0.791, + "helm_lite/NaturalQuestions (closed-book)": 0.405, + "helm_lite/OpenbookQA": 0.96, + "helm_lite/MMLU": 0.758, + "helm_lite/MATH": 0.821, + "helm_lite/GSM8K": 0.87, + "helm_lite/LegalBench": 0.736, + "helm_lite/MedQA": 0.811, + "helm_lite/WMT 2014": 0.229, + "helm_mmlu/MMLU All Subjects": 0.82, + "helm_mmlu/Abstract Algebra": 0.69, + "helm_mmlu/Anatomy": 0.807, + "helm_mmlu/College Physics": 0.647, + "helm_mmlu/Computer Security": 0.84, + "helm_mmlu/Econometrics": 0.702, + "helm_mmlu/Global Facts": 0.54, + "helm_mmlu/Jurisprudence": 0.861, + "helm_mmlu/Philosophy": 0.826, + "helm_mmlu/Professional Psychology": 0.864, + "helm_mmlu/Us Foreign Policy": 0.93, + "helm_mmlu/Astronomy": 0.895, + "helm_mmlu/Business Ethics": 0.81, + "helm_mmlu/Clinical Knowledge": 0.875, + "helm_mmlu/Conceptual Physics": 0.851, + "helm_mmlu/Electrical Engineering": 0.8, + "helm_mmlu/Elementary Mathematics": 0.831, + "helm_mmlu/Formal Logic": 0.714, + "helm_mmlu/High School World History": 0.928, + "helm_mmlu/Human Sexuality": 0.885, + "helm_mmlu/International Law": 0.901, + "helm_mmlu/Logical Fallacies": 0.871, + "helm_mmlu/Machine Learning": 0.625, + "helm_mmlu/Management": 0.922, + "helm_mmlu/Marketing": 0.923, + "helm_mmlu/Medical Genetics": 0.87, + "helm_mmlu/Miscellaneous": 0.912, + "helm_mmlu/Moral Scenarios": 0.76, + "helm_mmlu/Nutrition": 0.866, + "helm_mmlu/Prehistory": 0.926, + "helm_mmlu/Public Relations": 0.8, + "helm_mmlu/Security Studies": 0.849, + "helm_mmlu/Sociology": 0.905, + "helm_mmlu/Virology": 0.59, + "helm_mmlu/World Religions": 0.877, + "helm_mmlu/Mean win rate": 0.975 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/amd.json b/data/developers/amd.json new file mode 100644 index 0000000000000000000000000000000000000000..ec85a4364a761e30f1999f359d0d247d8857e139 --- /dev/null +++ b/data/developers/amd.json @@ -0,0 +1,19 @@ +{ + "developer": "amd", + "models": [ + { + "id": "amd/AMD-Llama-135m", + "name": "AMD-Llama-135m", + "developer": "amd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1842, + "hfopenllm_v2/BBH": 0.2974, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.378, + "hfopenllm_v2/MMLU-PRO": 0.1169 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/anakin87.json b/data/developers/anakin87.json new file mode 100644 index 0000000000000000000000000000000000000000..451701c03c05f06014d98ab731d7b6819d4a5185 --- /dev/null +++ b/data/developers/anakin87.json @@ -0,0 +1,19 @@ +{ + "developer": "anakin87", + "models": [ + { + "id": "anakin87/gemma-2b-orpo", + "name": "gemma-2b-orpo", + "developer": "anakin87", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2478, + "hfopenllm_v2/BBH": 0.3426, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3728, + "hfopenllm_v2/MMLU-PRO": 0.1306 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/anthracite-org.json b/data/developers/anthracite-org.json new file mode 100644 index 0000000000000000000000000000000000000000..ea1ebc2ec69855fa07e44ad5f7f899ef47b408dd --- /dev/null +++ b/data/developers/anthracite-org.json @@ -0,0 +1,173 @@ +{ + "developer": "anthracite-org", + "models": [ + { + "id": "anthracite-org/magnum-v1-72b", + "name": "magnum-v1-72b", + "developer": "anthracite-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7606, + "hfopenllm_v2/BBH": 0.6982, + "hfopenllm_v2/MATH Level 5": 0.398, + "hfopenllm_v2/GPQA": 0.3909, + "hfopenllm_v2/MUSR": 0.4489, + "hfopenllm_v2/MMLU-PRO": 0.5486 + } + }, + { + "id": "anthracite-org/magnum-v2-12b", + "name": "magnum-v2-12b", + "developer": "anthracite-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3762, + "hfopenllm_v2/BBH": 0.5021, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4179, + "hfopenllm_v2/MMLU-PRO": 0.3167 + } + }, + { + "id": "anthracite-org/magnum-v2-72b", + "name": "magnum-v2-72b", + "developer": "anthracite-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.756, + "hfopenllm_v2/BBH": 0.7005, + "hfopenllm_v2/MATH Level 5": 0.3542, + "hfopenllm_v2/GPQA": 0.3859, + "hfopenllm_v2/MUSR": 0.4372, + "hfopenllm_v2/MMLU-PRO": 0.5456 + } + }, + { + "id": "anthracite-org/magnum-v2.5-12b-kto", + "name": "magnum-v2.5-12b-kto", + "developer": "anthracite-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3866, + "hfopenllm_v2/BBH": 0.5077, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4086, + "hfopenllm_v2/MMLU-PRO": 0.3215 + } + }, + { + "id": "anthracite-org/magnum-v3-27b-kto", + "name": "magnum-v3-27b-kto", + "developer": "anthracite-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5675, + "hfopenllm_v2/BBH": 0.586, + "hfopenllm_v2/MATH Level 5": 0.1813, + "hfopenllm_v2/GPQA": 0.3557, + "hfopenllm_v2/MUSR": 0.3855, + "hfopenllm_v2/MMLU-PRO": 0.4238 + } + }, + { + "id": "anthracite-org/magnum-v3-34b", + "name": "magnum-v3-34b", + "developer": "anthracite-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5115, + "hfopenllm_v2/BBH": 0.6088, + "hfopenllm_v2/MATH Level 5": 0.1949, + "hfopenllm_v2/GPQA": 0.3607, + "hfopenllm_v2/MUSR": 0.3872, + "hfopenllm_v2/MMLU-PRO": 0.4752 + } + }, + { + "id": "anthracite-org/magnum-v3-9b-chatml", + "name": "magnum-v3-9b-chatml", + "developer": "anthracite-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1275, + "hfopenllm_v2/BBH": 0.5428, + "hfopenllm_v2/MATH Level 5": 0.0695, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4432, + "hfopenllm_v2/MMLU-PRO": 0.4242 + } + }, + { + "id": "anthracite-org/magnum-v3-9b-customgemma2", + "name": "magnum-v3-9b-customgemma2", + "developer": "anthracite-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1273, + "hfopenllm_v2/BBH": 0.534, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4565, + "hfopenllm_v2/MMLU-PRO": 0.4205 + } + }, + { + "id": "anthracite-org/magnum-v4-12b", + "name": "magnum-v4-12b", + "developer": "anthracite-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3393, + "hfopenllm_v2/BBH": 0.5177, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4093, + "hfopenllm_v2/MMLU-PRO": 0.3604 + } + }, + { + "id": "anthracite-org/magnum-v4-22b", + "name": "magnum-v4-22b", + "developer": "anthracite-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5629, + "hfopenllm_v2/BBH": 0.5486, + "hfopenllm_v2/MATH Level 5": 0.2002, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4408, + "hfopenllm_v2/MMLU-PRO": 0.383 + } + }, + { + "id": "anthracite-org/magnum-v4-27b", + "name": "magnum-v4-27b", + "developer": "anthracite-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3454, + "hfopenllm_v2/BBH": 0.5867, + "hfopenllm_v2/MATH Level 5": 0.1798, + "hfopenllm_v2/GPQA": 0.37, + "hfopenllm_v2/MUSR": 0.438, + "hfopenllm_v2/MMLU-PRO": 0.4376 + } + }, + { + "id": "anthracite-org/magnum-v4-9b", + "name": "magnum-v4-9b", + "developer": "anthracite-org", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3503, + "hfopenllm_v2/BBH": 0.5336, + "hfopenllm_v2/MATH Level 5": 0.1307, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4516, + "hfopenllm_v2/MMLU-PRO": 0.3953 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/anthropic.json b/data/developers/anthropic.json new file mode 100644 index 0000000000000000000000000000000000000000..6ddacfee09f6754246b1911e3bf4c320d5aa4eeb --- /dev/null +++ b/data/developers/anthropic.json @@ -0,0 +1,750 @@ +{ + "developer": "anthropic", + "models": [ + { + "id": "anthropic/Opus 4.1", + "name": "Opus 4.1", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "ace/Overall Score": 0.4, + "ace/Gaming Score": 0.318 + } + }, + { + "id": "anthropic/Opus 4.5", + "name": "Opus 4.5", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "ace/Overall Score": 0.478, + "ace/Gaming Score": 0.391, + "apex-agents/Overall Pass@1": 0.184, + "apex-agents/Overall Pass@8": 0.34, + "apex-agents/Overall Mean Score": 0.348, + "apex-agents/Investment Banking Pass@1": 0.216, + "apex-agents/Management Consulting Pass@1": 0.132, + "apex-agents/Corporate Law Pass@1": 0.202, + "apex-agents/Corporate Lawyer Mean Score": 0.471, + "apex-v1/Medicine (MD) Score": 0.65 + } + }, + { + "id": "anthropic/Opus 4.6", + "name": "Opus 4.6", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Overall Pass@1": 0.298, + "apex-agents/Corporate Lawyer Mean Score": 0.502 + } + }, + { + "id": "anthropic/Sonnet 4.5", + "name": "Sonnet 4.5", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "ace/Overall Score": 0.44, + "ace/Gaming Score": 0.373 + } + }, + { + "id": "anthropic/claude-2.0", + "name": "Claude 2.0", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.489, + "helm_lite/NarrativeQA": 0.718, + "helm_lite/NaturalQuestions (closed-book)": 0.428, + "helm_lite/OpenbookQA": 0.862, + "helm_lite/MMLU": 0.639, + "helm_lite/MATH": 0.603, + "helm_lite/GSM8K": 0.583, + "helm_lite/LegalBench": 0.643, + "helm_lite/MedQA": 0.652, + "helm_lite/WMT 2014": 0.219 + } + }, + { + "id": "anthropic/claude-2.1", + "name": "Claude 2.1", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.437, + "helm_lite/NarrativeQA": 0.677, + "helm_lite/NaturalQuestions (closed-book)": 0.375, + "helm_lite/OpenbookQA": 0.872, + "helm_lite/MMLU": 0.643, + "helm_lite/MATH": 0.632, + "helm_lite/GSM8K": 0.604, + "helm_lite/LegalBench": 0.643, + "helm_lite/MedQA": 0.644, + "helm_lite/WMT 2014": 0.204, + "helm_mmlu/MMLU All Subjects": 0.735, + "helm_mmlu/Abstract Algebra": 0.4, + "helm_mmlu/Anatomy": 0.726, + "helm_mmlu/College Physics": 0.5, + "helm_mmlu/Computer Security": 0.81, + "helm_mmlu/Econometrics": 0.596, + "helm_mmlu/Global Facts": 0.55, + "helm_mmlu/Jurisprudence": 0.87, + "helm_mmlu/Philosophy": 0.794, + "helm_mmlu/Professional Psychology": 0.797, + "helm_mmlu/Us Foreign Policy": 0.92, + "helm_mmlu/Astronomy": 0.855, + "helm_mmlu/Business Ethics": 0.73, + "helm_mmlu/Clinical Knowledge": 0.785, + "helm_mmlu/Conceptual Physics": 0.766, + "helm_mmlu/Electrical Engineering": 0.724, + "helm_mmlu/Elementary Mathematics": 0.521, + "helm_mmlu/Formal Logic": 0.5, + "helm_mmlu/High School World History": 0.903, + "helm_mmlu/Human Sexuality": 0.847, + "helm_mmlu/International Law": 0.901, + "helm_mmlu/Logical Fallacies": 0.834, + "helm_mmlu/Machine Learning": 0.482, + "helm_mmlu/Management": 0.825, + "helm_mmlu/Marketing": 0.923, + "helm_mmlu/Medical Genetics": 0.81, + "helm_mmlu/Miscellaneous": 0.88, + "helm_mmlu/Moral Scenarios": 0.52, + "helm_mmlu/Nutrition": 0.781, + "helm_mmlu/Prehistory": 0.821, + "helm_mmlu/Public Relations": 0.773, + "helm_mmlu/Security Studies": 0.812, + "helm_mmlu/Sociology": 0.886, + "helm_mmlu/Virology": 0.554, + "helm_mmlu/World Religions": 0.854, + "helm_mmlu/Mean win rate": 0.048 + } + }, + { + "id": "anthropic/claude-3-5-haiku-20241022", + "name": "claude-3-5-haiku-20241022", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.6114, + "global-mmlu-lite/Culturally Sensitive": 0.5834, + "global-mmlu-lite/Culturally Agnostic": 0.6394, + "global-mmlu-lite/Arabic": 0.695, + "global-mmlu-lite/English": 0.485, + "global-mmlu-lite/Bengali": 0.675, + "global-mmlu-lite/German": 0.565, + "global-mmlu-lite/French": 0.61, + "global-mmlu-lite/Hindi": 0.6575, + "global-mmlu-lite/Indonesian": 0.5475, + "global-mmlu-lite/Italian": 0.48, + "global-mmlu-lite/Japanese": 0.655, + "global-mmlu-lite/Korean": 0.6575, + "global-mmlu-lite/Portuguese": 0.5225, + "global-mmlu-lite/Spanish": 0.485, + "global-mmlu-lite/Swahili": 0.69, + "global-mmlu-lite/Yoruba": 0.6675, + "global-mmlu-lite/Chinese": 0.69, + "global-mmlu-lite/Burmese": 0.7, + "helm_capabilities/Mean score": 0.549, + "helm_capabilities/MMLU-Pro": 0.605, + "helm_capabilities/GPQA": 0.363, + "helm_capabilities/IFEval": 0.792, + "helm_capabilities/WildBench": 0.76, + "helm_capabilities/Omni-MATH": 0.224, + "helm_lite/Mean win rate": 0.531, + "helm_lite/NarrativeQA": 0.763, + "helm_lite/NaturalQuestions (closed-book)": 0.344, + "helm_lite/OpenbookQA": 0.854, + "helm_lite/MMLU": 0.671, + "helm_lite/MATH": 0.872, + "helm_lite/GSM8K": 0.815, + "helm_lite/LegalBench": 0.631, + "helm_lite/MedQA": 0.722, + "helm_lite/WMT 2014": 0.135, + "helm_mmlu/MMLU All Subjects": 0.743, + "helm_mmlu/Abstract Algebra": 0.47, + "helm_mmlu/Anatomy": 0.793, + "helm_mmlu/College Physics": 0.52, + "helm_mmlu/Computer Security": 0.84, + "helm_mmlu/Econometrics": 0.596, + "helm_mmlu/Global Facts": 0.5, + "helm_mmlu/Jurisprudence": 0.861, + "helm_mmlu/Philosophy": 0.823, + "helm_mmlu/Professional Psychology": 0.825, + "helm_mmlu/Us Foreign Policy": 0.94, + "helm_mmlu/Astronomy": 0.829, + "helm_mmlu/Business Ethics": 0.8, + "helm_mmlu/Clinical Knowledge": 0.823, + "helm_mmlu/Conceptual Physics": 0.723, + "helm_mmlu/Electrical Engineering": 0.717, + "helm_mmlu/Elementary Mathematics": 0.561, + "helm_mmlu/Formal Logic": 0.619, + "helm_mmlu/High School World History": 0.882, + "helm_mmlu/Human Sexuality": 0.885, + "helm_mmlu/International Law": 0.884, + "helm_mmlu/Logical Fallacies": 0.822, + "helm_mmlu/Machine Learning": 0.518, + "helm_mmlu/Management": 0.845, + "helm_mmlu/Marketing": 0.897, + "helm_mmlu/Medical Genetics": 0.83, + "helm_mmlu/Miscellaneous": 0.905, + "helm_mmlu/Moral Scenarios": 0.476, + "helm_mmlu/Nutrition": 0.846, + "helm_mmlu/Prehistory": 0.877, + "helm_mmlu/Public Relations": 0.727, + "helm_mmlu/Security Studies": 0.792, + "helm_mmlu/Sociology": 0.905, + "helm_mmlu/Virology": 0.566, + "helm_mmlu/World Religions": 0.865, + "helm_mmlu/Mean win rate": 0.128 + } + }, + { + "id": "anthropic/claude-3-5-sonnet-20240620", + "name": "Claude 3.5 Sonnet 20240620", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.885, + "helm_lite/NarrativeQA": 0.746, + "helm_lite/NaturalQuestions (closed-book)": 0.502, + "helm_lite/OpenbookQA": 0.972, + "helm_lite/MMLU": 0.799, + "helm_lite/MATH": 0.813, + "helm_lite/GSM8K": 0.949, + "helm_lite/LegalBench": 0.707, + "helm_lite/MedQA": 0.825, + "helm_lite/WMT 2014": 0.229, + "helm_mmlu/MMLU All Subjects": 0.865, + "helm_mmlu/Abstract Algebra": 0.75, + "helm_mmlu/Anatomy": 0.844, + "helm_mmlu/College Physics": 0.696, + "helm_mmlu/Computer Security": 0.89, + "helm_mmlu/Econometrics": 0.807, + "helm_mmlu/Global Facts": 0.72, + "helm_mmlu/Jurisprudence": 0.889, + "helm_mmlu/Philosophy": 0.891, + "helm_mmlu/Professional Psychology": 0.922, + "helm_mmlu/Us Foreign Policy": 0.96, + "helm_mmlu/Astronomy": 0.961, + "helm_mmlu/Business Ethics": 0.85, + "helm_mmlu/Clinical Knowledge": 0.913, + "helm_mmlu/Conceptual Physics": 0.885, + "helm_mmlu/Electrical Engineering": 0.828, + "helm_mmlu/Elementary Mathematics": 0.892, + "helm_mmlu/Formal Logic": 0.698, + "helm_mmlu/High School World History": 0.954, + "helm_mmlu/Human Sexuality": 0.939, + "helm_mmlu/International Law": 0.959, + "helm_mmlu/Logical Fallacies": 0.926, + "helm_mmlu/Machine Learning": 0.786, + "helm_mmlu/Management": 0.942, + "helm_mmlu/Marketing": 0.949, + "helm_mmlu/Medical Genetics": 0.98, + "helm_mmlu/Miscellaneous": 0.962, + "helm_mmlu/Moral Scenarios": 0.882, + "helm_mmlu/Nutrition": 0.912, + "helm_mmlu/Prehistory": 0.951, + "helm_mmlu/Public Relations": 0.855, + "helm_mmlu/Security Studies": 0.878, + "helm_mmlu/Sociology": 0.96, + "helm_mmlu/Virology": 0.602, + "helm_mmlu/World Religions": 0.924, + "helm_mmlu/Mean win rate": 0.17, + "reward-bench/Score": 0.6466, + "reward-bench/Factuality": 0.5284, + "reward-bench/Precise IF": 0.3875, + "reward-bench/Math": 0.5683, + "reward-bench/Safety": 0.8519, + "reward-bench/Focus": 0.8697, + "reward-bench/Ties": 0.674 + } + }, + { + "id": "anthropic/claude-3-5-sonnet-20241022", + "name": "Claude 3.5 Sonnet 20241022", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.653, + "helm_capabilities/MMLU-Pro": 0.777, + "helm_capabilities/GPQA": 0.565, + "helm_capabilities/IFEval": 0.856, + "helm_capabilities/WildBench": 0.792, + "helm_capabilities/Omni-MATH": 0.276, + "helm_lite/Mean win rate": 0.846, + "helm_lite/NarrativeQA": 0.77, + "helm_lite/NaturalQuestions (closed-book)": 0.467, + "helm_lite/OpenbookQA": 0.966, + "helm_lite/MMLU": 0.809, + "helm_lite/MATH": 0.904, + "helm_lite/GSM8K": 0.956, + "helm_lite/LegalBench": 0.647, + "helm_lite/MedQA": 0.859, + "helm_lite/WMT 2014": 0.226, + "helm_mmlu/MMLU All Subjects": 0.873, + "helm_mmlu/Abstract Algebra": 0.78, + "helm_mmlu/Anatomy": 0.859, + "helm_mmlu/College Physics": 0.775, + "helm_mmlu/Computer Security": 0.87, + "helm_mmlu/Econometrics": 0.807, + "helm_mmlu/Global Facts": 0.8, + "helm_mmlu/Jurisprudence": 0.898, + "helm_mmlu/Philosophy": 0.891, + "helm_mmlu/Professional Psychology": 0.922, + "helm_mmlu/Us Foreign Policy": 0.96, + "helm_mmlu/Astronomy": 0.974, + "helm_mmlu/Business Ethics": 0.83, + "helm_mmlu/Clinical Knowledge": 0.928, + "helm_mmlu/Conceptual Physics": 0.906, + "helm_mmlu/Electrical Engineering": 0.848, + "helm_mmlu/Elementary Mathematics": 0.918, + "helm_mmlu/Formal Logic": 0.786, + "helm_mmlu/High School World History": 0.958, + "helm_mmlu/Human Sexuality": 0.939, + "helm_mmlu/International Law": 0.959, + "helm_mmlu/Logical Fallacies": 0.914, + "helm_mmlu/Machine Learning": 0.839, + "helm_mmlu/Management": 0.932, + "helm_mmlu/Marketing": 0.953, + "helm_mmlu/Medical Genetics": 0.96, + "helm_mmlu/Miscellaneous": 0.964, + "helm_mmlu/Moral Scenarios": 0.888, + "helm_mmlu/Nutrition": 0.922, + "helm_mmlu/Prehistory": 0.941, + "helm_mmlu/Public Relations": 0.8, + "helm_mmlu/Security Studies": 0.882, + "helm_mmlu/Sociology": 0.955, + "helm_mmlu/Virology": 0.584, + "helm_mmlu/World Religions": 0.901, + "helm_mmlu/Mean win rate": 0.311 + } + }, + { + "id": "anthropic/claude-3-7-sonnet-20250219", + "name": "claude-3-7-sonnet-20250219", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.8078, + "global-mmlu-lite/Culturally Sensitive": 0.7794, + "global-mmlu-lite/Culturally Agnostic": 0.8362, + "global-mmlu-lite/Arabic": 0.7925, + "global-mmlu-lite/English": 0.7625, + "global-mmlu-lite/Bengali": 0.825, + "global-mmlu-lite/German": 0.8125, + "global-mmlu-lite/French": 0.7675, + "global-mmlu-lite/Hindi": 0.805, + "global-mmlu-lite/Indonesian": 0.8175, + "global-mmlu-lite/Italian": 0.8225, + "global-mmlu-lite/Japanese": 0.8425, + "global-mmlu-lite/Korean": 0.83, + "global-mmlu-lite/Portuguese": 0.77, + "global-mmlu-lite/Spanish": 0.8075, + "global-mmlu-lite/Swahili": 0.8125, + "global-mmlu-lite/Yoruba": 0.81, + "global-mmlu-lite/Chinese": 0.835, + "global-mmlu-lite/Burmese": 0.8125, + "helm_capabilities/Mean score": 0.674, + "helm_capabilities/MMLU-Pro": 0.784, + "helm_capabilities/GPQA": 0.608, + "helm_capabilities/IFEval": 0.834, + "helm_capabilities/WildBench": 0.814, + "helm_capabilities/Omni-MATH": 0.33, + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.0, + "livecodebenchpro/Easy Problems": 0.28169014084507044, + "reward-bench/Score": 0.7539, + "reward-bench/Factuality": 0.7326, + "reward-bench/Precise IF": 0.5437, + "reward-bench/Math": 0.75, + "reward-bench/Safety": 0.9033, + "reward-bench/Focus": 0.9212, + "reward-bench/Ties": 0.6723 + } + }, + { + "id": "anthropic/claude-3-haiku-20240307", + "name": "Claude 3 Haiku 20240307", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.263, + "helm_lite/NarrativeQA": 0.244, + "helm_lite/NaturalQuestions (closed-book)": 0.144, + "helm_lite/OpenbookQA": 0.838, + "helm_lite/MMLU": 0.662, + "helm_lite/MATH": 0.131, + "helm_lite/GSM8K": 0.699, + "helm_lite/LegalBench": 0.46, + "helm_lite/MedQA": 0.702, + "helm_lite/WMT 2014": 0.148, + "helm_mmlu/MMLU All Subjects": 0.738, + "helm_mmlu/Abstract Algebra": 0.42, + "helm_mmlu/Anatomy": 0.711, + "helm_mmlu/College Physics": 0.48, + "helm_mmlu/Computer Security": 0.79, + "helm_mmlu/Econometrics": 0.632, + "helm_mmlu/Global Facts": 0.47, + "helm_mmlu/Jurisprudence": 0.861, + "helm_mmlu/Philosophy": 0.814, + "helm_mmlu/Professional Psychology": 0.802, + "helm_mmlu/Us Foreign Policy": 0.95, + "helm_mmlu/Astronomy": 0.901, + "helm_mmlu/Business Ethics": 0.78, + "helm_mmlu/Clinical Knowledge": 0.789, + "helm_mmlu/Conceptual Physics": 0.715, + "helm_mmlu/Electrical Engineering": 0.69, + "helm_mmlu/Elementary Mathematics": 0.558, + "helm_mmlu/Formal Logic": 0.579, + "helm_mmlu/High School World History": 0.878, + "helm_mmlu/Human Sexuality": 0.824, + "helm_mmlu/International Law": 0.901, + "helm_mmlu/Logical Fallacies": 0.791, + "helm_mmlu/Machine Learning": 0.589, + "helm_mmlu/Management": 0.874, + "helm_mmlu/Marketing": 0.91, + "helm_mmlu/Medical Genetics": 0.8, + "helm_mmlu/Miscellaneous": 0.893, + "helm_mmlu/Moral Scenarios": 0.502, + "helm_mmlu/Nutrition": 0.83, + "helm_mmlu/Prehistory": 0.824, + "helm_mmlu/Public Relations": 0.755, + "helm_mmlu/Security Studies": 0.808, + "helm_mmlu/Sociology": 0.9, + "helm_mmlu/Virology": 0.542, + "helm_mmlu/World Religions": 0.871, + "helm_mmlu/Mean win rate": 0.28, + "reward-bench/Score": 0.3711, + "reward-bench/Factuality": 0.4042, + "reward-bench/Precise IF": 0.2812, + "reward-bench/Math": 0.3552, + "reward-bench/Safety": 0.595, + "reward-bench/Focus": 0.501, + "reward-bench/Ties": 0.0899 + } + }, + { + "id": "anthropic/claude-3-opus-20240229", + "name": "Claude 3 Opus 20240229", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.683, + "helm_lite/NarrativeQA": 0.351, + "helm_lite/NaturalQuestions (closed-book)": 0.441, + "helm_lite/OpenbookQA": 0.956, + "helm_lite/MMLU": 0.768, + "helm_lite/MATH": 0.76, + "helm_lite/GSM8K": 0.924, + "helm_lite/LegalBench": 0.662, + "helm_lite/MedQA": 0.775, + "helm_lite/WMT 2014": 0.24, + "helm_mmlu/MMLU All Subjects": 0.846, + "helm_mmlu/Abstract Algebra": 0.64, + "helm_mmlu/Anatomy": 0.8, + "helm_mmlu/College Physics": 0.716, + "helm_mmlu/Computer Security": 0.85, + "helm_mmlu/Econometrics": 0.789, + "helm_mmlu/Global Facts": 0.66, + "helm_mmlu/Jurisprudence": 0.88, + "helm_mmlu/Philosophy": 0.9, + "helm_mmlu/Professional Psychology": 0.904, + "helm_mmlu/Us Foreign Policy": 0.96, + "helm_mmlu/Astronomy": 0.967, + "helm_mmlu/Business Ethics": 0.86, + "helm_mmlu/Clinical Knowledge": 0.879, + "helm_mmlu/Conceptual Physics": 0.881, + "helm_mmlu/Electrical Engineering": 0.814, + "helm_mmlu/Elementary Mathematics": 0.862, + "helm_mmlu/Formal Logic": 0.698, + "helm_mmlu/High School World History": 0.941, + "helm_mmlu/Human Sexuality": 0.908, + "helm_mmlu/International Law": 0.901, + "helm_mmlu/Logical Fallacies": 0.896, + "helm_mmlu/Machine Learning": 0.741, + "helm_mmlu/Management": 0.942, + "helm_mmlu/Marketing": 0.944, + "helm_mmlu/Medical Genetics": 0.93, + "helm_mmlu/Miscellaneous": 0.951, + "helm_mmlu/Moral Scenarios": 0.826, + "helm_mmlu/Nutrition": 0.925, + "helm_mmlu/Prehistory": 0.941, + "helm_mmlu/Public Relations": 0.827, + "helm_mmlu/Security Studies": 0.886, + "helm_mmlu/Sociology": 0.94, + "helm_mmlu/Virology": 0.578, + "helm_mmlu/World Religions": 0.901, + "helm_mmlu/Mean win rate": 0.014, + "reward-bench/Score": 0.5744, + "reward-bench/Factuality": 0.5389, + "reward-bench/Precise IF": 0.3312, + "reward-bench/Math": 0.5137, + "reward-bench/Safety": 0.8378, + "reward-bench/Focus": 0.6646, + "reward-bench/Ties": 0.5601 + } + }, + { + "id": "anthropic/claude-3-sonnet-20240229", + "name": "Claude 3 Sonnet 20240229", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.377, + "helm_lite/NarrativeQA": 0.111, + "helm_lite/NaturalQuestions (closed-book)": 0.028, + "helm_lite/OpenbookQA": 0.918, + "helm_lite/MMLU": 0.652, + "helm_lite/MATH": 0.084, + "helm_lite/GSM8K": 0.907, + "helm_lite/LegalBench": 0.49, + "helm_lite/MedQA": 0.684, + "helm_lite/WMT 2014": 0.218, + "helm_mmlu/MMLU All Subjects": 0.759, + "helm_mmlu/Abstract Algebra": 0.39, + "helm_mmlu/Anatomy": 0.711, + "helm_mmlu/College Physics": 0.559, + "helm_mmlu/Computer Security": 0.79, + "helm_mmlu/Econometrics": 0.64, + "helm_mmlu/Global Facts": 0.53, + "helm_mmlu/Jurisprudence": 0.861, + "helm_mmlu/Philosophy": 0.852, + "helm_mmlu/Professional Psychology": 0.814, + "helm_mmlu/Us Foreign Policy": 0.94, + "helm_mmlu/Astronomy": 0.855, + "helm_mmlu/Business Ethics": 0.82, + "helm_mmlu/Clinical Knowledge": 0.804, + "helm_mmlu/Conceptual Physics": 0.774, + "helm_mmlu/Electrical Engineering": 0.703, + "helm_mmlu/Elementary Mathematics": 0.635, + "helm_mmlu/Formal Logic": 0.579, + "helm_mmlu/High School World History": 0.895, + "helm_mmlu/Human Sexuality": 0.809, + "helm_mmlu/International Law": 0.909, + "helm_mmlu/Logical Fallacies": 0.853, + "helm_mmlu/Machine Learning": 0.643, + "helm_mmlu/Management": 0.922, + "helm_mmlu/Marketing": 0.85, + "helm_mmlu/Medical Genetics": 0.79, + "helm_mmlu/Miscellaneous": 0.872, + "helm_mmlu/Moral Scenarios": 0.626, + "helm_mmlu/Nutrition": 0.82, + "helm_mmlu/Prehistory": 0.864, + "helm_mmlu/Public Relations": 0.782, + "helm_mmlu/Security Studies": 0.865, + "helm_mmlu/Sociology": 0.905, + "helm_mmlu/Virology": 0.578, + "helm_mmlu/World Religions": 0.871, + "helm_mmlu/Mean win rate": 0.082 + } + }, + { + "id": "anthropic/claude-instant-1.2", + "name": "Claude Instant 1.2", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.399, + "helm_lite/NarrativeQA": 0.616, + "helm_lite/NaturalQuestions (closed-book)": 0.343, + "helm_lite/OpenbookQA": 0.844, + "helm_lite/MMLU": 0.631, + "helm_lite/MATH": 0.499, + "helm_lite/GSM8K": 0.721, + "helm_lite/LegalBench": 0.586, + "helm_lite/MedQA": 0.559, + "helm_lite/WMT 2014": 0.194, + "helm_mmlu/MMLU All Subjects": 0.688, + "helm_mmlu/Abstract Algebra": 0.37, + "helm_mmlu/Anatomy": 0.637, + "helm_mmlu/College Physics": 0.49, + "helm_mmlu/Computer Security": 0.76, + "helm_mmlu/Econometrics": 0.614, + "helm_mmlu/Global Facts": 0.38, + "helm_mmlu/Jurisprudence": 0.833, + "helm_mmlu/Philosophy": 0.756, + "helm_mmlu/Professional Psychology": 0.724, + "helm_mmlu/Us Foreign Policy": 0.9, + "helm_mmlu/Astronomy": 0.743, + "helm_mmlu/Business Ethics": 0.7, + "helm_mmlu/Clinical Knowledge": 0.709, + "helm_mmlu/Conceptual Physics": 0.613, + "helm_mmlu/Electrical Engineering": 0.641, + "helm_mmlu/Elementary Mathematics": 0.45, + "helm_mmlu/Formal Logic": 0.444, + "helm_mmlu/High School World History": 0.878, + "helm_mmlu/Human Sexuality": 0.794, + "helm_mmlu/International Law": 0.851, + "helm_mmlu/Logical Fallacies": 0.81, + "helm_mmlu/Machine Learning": 0.67, + "helm_mmlu/Management": 0.835, + "helm_mmlu/Marketing": 0.885, + "helm_mmlu/Medical Genetics": 0.71, + "helm_mmlu/Miscellaneous": 0.828, + "helm_mmlu/Moral Scenarios": 0.488, + "helm_mmlu/Nutrition": 0.735, + "helm_mmlu/Prehistory": 0.762, + "helm_mmlu/Public Relations": 0.627, + "helm_mmlu/Security Studies": 0.784, + "helm_mmlu/Sociology": 0.841, + "helm_mmlu/Virology": 0.548, + "helm_mmlu/World Religions": 0.784, + "helm_mmlu/Mean win rate": 0.186 + } + }, + { + "id": "anthropic/claude-opus-4-1-20250805", + "name": "claude-opus-4-1-20250805", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.943, + "global-mmlu-lite/Culturally Sensitive": 0.9331, + "global-mmlu-lite/Culturally Agnostic": 0.9528, + "global-mmlu-lite/Arabic": 0.945, + "global-mmlu-lite/English": 0.9475, + "global-mmlu-lite/Bengali": 0.9425, + "global-mmlu-lite/German": 0.94, + "global-mmlu-lite/French": 0.945, + "global-mmlu-lite/Hindi": 0.9475, + "global-mmlu-lite/Indonesian": 0.9425, + "global-mmlu-lite/Italian": 0.94, + "global-mmlu-lite/Japanese": 0.94, + "global-mmlu-lite/Korean": 0.95, + "global-mmlu-lite/Portuguese": 0.945, + "global-mmlu-lite/Spanish": 0.945, + "global-mmlu-lite/Swahili": 0.93, + "global-mmlu-lite/Yoruba": 0.9375, + "global-mmlu-lite/Chinese": 0.945, + "global-mmlu-lite/Burmese": 0.945 + } + }, + { + "id": "anthropic/claude-opus-4-20250514", + "name": "Claude 4 Opus 20250514", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.757, + "helm_capabilities/MMLU-Pro": 0.859, + "helm_capabilities/GPQA": 0.666, + "helm_capabilities/IFEval": 0.918, + "helm_capabilities/WildBench": 0.833, + "helm_capabilities/Omni-MATH": 0.511, + "reward-bench/Score": 0.7648, + "reward-bench/Factuality": 0.8267, + "reward-bench/Precise IF": 0.4188, + "reward-bench/Math": 0.7491, + "reward-bench/Safety": 0.8954, + "reward-bench/Focus": 0.8616, + "reward-bench/Ties": 0.8375 + } + }, + { + "id": "anthropic/claude-opus-4-20250514-thinking-10k", + "name": "Claude 4 Opus 20250514, extended thinking", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.78, + "helm_capabilities/MMLU-Pro": 0.875, + "helm_capabilities/GPQA": 0.709, + "helm_capabilities/IFEval": 0.849, + "helm_capabilities/WildBench": 0.852, + "helm_capabilities/Omni-MATH": 0.616 + } + }, + { + "id": "anthropic/claude-sonnet-4-20250514", + "name": "claude-sonnet-4-20250514", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.9058, + "global-mmlu-lite/Culturally Sensitive": 0.8913, + "global-mmlu-lite/Culturally Agnostic": 0.9203, + "global-mmlu-lite/Arabic": 0.9125, + "global-mmlu-lite/English": 0.905, + "global-mmlu-lite/Bengali": 0.9075, + "global-mmlu-lite/German": 0.9125, + "global-mmlu-lite/French": 0.91, + "global-mmlu-lite/Hindi": 0.9, + "global-mmlu-lite/Indonesian": 0.9025, + "global-mmlu-lite/Italian": 0.9075, + "global-mmlu-lite/Japanese": 0.9, + "global-mmlu-lite/Korean": 0.9125, + "global-mmlu-lite/Portuguese": 0.91, + "global-mmlu-lite/Spanish": 0.9075, + "global-mmlu-lite/Swahili": 0.8975, + "global-mmlu-lite/Yoruba": 0.8975, + "global-mmlu-lite/Chinese": 0.9175, + "global-mmlu-lite/Burmese": 0.8925, + "helm_capabilities/Mean score": 0.733, + "helm_capabilities/MMLU-Pro": 0.843, + "helm_capabilities/GPQA": 0.643, + "helm_capabilities/IFEval": 0.839, + "helm_capabilities/WildBench": 0.825, + "helm_capabilities/Omni-MATH": 0.512, + "reward-bench/Score": 0.7117, + "reward-bench/Factuality": 0.7612, + "reward-bench/Precise IF": 0.3594, + "reward-bench/Math": 0.7049, + "reward-bench/Safety": 0.8909, + "reward-bench/Focus": 0.7596, + "reward-bench/Ties": 0.7939 + } + }, + { + "id": "anthropic/claude-sonnet-4-20250514-thinking-10k", + "name": "Claude 4 Sonnet 20250514, extended thinking", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.766, + "helm_capabilities/MMLU-Pro": 0.843, + "helm_capabilities/GPQA": 0.706, + "helm_capabilities/IFEval": 0.84, + "helm_capabilities/WildBench": 0.838, + "helm_capabilities/Omni-MATH": 0.602 + } + }, + { + "id": "anthropic/claude-sonnet-4-5-20250929", + "name": "claude-sonnet-4-5-20250929", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.0, + "livecodebenchpro/Easy Problems": 0.5352 + } + }, + { + "id": "anthropic/claude-v1.3", + "name": "Anthropic Claude v1.3", + "developer": "anthropic", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_instruct/Mean win rate": 0.611, + "helm_instruct/Anthropic RLHF dataset": 4.965, + "helm_instruct/Best ChatGPT Prompts": 4.995, + "helm_instruct/Koala test dataset": 4.981, + "helm_instruct/Open Assistant": 4.975, + "helm_instruct/Self Instruct": 4.992, + "helm_instruct/Vicuna": 4.989, + "helm_lite/Mean win rate": 0.518, + "helm_lite/NarrativeQA": 0.723, + "helm_lite/NaturalQuestions (closed-book)": 0.409, + "helm_lite/OpenbookQA": 0.908, + "helm_lite/MMLU": 0.631, + "helm_lite/MATH": 0.54, + "helm_lite/GSM8K": 0.784, + "helm_lite/LegalBench": 0.629, + "helm_lite/MedQA": 0.618, + "helm_lite/WMT 2014": 0.219 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/apple.json b/data/developers/apple.json new file mode 100644 index 0000000000000000000000000000000000000000..bfdde752f718aeb716116ad152a8ca94d3a244e2 --- /dev/null +++ b/data/developers/apple.json @@ -0,0 +1,19 @@ +{ + "developer": "apple", + "models": [ + { + "id": "apple/DCLM-7B", + "name": "DCLM-7B", + "developer": "apple", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2173, + "hfopenllm_v2/BBH": 0.4232, + "hfopenllm_v2/MATH Level 5": 0.037, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.3921, + "hfopenllm_v2/MMLU-PRO": 0.3111 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/applied-compute.json b/data/developers/applied-compute.json new file mode 100644 index 0000000000000000000000000000000000000000..45897ff6f4efc8574ee8e8aa6faebd24a5bd9681 --- /dev/null +++ b/data/developers/applied-compute.json @@ -0,0 +1,17 @@ +{ + "developer": "applied-compute", + "models": [ + { + "id": "applied-compute/Applied Compute: Small", + "name": "Applied Compute: Small", + "developer": "applied-compute", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Overall Pass@1": 0.23, + "apex-agents/Overall Mean Score": 0.401, + "apex-agents/Corporate Law Pass@1": 0.266, + "apex-agents/Corporate Lawyer Mean Score": 0.548 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/appvoid.json b/data/developers/appvoid.json new file mode 100644 index 0000000000000000000000000000000000000000..cd81f56e4efd66ed71d5b9dc55a6e2b36aa44396 --- /dev/null +++ b/data/developers/appvoid.json @@ -0,0 +1,33 @@ +{ + "developer": "appvoid", + "models": [ + { + "id": "appvoid/arco-2", + "name": "arco-2", + "developer": "appvoid", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1991, + "hfopenllm_v2/BBH": 0.3146, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2391, + "hfopenllm_v2/MUSR": 0.3536, + "hfopenllm_v2/MMLU-PRO": 0.1116 + } + }, + { + "id": "appvoid/arco-2-instruct", + "name": "arco-2-instruct", + "developer": "appvoid", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2164, + "hfopenllm_v2/BBH": 0.3133, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2383, + "hfopenllm_v2/MUSR": 0.3496, + "hfopenllm_v2/MMLU-PRO": 0.1113 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/arcee-ai.json b/data/developers/arcee-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..d8aa1016215919dac028d356ce0e4ec137bb50c2 --- /dev/null +++ b/data/developers/arcee-ai.json @@ -0,0 +1,159 @@ +{ + "developer": "arcee-ai", + "models": [ + { + "id": "arcee-ai/Arcee-Blitz", + "name": "Arcee-Blitz", + "developer": "arcee-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5543, + "hfopenllm_v2/BBH": 0.6607, + "hfopenllm_v2/MATH Level 5": 0.3482, + "hfopenllm_v2/GPQA": 0.3851, + "hfopenllm_v2/MUSR": 0.5047, + "hfopenllm_v2/MMLU-PRO": 0.6154 + } + }, + { + "id": "arcee-ai/Arcee-Maestro-7B-Preview", + "name": "Arcee-Maestro-7B-Preview", + "developer": "arcee-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.275, + "hfopenllm_v2/BBH": 0.4648, + "hfopenllm_v2/MATH Level 5": 0.4992, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.3885, + "hfopenllm_v2/MMLU-PRO": 0.3039 + } + }, + { + "id": "arcee-ai/Arcee-Nova", + "name": "Arcee-Nova", + "developer": "arcee-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7907, + "hfopenllm_v2/BBH": 0.6942, + "hfopenllm_v2/MATH Level 5": 0.4381, + "hfopenllm_v2/GPQA": 0.3851, + "hfopenllm_v2/MUSR": 0.4562, + "hfopenllm_v2/MMLU-PRO": 0.5452 + } + }, + { + "id": "arcee-ai/Arcee-Spark", + "name": "Arcee-Spark", + "developer": "arcee-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5718, + "hfopenllm_v2/BBH": 0.5481, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4008, + "hfopenllm_v2/MMLU-PRO": 0.3813 + } + }, + { + "id": "arcee-ai/Llama-3.1-SuperNova-Lite", + "name": "Llama-3.1-SuperNova-Lite", + "developer": "arcee-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8017, + "hfopenllm_v2/BBH": 0.5152, + "hfopenllm_v2/MATH Level 5": 0.1828, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4163, + "hfopenllm_v2/MMLU-PRO": 0.3877 + } + }, + { + "id": "arcee-ai/Llama-Spark", + "name": "Llama-Spark", + "developer": "arcee-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7911, + "hfopenllm_v2/BBH": 0.5054, + "hfopenllm_v2/MATH Level 5": 0.139, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.3593, + "hfopenllm_v2/MMLU-PRO": 0.3721 + } + }, + { + "id": "arcee-ai/SuperNova-Medius", + "name": "SuperNova-Medius", + "developer": "arcee-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7184, + "hfopenllm_v2/BBH": 0.6377, + "hfopenllm_v2/MATH Level 5": 0.469, + "hfopenllm_v2/GPQA": 0.3331, + "hfopenllm_v2/MUSR": 0.4233, + "hfopenllm_v2/MMLU-PRO": 0.5035 + } + }, + { + "id": "arcee-ai/Virtuoso-Lite", + "name": "Virtuoso-Lite", + "developer": "arcee-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.81, + "hfopenllm_v2/BBH": 0.6099, + "hfopenllm_v2/MATH Level 5": 0.253, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.4595, + "hfopenllm_v2/MMLU-PRO": 0.4441 + } + }, + { + "id": "arcee-ai/Virtuoso-Small", + "name": "Virtuoso-Small", + "developer": "arcee-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7935, + "hfopenllm_v2/BBH": 0.6518, + "hfopenllm_v2/MATH Level 5": 0.4094, + "hfopenllm_v2/GPQA": 0.3364, + "hfopenllm_v2/MUSR": 0.4339, + "hfopenllm_v2/MMLU-PRO": 0.5191 + } + }, + { + "id": "arcee-ai/Virtuoso-Small-v2", + "name": "Virtuoso-Small-v2", + "developer": "arcee-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8273, + "hfopenllm_v2/BBH": 0.6554, + "hfopenllm_v2/MATH Level 5": 0.466, + "hfopenllm_v2/GPQA": 0.3532, + "hfopenllm_v2/MUSR": 0.4313, + "hfopenllm_v2/MMLU-PRO": 0.5188 + } + }, + { + "id": "arcee-ai/raspberry-3B", + "name": "raspberry-3B", + "developer": "arcee-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3154, + "hfopenllm_v2/BBH": 0.4269, + "hfopenllm_v2/MATH Level 5": 0.1035, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4123, + "hfopenllm_v2/MMLU-PRO": 0.2854 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/argilla-warehouse.json b/data/developers/argilla-warehouse.json new file mode 100644 index 0000000000000000000000000000000000000000..cafd84ecedb50e04d94c0f7000f6f1b99795d5c2 --- /dev/null +++ b/data/developers/argilla-warehouse.json @@ -0,0 +1,19 @@ +{ + "developer": "argilla-warehouse", + "models": [ + { + "id": "argilla-warehouse/Llama-3.1-8B-MagPie-Ultra", + "name": "Llama-3.1-8B-MagPie-Ultra", + "developer": "argilla-warehouse", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5757, + "hfopenllm_v2/BBH": 0.462, + "hfopenllm_v2/MATH Level 5": 0.077, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3543, + "hfopenllm_v2/MMLU-PRO": 0.3144 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/argilla.json b/data/developers/argilla.json new file mode 100644 index 0000000000000000000000000000000000000000..8008c2c0602464001b097978abfb5c652a714ab1 --- /dev/null +++ b/data/developers/argilla.json @@ -0,0 +1,33 @@ +{ + "developer": "argilla", + "models": [ + { + "id": "argilla/notus-7b-v1", + "name": "notus-7b-v1", + "developer": "argilla", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5082, + "hfopenllm_v2/BBH": 0.4512, + "hfopenllm_v2/MATH Level 5": 0.0317, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3364, + "hfopenllm_v2/MMLU-PRO": 0.3004 + } + }, + { + "id": "argilla/notux-8x7b-v1", + "name": "notux-8x7b-v1", + "developer": "argilla", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5422, + "hfopenllm_v2/BBH": 0.5363, + "hfopenllm_v2/MATH Level 5": 0.0997, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4176, + "hfopenllm_v2/MMLU-PRO": 0.366 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/arisin.json b/data/developers/arisin.json new file mode 100644 index 0000000000000000000000000000000000000000..e516001be2b227004b679fcc822dc1d6ea19ea0f --- /dev/null +++ b/data/developers/arisin.json @@ -0,0 +1,19 @@ +{ + "developer": "arisin", + "models": [ + { + "id": "arisin/orca-platypus-13B-slerp", + "name": "orca-platypus-13B-slerp", + "developer": "arisin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2672, + "hfopenllm_v2/BBH": 0.4631, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4253, + "hfopenllm_v2/MMLU-PRO": 0.2592 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ark.json b/data/developers/ark.json new file mode 100644 index 0000000000000000000000000000000000000000..a7167e1b286fbdc55350fc0f8dc71e54f3b61330 --- /dev/null +++ b/data/developers/ark.json @@ -0,0 +1,16 @@ +{ + "developer": "ark", + "models": [ + { + "id": "ark/ep-20250603132404-cgpjm", + "name": "ep-20250603132404-cgpjm", + "developer": "ark", + "evaluator_relationship": null, + "benchmark_scores": { + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.0141, + "livecodebenchpro/Easy Problems": 0.507 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/arshiaafshani.json b/data/developers/arshiaafshani.json new file mode 100644 index 0000000000000000000000000000000000000000..1f3469e3647e1bc24484a7e92c7b1d2973239522 --- /dev/null +++ b/data/developers/arshiaafshani.json @@ -0,0 +1,19 @@ +{ + "developer": "arshiaafshani", + "models": [ + { + "id": "arshiaafshani/Arsh-V1", + "name": "Arsh-V1", + "developer": "arshiaafshani", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6043, + "hfopenllm_v2/BBH": 0.674, + "hfopenllm_v2/MATH Level 5": 0.2621, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.4899, + "hfopenllm_v2/MMLU-PRO": 0.5257 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/asharsha30.json b/data/developers/asharsha30.json new file mode 100644 index 0000000000000000000000000000000000000000..22896ed7153814cf1efa535557809de4c2229894 --- /dev/null +++ b/data/developers/asharsha30.json @@ -0,0 +1,19 @@ +{ + "developer": "asharsha30", + "models": [ + { + "id": "asharsha30/LLAMA_Harsha_8_B_ORDP_10k", + "name": "LLAMA_Harsha_8_B_ORDP_10k", + "developer": "asharsha30", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3464, + "hfopenllm_v2/BBH": 0.4669, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3697, + "hfopenllm_v2/MMLU-PRO": 0.281 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ashercn97.json b/data/developers/ashercn97.json new file mode 100644 index 0000000000000000000000000000000000000000..968df5c6a4a4a29fa6d509d913b24dd3850d9f23 --- /dev/null +++ b/data/developers/ashercn97.json @@ -0,0 +1,33 @@ +{ + "developer": "ashercn97", + "models": [ + { + "id": "ashercn97/a1-v0.0.1", + "name": "a1-v0.0.1", + "developer": "ashercn97", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2198, + "hfopenllm_v2/BBH": 0.5188, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.412, + "hfopenllm_v2/MMLU-PRO": 0.4165 + } + }, + { + "id": "ashercn97/a1-v002", + "name": "a1-v002", + "developer": "ashercn97", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2585, + "hfopenllm_v2/BBH": 0.5261, + "hfopenllm_v2/MATH Level 5": 0.2341, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4159, + "hfopenllm_v2/MMLU-PRO": 0.4175 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/assskelad.json b/data/developers/assskelad.json new file mode 100644 index 0000000000000000000000000000000000000000..62961e2dd41dec4193a73dc4e3fd21d1abe21a10 --- /dev/null +++ b/data/developers/assskelad.json @@ -0,0 +1,19 @@ +{ + "developer": "assskelad", + "models": [ + { + "id": "assskelad/smollm2-360M-sft_SmallThoughts", + "name": "smollm2-360M-sft_SmallThoughts", + "developer": "assskelad", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2007, + "hfopenllm_v2/BBH": 0.315, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3395, + "hfopenllm_v2/MMLU-PRO": 0.1182 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/athirdpath.json b/data/developers/athirdpath.json new file mode 100644 index 0000000000000000000000000000000000000000..71f45173dabd165d55432713c1b8b7bd1b5cda60 --- /dev/null +++ b/data/developers/athirdpath.json @@ -0,0 +1,19 @@ +{ + "developer": "athirdpath", + "models": [ + { + "id": "athirdpath/Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit", + "name": "Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit", + "developer": "athirdpath", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4521, + "hfopenllm_v2/BBH": 0.4939, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3864, + "hfopenllm_v2/MMLU-PRO": 0.3565 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/automerger.json b/data/developers/automerger.json new file mode 100644 index 0000000000000000000000000000000000000000..59ef7711446d3e502098944b67b8cdece7088cb8 --- /dev/null +++ b/data/developers/automerger.json @@ -0,0 +1,19 @@ +{ + "developer": "automerger", + "models": [ + { + "id": "automerger/YamshadowExperiment28-7B", + "name": "YamshadowExperiment28-7B", + "developer": "automerger", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.407, + "hfopenllm_v2/BBH": 0.515, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4306, + "hfopenllm_v2/MMLU-PRO": 0.306 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/avemio.json b/data/developers/avemio.json new file mode 100644 index 0000000000000000000000000000000000000000..8f56cb959cd209d06e114ab7f3b1732cb4178d36 --- /dev/null +++ b/data/developers/avemio.json @@ -0,0 +1,19 @@ +{ + "developer": "avemio", + "models": [ + { + "id": "avemio/GRAG-NEMO-12B-ORPO-HESSIAN-AI", + "name": "GRAG-NEMO-12B-ORPO-HESSIAN-AI", + "developer": "avemio", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0, + "hfopenllm_v2/BBH": 0.2607, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3447, + "hfopenllm_v2/MMLU-PRO": 0.1061 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/awnr.json b/data/developers/awnr.json new file mode 100644 index 0000000000000000000000000000000000000000..8c06c6f8375ba1d9602d9190503bb9a8a4e2be1c --- /dev/null +++ b/data/developers/awnr.json @@ -0,0 +1,75 @@ +{ + "developer": "awnr", + "models": [ + { + "id": "awnr/Mistral-7B-v0.1-signtensors-1-over-2", + "name": "Mistral-7B-v0.1-signtensors-1-over-2", + "developer": "awnr", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2179, + "hfopenllm_v2/BBH": 0.4423, + "hfopenllm_v2/MATH Level 5": 0.034, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4006, + "hfopenllm_v2/MMLU-PRO": 0.3 + } + }, + { + "id": "awnr/Mistral-7B-v0.1-signtensors-1-over-4", + "name": "Mistral-7B-v0.1-signtensors-1-over-4", + "developer": "awnr", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2133, + "hfopenllm_v2/BBH": 0.3507, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.346, + "hfopenllm_v2/MMLU-PRO": 0.2311 + } + }, + { + "id": "awnr/Mistral-7B-v0.1-signtensors-3-over-8", + "name": "Mistral-7B-v0.1-signtensors-3-over-8", + "developer": "awnr", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2394, + "hfopenllm_v2/BBH": 0.43, + "hfopenllm_v2/MATH Level 5": 0.0332, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.3818, + "hfopenllm_v2/MMLU-PRO": 0.3001 + } + }, + { + "id": "awnr/Mistral-7B-v0.1-signtensors-5-over-16", + "name": "Mistral-7B-v0.1-signtensors-5-over-16", + "developer": "awnr", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2118, + "hfopenllm_v2/BBH": 0.4124, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3686, + "hfopenllm_v2/MMLU-PRO": 0.2958 + } + }, + { + "id": "awnr/Mistral-7B-v0.1-signtensors-7-over-16", + "name": "Mistral-7B-v0.1-signtensors-7-over-16", + "developer": "awnr", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2294, + "hfopenllm_v2/BBH": 0.4316, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.3952, + "hfopenllm_v2/MMLU-PRO": 0.303 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/aws-prototyping.json b/data/developers/aws-prototyping.json new file mode 100644 index 0000000000000000000000000000000000000000..f6b88d0b982c2c112edab66cdfd44e4db0d9da5f --- /dev/null +++ b/data/developers/aws-prototyping.json @@ -0,0 +1,19 @@ +{ + "developer": "aws-prototyping", + "models": [ + { + "id": "aws-prototyping/MegaBeam-Mistral-7B-512k", + "name": "MegaBeam-Mistral-7B-512k", + "developer": "aws-prototyping", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5973, + "hfopenllm_v2/BBH": 0.3662, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3994, + "hfopenllm_v2/MMLU-PRO": 0.2589 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/axolotl-ai-co.json b/data/developers/axolotl-ai-co.json new file mode 100644 index 0000000000000000000000000000000000000000..9ec75b3ec427478e4061d744c4239ca43009e1be --- /dev/null +++ b/data/developers/axolotl-ai-co.json @@ -0,0 +1,19 @@ +{ + "developer": "axolotl-ai-co", + "models": [ + { + "id": "axolotl-ai-co/romulus-mistral-nemo-12b-simpo", + "name": "romulus-mistral-nemo-12b-simpo", + "developer": "axolotl-ai-co", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6079, + "hfopenllm_v2/BBH": 0.5395, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.4233, + "hfopenllm_v2/MMLU-PRO": 0.3469 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/baconnier.json b/data/developers/baconnier.json new file mode 100644 index 0000000000000000000000000000000000000000..e71ba7e1addf9b831d19fc8062eac0270044a36b --- /dev/null +++ b/data/developers/baconnier.json @@ -0,0 +1,33 @@ +{ + "developer": "baconnier", + "models": [ + { + "id": "baconnier/Napoleon_24B_V0.0", + "name": "Napoleon_24B_V0.0", + "developer": "baconnier", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1801, + "hfopenllm_v2/BBH": 0.6367, + "hfopenllm_v2/MATH Level 5": 0.2273, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.442, + "hfopenllm_v2/MMLU-PRO": 0.504 + } + }, + { + "id": "baconnier/Napoleon_24B_V0.2", + "name": "Napoleon_24B_V0.2", + "developer": "baconnier", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2527, + "hfopenllm_v2/BBH": 0.5911, + "hfopenllm_v2/MATH Level 5": 0.1435, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.446, + "hfopenllm_v2/MMLU-PRO": 0.4357 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/baebee.json b/data/developers/baebee.json new file mode 100644 index 0000000000000000000000000000000000000000..e4726f4d0644277ef595905d47aad6feb287568e --- /dev/null +++ b/data/developers/baebee.json @@ -0,0 +1,47 @@ +{ + "developer": "baebee", + "models": [ + { + "id": "baebee/7B-Cetacea", + "name": "7B-Cetacea", + "developer": "baebee", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5279, + "hfopenllm_v2/BBH": 0.4757, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4136, + "hfopenllm_v2/MMLU-PRO": 0.2955 + } + }, + { + "id": "baebee/mergekit-model_stock-nzjnheg", + "name": "mergekit-model_stock-nzjnheg", + "developer": "baebee", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4844, + "hfopenllm_v2/BBH": 0.5287, + "hfopenllm_v2/MATH Level 5": 0.1677, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3847, + "hfopenllm_v2/MMLU-PRO": 0.3699 + } + }, + { + "id": "baebee/mergekit-ties-fnjenli", + "name": "mergekit-ties-fnjenli", + "developer": "baebee", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1988, + "hfopenllm_v2/BBH": 0.3024, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.245, + "hfopenllm_v2/MUSR": 0.4019, + "hfopenllm_v2/MMLU-PRO": 0.1129 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/bamec66557.json b/data/developers/bamec66557.json new file mode 100644 index 0000000000000000000000000000000000000000..abb6d8bc4bef7b3243ccbc32e5d5155031c67bad --- /dev/null +++ b/data/developers/bamec66557.json @@ -0,0 +1,383 @@ +{ + "developer": "bamec66557", + "models": [ + { + "id": "bamec66557/MISCHIEVOUS-12B", + "name": "MISCHIEVOUS-12B", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3852, + "hfopenllm_v2/BBH": 0.5405, + "hfopenllm_v2/MATH Level 5": 0.1276, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4145, + "hfopenllm_v2/MMLU-PRO": 0.3672 + } + }, + { + "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.1v", + "name": "MISCHIEVOUS-12B-Mix_0.1v", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3636, + "hfopenllm_v2/BBH": 0.5436, + "hfopenllm_v2/MATH Level 5": 0.1329, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4132, + "hfopenllm_v2/MMLU-PRO": 0.3674 + } + }, + { + "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.2v", + "name": "MISCHIEVOUS-12B-Mix_0.2v", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3624, + "hfopenllm_v2/BBH": 0.5434, + "hfopenllm_v2/MATH Level 5": 0.1261, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.4158, + "hfopenllm_v2/MMLU-PRO": 0.3663 + } + }, + { + "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.3v", + "name": "MISCHIEVOUS-12B-Mix_0.3v", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.387, + "hfopenllm_v2/BBH": 0.5431, + "hfopenllm_v2/MATH Level 5": 0.1337, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4131, + "hfopenllm_v2/MMLU-PRO": 0.3664 + } + }, + { + "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.4v", + "name": "MISCHIEVOUS-12B-Mix_0.4v", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6508, + "hfopenllm_v2/BBH": 0.5094, + "hfopenllm_v2/MATH Level 5": 0.1352, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4176, + "hfopenllm_v2/MMLU-PRO": 0.3683 + } + }, + { + "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.5v", + "name": "MISCHIEVOUS-12B-Mix_0.5v", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3746, + "hfopenllm_v2/BBH": 0.5422, + "hfopenllm_v2/MATH Level 5": 0.1367, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4132, + "hfopenllm_v2/MMLU-PRO": 0.3661 + } + }, + { + "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.6v", + "name": "MISCHIEVOUS-12B-Mix_0.6v", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4366, + "hfopenllm_v2/BBH": 0.5449, + "hfopenllm_v2/MATH Level 5": 0.1254, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4185, + "hfopenllm_v2/MMLU-PRO": 0.3662 + } + }, + { + "id": "bamec66557/MISCHIEVOUS-12B-Mix_III_IV_V", + "name": "MISCHIEVOUS-12B-Mix_III_IV_V", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4031, + "hfopenllm_v2/BBH": 0.5465, + "hfopenllm_v2/MATH Level 5": 0.1292, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4198, + "hfopenllm_v2/MMLU-PRO": 0.3664 + } + }, + { + "id": "bamec66557/MISCHIEVOUS-12B-Mix_III_ex_V", + "name": "MISCHIEVOUS-12B-Mix_III_ex_V", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4316, + "hfopenllm_v2/BBH": 0.5449, + "hfopenllm_v2/MATH Level 5": 0.1322, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4198, + "hfopenllm_v2/MMLU-PRO": 0.3649 + } + }, + { + "id": "bamec66557/MISCHIEVOUS-12B-Mix_Neo", + "name": "MISCHIEVOUS-12B-Mix_Neo", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.625, + "hfopenllm_v2/BBH": 0.5078, + "hfopenllm_v2/MATH Level 5": 0.1329, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.415, + "hfopenllm_v2/MMLU-PRO": 0.3685 + } + }, + { + "id": "bamec66557/Mistral-Nemo-VICIOUS_MESH-12B-2407", + "name": "Mistral-Nemo-VICIOUS_MESH-12B-2407", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6706, + "hfopenllm_v2/BBH": 0.5156, + "hfopenllm_v2/MATH Level 5": 0.1367, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.431, + "hfopenllm_v2/MMLU-PRO": 0.3677 + } + }, + { + "id": "bamec66557/NameLess-12B-prob", + "name": "NameLess-12B-prob", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6602, + "hfopenllm_v2/BBH": 0.5158, + "hfopenllm_v2/MATH Level 5": 0.1261, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4336, + "hfopenllm_v2/MMLU-PRO": 0.3684 + } + }, + { + "id": "bamec66557/VICIOUS_MESH-12B", + "name": "VICIOUS_MESH-12B", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3716, + "hfopenllm_v2/BBH": 0.5436, + "hfopenllm_v2/MATH Level 5": 0.1344, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4105, + "hfopenllm_v2/MMLU-PRO": 0.3679 + } + }, + { + "id": "bamec66557/VICIOUS_MESH-12B-0.1v", + "name": "VICIOUS_MESH-12B-0.1v", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3657, + "hfopenllm_v2/BBH": 0.5412, + "hfopenllm_v2/MATH Level 5": 0.1322, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4158, + "hfopenllm_v2/MMLU-PRO": 0.3683 + } + }, + { + "id": "bamec66557/VICIOUS_MESH-12B-0.X.ver", + "name": "VICIOUS_MESH-12B-0.X.ver", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3776, + "hfopenllm_v2/BBH": 0.5416, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4198, + "hfopenllm_v2/MMLU-PRO": 0.3671 + } + }, + { + "id": "bamec66557/VICIOUS_MESH-12B-ALPHA", + "name": "VICIOUS_MESH-12B-ALPHA", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6365, + "hfopenllm_v2/BBH": 0.5094, + "hfopenllm_v2/MATH Level 5": 0.1367, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4203, + "hfopenllm_v2/MMLU-PRO": 0.3697 + } + }, + { + "id": "bamec66557/VICIOUS_MESH-12B-BETA", + "name": "VICIOUS_MESH-12B-BETA", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6721, + "hfopenllm_v2/BBH": 0.5156, + "hfopenllm_v2/MATH Level 5": 0.1329, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.431, + "hfopenllm_v2/MMLU-PRO": 0.3679 + } + }, + { + "id": "bamec66557/VICIOUS_MESH-12B-DELTA", + "name": "VICIOUS_MESH-12B-DELTA", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6469, + "hfopenllm_v2/BBH": 0.5055, + "hfopenllm_v2/MATH Level 5": 0.1375, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4057, + "hfopenllm_v2/MMLU-PRO": 0.3651 + } + }, + { + "id": "bamec66557/VICIOUS_MESH-12B-DIGAMMA", + "name": "VICIOUS_MESH-12B-DIGAMMA", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6429, + "hfopenllm_v2/BBH": 0.5061, + "hfopenllm_v2/MATH Level 5": 0.1337, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4097, + "hfopenllm_v2/MMLU-PRO": 0.3659 + } + }, + { + "id": "bamec66557/VICIOUS_MESH-12B-EPSILON", + "name": "VICIOUS_MESH-12B-EPSILON", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6305, + "hfopenllm_v2/BBH": 0.5038, + "hfopenllm_v2/MATH Level 5": 0.1261, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.407, + "hfopenllm_v2/MMLU-PRO": 0.3648 + } + }, + { + "id": "bamec66557/VICIOUS_MESH-12B-GAMMA", + "name": "VICIOUS_MESH-12B-GAMMA", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6362, + "hfopenllm_v2/BBH": 0.5182, + "hfopenllm_v2/MATH Level 5": 0.1307, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4363, + "hfopenllm_v2/MMLU-PRO": 0.3666 + } + }, + { + "id": "bamec66557/VICIOUS_MESH-12B-NEMO", + "name": "VICIOUS_MESH-12B-NEMO", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4022, + "hfopenllm_v2/BBH": 0.5442, + "hfopenllm_v2/MATH Level 5": 0.1269, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4251, + "hfopenllm_v2/MMLU-PRO": 0.3716 + } + }, + { + "id": "bamec66557/VICIOUS_MESH-12B-OMEGA", + "name": "VICIOUS_MESH-12B-OMEGA", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.67, + "hfopenllm_v2/BBH": 0.5166, + "hfopenllm_v2/MATH Level 5": 0.1344, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.4323, + "hfopenllm_v2/MMLU-PRO": 0.3677 + } + }, + { + "id": "bamec66557/VICIOUS_MESH-12B-UNION", + "name": "VICIOUS_MESH-12B-UNION", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6429, + "hfopenllm_v2/BBH": 0.5107, + "hfopenllm_v2/MATH Level 5": 0.139, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4257, + "hfopenllm_v2/MMLU-PRO": 0.3672 + } + }, + { + "id": "bamec66557/VICIOUS_MESH-12B_Razor", + "name": "VICIOUS_MESH-12B_Razor", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3736, + "hfopenllm_v2/BBH": 0.5447, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4092, + "hfopenllm_v2/MMLU-PRO": 0.3669 + } + }, + { + "id": "bamec66557/mergekit-model_stock-zdaysvi", + "name": "mergekit-model_stock-zdaysvi", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6426, + "hfopenllm_v2/BBH": 0.5063, + "hfopenllm_v2/MATH Level 5": 0.1352, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4124, + "hfopenllm_v2/MMLU-PRO": 0.3688 + } + }, + { + "id": "bamec66557/mergekit-ties-sinbkow", + "name": "mergekit-ties-sinbkow", + "developer": "bamec66557", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6432, + "hfopenllm_v2/BBH": 0.5092, + "hfopenllm_v2/MATH Level 5": 0.145, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4045, + "hfopenllm_v2/MMLU-PRO": 0.3603 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/belztjti.json b/data/developers/belztjti.json new file mode 100644 index 0000000000000000000000000000000000000000..c079f10ae86fc88cfd453883e20491bd3134d711 --- /dev/null +++ b/data/developers/belztjti.json @@ -0,0 +1,33 @@ +{ + "developer": "belztjti", + "models": [ + { + "id": "belztjti/dffghgjh", + "name": "dffghgjh", + "developer": "belztjti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5784, + "hfopenllm_v2/BBH": 0.3582, + "hfopenllm_v2/MATH Level 5": 0.0234, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3475, + "hfopenllm_v2/MMLU-PRO": 0.3422 + } + }, + { + "id": "belztjti/dtfgv", + "name": "dtfgv", + "developer": "belztjti", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3345, + "hfopenllm_v2/BBH": 0.3282, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3794, + "hfopenllm_v2/MMLU-PRO": 0.1504 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/benhaotang.json b/data/developers/benhaotang.json new file mode 100644 index 0000000000000000000000000000000000000000..3cefd0491456319d63ecbc33ab43d51d9c5f6854 --- /dev/null +++ b/data/developers/benhaotang.json @@ -0,0 +1,19 @@ +{ + "developer": "benhaotang", + "models": [ + { + "id": "benhaotang/phi4-qwq-sky-t1", + "name": "phi4-qwq-sky-t1", + "developer": "benhaotang", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.046, + "hfopenllm_v2/BBH": 0.6711, + "hfopenllm_v2/MATH Level 5": 0.4101, + "hfopenllm_v2/GPQA": 0.3951, + "hfopenllm_v2/MUSR": 0.49, + "hfopenllm_v2/MMLU-PRO": 0.5244 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/beomi.json b/data/developers/beomi.json new file mode 100644 index 0000000000000000000000000000000000000000..48772acd42eb341b4f10f2f3869c9336f1fc1969 --- /dev/null +++ b/data/developers/beomi.json @@ -0,0 +1,19 @@ +{ + "developer": "beomi", + "models": [ + { + "id": "beomi/gemma-mling-7b", + "name": "gemma-mling-7b", + "developer": "beomi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2029, + "hfopenllm_v2/BBH": 0.4068, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3759, + "hfopenllm_v2/MMLU-PRO": 0.2633 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/beowolx.json b/data/developers/beowolx.json new file mode 100644 index 0000000000000000000000000000000000000000..b4783772930dc66317186f829aae454ef8aa3927 --- /dev/null +++ b/data/developers/beowolx.json @@ -0,0 +1,19 @@ +{ + "developer": "beowolx", + "models": [ + { + "id": "beowolx/CodeNinja-1.0-OpenChat-7B", + "name": "CodeNinja-1.0-OpenChat-7B", + "developer": "beowolx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5447, + "hfopenllm_v2/BBH": 0.4441, + "hfopenllm_v2/MATH Level 5": 0.0672, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4243, + "hfopenllm_v2/MMLU-PRO": 0.3015 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/berkeley-nest.json b/data/developers/berkeley-nest.json new file mode 100644 index 0000000000000000000000000000000000000000..d8136759c2d7820038932fbb3bc27e3af2e373c3 --- /dev/null +++ b/data/developers/berkeley-nest.json @@ -0,0 +1,33 @@ +{ + "developer": "berkeley-nest", + "models": [ + { + "id": "berkeley-nest/Starling-LM-7B-alpha", + "name": "Starling-LM-7B-alpha", + "developer": "berkeley-nest", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.548, + "hfopenllm_v2/BBH": 0.444, + "hfopenllm_v2/MATH Level 5": 0.0838, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.412, + "hfopenllm_v2/MMLU-PRO": 0.3172 + } + }, + { + "id": "berkeley-nest/Starling-RM-7B-alpha", + "name": "berkeley-nest/Starling-RM-7B-alpha", + "developer": "berkeley-nest", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7113, + "reward-bench/Chat": 0.9804, + "reward-bench/Chat Hard": 0.4561, + "reward-bench/Safety": 0.8446, + "reward-bench/Reasoning": 0.58, + "reward-bench/Prior Sets (0.5 weight)": 0.6794 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/bfuzzy1.json b/data/developers/bfuzzy1.json new file mode 100644 index 0000000000000000000000000000000000000000..046908901f198eefd67e0383e1e29f9477b06c45 --- /dev/null +++ b/data/developers/bfuzzy1.json @@ -0,0 +1,103 @@ +{ + "developer": "bfuzzy1", + "models": [ + { + "id": "bfuzzy1/Gunny", + "name": "Gunny", + "developer": "bfuzzy1", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7129, + "hfopenllm_v2/BBH": 0.4546, + "hfopenllm_v2/MATH Level 5": 0.173, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.3583, + "hfopenllm_v2/MMLU-PRO": 0.3039 + } + }, + { + "id": "bfuzzy1/acheron", + "name": "acheron", + "developer": "bfuzzy1", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1983, + "hfopenllm_v2/BBH": 0.3108, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2391, + "hfopenllm_v2/MUSR": 0.3511, + "hfopenllm_v2/MMLU-PRO": 0.1096 + } + }, + { + "id": "bfuzzy1/acheron-c", + "name": "acheron-c", + "developer": "bfuzzy1", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1929, + "hfopenllm_v2/BBH": 0.3026, + "hfopenllm_v2/MATH Level 5": 0.003, + "hfopenllm_v2/GPQA": 0.2475, + "hfopenllm_v2/MUSR": 0.3382, + "hfopenllm_v2/MMLU-PRO": 0.1172 + } + }, + { + "id": "bfuzzy1/acheron-d", + "name": "acheron-d", + "developer": "bfuzzy1", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1925, + "hfopenllm_v2/BBH": 0.314, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2366, + "hfopenllm_v2/MUSR": 0.3497, + "hfopenllm_v2/MMLU-PRO": 0.1134 + } + }, + { + "id": "bfuzzy1/acheron-m", + "name": "acheron-m", + "developer": "bfuzzy1", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1758, + "hfopenllm_v2/BBH": 0.2928, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3487, + "hfopenllm_v2/MMLU-PRO": 0.1113 + } + }, + { + "id": "bfuzzy1/acheron-m1a-llama", + "name": "acheron-m1a-llama", + "developer": "bfuzzy1", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1125, + "hfopenllm_v2/BBH": 0.2956, + "hfopenllm_v2/MATH Level 5": 0.0076, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3633, + "hfopenllm_v2/MMLU-PRO": 0.1146 + } + }, + { + "id": "bfuzzy1/llambses-1", + "name": "llambses-1", + "developer": "bfuzzy1", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3554, + "hfopenllm_v2/BBH": 0.5047, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4529, + "hfopenllm_v2/MMLU-PRO": 0.314 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/bhuvneshsaini.json b/data/developers/bhuvneshsaini.json new file mode 100644 index 0000000000000000000000000000000000000000..db93b0aa253c65665bb6c422aba8d2220cea00cc --- /dev/null +++ b/data/developers/bhuvneshsaini.json @@ -0,0 +1,19 @@ +{ + "developer": "bhuvneshsaini", + "models": [ + { + "id": "bhuvneshsaini/merged_model", + "name": "merged_model", + "developer": "bhuvneshsaini", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1813, + "hfopenllm_v2/BBH": 0.336, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3497, + "hfopenllm_v2/MMLU-PRO": 0.1445 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/bigcode.json b/data/developers/bigcode.json new file mode 100644 index 0000000000000000000000000000000000000000..49c489c55c2d18098aa824060d42d3ff9fd39a20 --- /dev/null +++ b/data/developers/bigcode.json @@ -0,0 +1,47 @@ +{ + "developer": "bigcode", + "models": [ + { + "id": "bigcode/starcoder2-15b", + "name": "starcoder2-15b", + "developer": "bigcode", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.278, + "hfopenllm_v2/BBH": 0.4448, + "hfopenllm_v2/MATH Level 5": 0.0597, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3501, + "hfopenllm_v2/MMLU-PRO": 0.2353 + } + }, + { + "id": "bigcode/starcoder2-3b", + "name": "starcoder2-3b", + "developer": "bigcode", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2037, + "hfopenllm_v2/BBH": 0.3509, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2441, + "hfopenllm_v2/MUSR": 0.3435, + "hfopenllm_v2/MMLU-PRO": 0.1636 + } + }, + { + "id": "bigcode/starcoder2-7b", + "name": "starcoder2-7b", + "developer": "bigcode", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2209, + "hfopenllm_v2/BBH": 0.3661, + "hfopenllm_v2/MATH Level 5": 0.031, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3793, + "hfopenllm_v2/MMLU-PRO": 0.1642 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/bigscience.json b/data/developers/bigscience.json new file mode 100644 index 0000000000000000000000000000000000000000..5479e845729c72c79bcc469939f8a76fd1ab9a0d --- /dev/null +++ b/data/developers/bigscience.json @@ -0,0 +1,121 @@ +{ + "developer": "bigscience", + "models": [ + { + "id": "bigscience/BLOOM-176B", + "name": "BLOOM 176B", + "developer": "bigscience", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.446, + "helm_classic/MMLU": 0.299, + "helm_classic/BoolQ": 0.704, + "helm_classic/NarrativeQA": 0.662, + "helm_classic/NaturalQuestions (open-book)": 0.621, + "helm_classic/QuAC": 0.361, + "helm_classic/HellaSwag": 0.744, + "helm_classic/OpenbookQA": 0.534, + "helm_classic/TruthfulQA": 0.205, + "helm_classic/MS MARCO (TREC)": 0.386, + "helm_classic/CNN/DailyMail": 0.08, + "helm_classic/XSUM": 0.03, + "helm_classic/IMDB": 0.945, + "helm_classic/CivilComments": 0.62, + "helm_classic/RAFT": 0.592 + } + }, + { + "id": "bigscience/T0pp-11B", + "name": "T0pp 11B", + "developer": "bigscience", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.197, + "helm_classic/MMLU": 0.407, + "helm_classic/BoolQ": 0.0, + "helm_classic/NarrativeQA": 0.151, + "helm_classic/NaturalQuestions (open-book)": 0.19, + "helm_classic/QuAC": 0.121, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.377, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": 0.122, + "helm_classic/XSUM": 0.09, + "helm_classic/IMDB": 0.207, + "helm_classic/CivilComments": 0.234, + "helm_classic/RAFT": 0.118 + } + }, + { + "id": "bigscience/bloom-1b1", + "name": "bloom-1b1", + "developer": "bigscience", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1373, + "hfopenllm_v2/BBH": 0.3107, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.37, + "hfopenllm_v2/MMLU-PRO": 0.1108 + } + }, + { + "id": "bigscience/bloom-1b7", + "name": "bloom-1b7", + "developer": "bigscience", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1044, + "hfopenllm_v2/BBH": 0.3141, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3886, + "hfopenllm_v2/MMLU-PRO": 0.1086 + } + }, + { + "id": "bigscience/bloom-3b", + "name": "bloom-3b", + "developer": "bigscience", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1271, + "hfopenllm_v2/BBH": 0.3063, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2399, + "hfopenllm_v2/MUSR": 0.3981, + "hfopenllm_v2/MMLU-PRO": 0.1133 + } + }, + { + "id": "bigscience/bloom-560m", + "name": "bloom-560m", + "developer": "bigscience", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.062, + "hfopenllm_v2/BBH": 0.3026, + "hfopenllm_v2/MATH Level 5": 0.0038, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.4031, + "hfopenllm_v2/MMLU-PRO": 0.1164 + } + }, + { + "id": "bigscience/bloom-7b1", + "name": "bloom-7b1", + "developer": "bigscience", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1322, + "hfopenllm_v2/BBH": 0.3114, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3487, + "hfopenllm_v2/MMLU-PRO": 0.1105 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/bluuwhale.json b/data/developers/bluuwhale.json new file mode 100644 index 0000000000000000000000000000000000000000..b1b3fddc18204487786da7b6ea3631d8c4a75e1b --- /dev/null +++ b/data/developers/bluuwhale.json @@ -0,0 +1,19 @@ +{ + "developer": "bluuwhale", + "models": [ + { + "id": "bluuwhale/L3-SthenoMaid-8B-V1", + "name": "L3-SthenoMaid-8B-V1", + "developer": "bluuwhale", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7345, + "hfopenllm_v2/BBH": 0.5219, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3687, + "hfopenllm_v2/MMLU-PRO": 0.3656 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/bond005.json b/data/developers/bond005.json new file mode 100644 index 0000000000000000000000000000000000000000..63cb95c6e1355d070fbb69654eba15c592a3a05b --- /dev/null +++ b/data/developers/bond005.json @@ -0,0 +1,19 @@ +{ + "developer": "bond005", + "models": [ + { + "id": "bond005/meno-tiny-0.1", + "name": "meno-tiny-0.1", + "developer": "bond005", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.455, + "hfopenllm_v2/BBH": 0.4263, + "hfopenllm_v2/MATH Level 5": 0.139, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4185, + "hfopenllm_v2/MMLU-PRO": 0.2786 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/bosonai.json b/data/developers/bosonai.json new file mode 100644 index 0000000000000000000000000000000000000000..d7ccb9b0aaf5da4648f1403012b652cc4ff6f92e --- /dev/null +++ b/data/developers/bosonai.json @@ -0,0 +1,19 @@ +{ + "developer": "bosonai", + "models": [ + { + "id": "bosonai/Higgs-Llama-3-70B", + "name": "Higgs-Llama-3-70B", + "developer": "bosonai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5561, + "hfopenllm_v2/BBH": 0.6258, + "hfopenllm_v2/MATH Level 5": 0.2523, + "hfopenllm_v2/GPQA": 0.3666, + "hfopenllm_v2/MUSR": 0.4471, + "hfopenllm_v2/MMLU-PRO": 0.4902 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/braindao.json b/data/developers/braindao.json new file mode 100644 index 0000000000000000000000000000000000000000..d2c33ac7a285750646bf2c73b4059137679bcea8 --- /dev/null +++ b/data/developers/braindao.json @@ -0,0 +1,243 @@ +{ + "developer": "braindao", + "models": [ + { + "id": "braindao/DeepSeek-R1-Distill-Qwen-1.5B-Blunt", + "name": "DeepSeek-R1-Distill-Qwen-1.5B-Blunt", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2611, + "hfopenllm_v2/BBH": 0.2774, + "hfopenllm_v2/MATH Level 5": 0.1382, + "hfopenllm_v2/GPQA": 0.2475, + "hfopenllm_v2/MUSR": 0.3595, + "hfopenllm_v2/MMLU-PRO": 0.1184 + } + }, + { + "id": "braindao/DeepSeek-R1-Distill-Qwen-1.5B-Reflective", + "name": "DeepSeek-R1-Distill-Qwen-1.5B-Reflective", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3033, + "hfopenllm_v2/BBH": 0.2908, + "hfopenllm_v2/MATH Level 5": 0.1631, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3356, + "hfopenllm_v2/MMLU-PRO": 0.113 + } + }, + { + "id": "braindao/DeepSeek-R1-Distill-Qwen-14B", + "name": "DeepSeek-R1-Distill-Qwen-14B", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4172, + "hfopenllm_v2/BBH": 0.3033, + "hfopenllm_v2/MATH Level 5": 0.176, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.4488, + "hfopenllm_v2/MMLU-PRO": 0.1127 + } + }, + { + "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-ABUB-ST", + "name": "DeepSeek-R1-Distill-Qwen-14B-ABUB-ST", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3752, + "hfopenllm_v2/BBH": 0.4927, + "hfopenllm_v2/MATH Level 5": 0.5015, + "hfopenllm_v2/GPQA": 0.3448, + "hfopenllm_v2/MUSR": 0.4221, + "hfopenllm_v2/MMLU-PRO": 0.4243 + } + }, + { + "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt", + "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5612, + "hfopenllm_v2/BBH": 0.3283, + "hfopenllm_v2/MATH Level 5": 0.1639, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4554, + "hfopenllm_v2/MMLU-PRO": 0.1447 + } + }, + { + "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored", + "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5422, + "hfopenllm_v2/BBH": 0.317, + "hfopenllm_v2/MATH Level 5": 0.1631, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4487, + "hfopenllm_v2/MMLU-PRO": 0.1431 + } + }, + { + "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt", + "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5221, + "hfopenllm_v2/BBH": 0.3199, + "hfopenllm_v2/MATH Level 5": 0.2508, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.4527, + "hfopenllm_v2/MMLU-PRO": 0.1484 + } + }, + { + "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective", + "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.554, + "hfopenllm_v2/BBH": 0.3371, + "hfopenllm_v2/MATH Level 5": 0.2372, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4248, + "hfopenllm_v2/MMLU-PRO": 0.1504 + } + }, + { + "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective", + "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5139, + "hfopenllm_v2/BBH": 0.3013, + "hfopenllm_v2/MATH Level 5": 0.1473, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.4433, + "hfopenllm_v2/MMLU-PRO": 0.1289 + } + }, + { + "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Reflective", + "name": "DeepSeek-R1-Distill-Qwen-14B-Reflective", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.429, + "hfopenllm_v2/BBH": 0.3012, + "hfopenllm_v2/MATH Level 5": 0.1918, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.4554, + "hfopenllm_v2/MMLU-PRO": 0.1129 + } + }, + { + "id": "braindao/DeepSeek-R1-Distill-Qwen-7B", + "name": "DeepSeek-R1-Distill-Qwen-7B", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3968, + "hfopenllm_v2/BBH": 0.2887, + "hfopenllm_v2/MATH Level 5": 0.1918, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3767, + "hfopenllm_v2/MMLU-PRO": 0.1141 + } + }, + { + "id": "braindao/DeepSeek-R1-Distill-Qwen-7B-Blunt", + "name": "DeepSeek-R1-Distill-Qwen-7B-Blunt", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4266, + "hfopenllm_v2/BBH": 0.2902, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3885, + "hfopenllm_v2/MMLU-PRO": 0.1169 + } + }, + { + "id": "braindao/DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored", + "name": "DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3655, + "hfopenllm_v2/BBH": 0.2958, + "hfopenllm_v2/MATH Level 5": 0.1737, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3846, + "hfopenllm_v2/MMLU-PRO": 0.1133 + } + }, + { + "id": "braindao/DeepSeek-R1-Distill-Qwen-7B-Reflective", + "name": "DeepSeek-R1-Distill-Qwen-7B-Reflective", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3922, + "hfopenllm_v2/BBH": 0.2907, + "hfopenllm_v2/MATH Level 5": 0.2024, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.39, + "hfopenllm_v2/MMLU-PRO": 0.1155 + } + }, + { + "id": "braindao/Qwen2.5-14B", + "name": "Qwen2.5-14B", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5409, + "hfopenllm_v2/BBH": 0.5853, + "hfopenllm_v2/MATH Level 5": 0.2923, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.4124, + "hfopenllm_v2/MMLU-PRO": 0.4884 + } + }, + { + "id": "braindao/Qwen2.5-14B-Instruct", + "name": "Qwen2.5-14B-Instruct", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8143, + "hfopenllm_v2/BBH": 0.6404, + "hfopenllm_v2/MATH Level 5": 0.5529, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.414, + "hfopenllm_v2/MMLU-PRO": 0.4889 + } + }, + { + "id": "braindao/iq-code-evmind-0.5b", + "name": "iq-code-evmind-0.5b", + "developer": "braindao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3216, + "hfopenllm_v2/BBH": 0.3164, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2416, + "hfopenllm_v2/MUSR": 0.3304, + "hfopenllm_v2/MMLU-PRO": 0.1189 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/brgx53.json b/data/developers/brgx53.json new file mode 100644 index 0000000000000000000000000000000000000000..900b8b3bb64d152e8f11cf2a4b13ce2ea0c9b2a2 --- /dev/null +++ b/data/developers/brgx53.json @@ -0,0 +1,89 @@ +{ + "developer": "brgx53", + "models": [ + { + "id": "brgx53/3Bgeneral-ECE-PRYMMAL-Martial", + "name": "3Bgeneral-ECE-PRYMMAL-Martial", + "developer": "brgx53", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3289, + "hfopenllm_v2/BBH": 0.5458, + "hfopenllm_v2/MATH Level 5": 0.1314, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4373, + "hfopenllm_v2/MMLU-PRO": 0.3934 + } + }, + { + "id": "brgx53/3Bgeneralv2-ECE-PRYMMAL-Martial", + "name": "3Bgeneralv2-ECE-PRYMMAL-Martial", + "developer": "brgx53", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5677, + "hfopenllm_v2/BBH": 0.5607, + "hfopenllm_v2/MATH Level 5": 0.3497, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4356, + "hfopenllm_v2/MMLU-PRO": 0.4505 + } + }, + { + "id": "brgx53/3Blareneg-ECE-PRYMMAL-Martial", + "name": "3Blareneg-ECE-PRYMMAL-Martial", + "developer": "brgx53", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2876, + "hfopenllm_v2/BBH": 0.5358, + "hfopenllm_v2/MATH Level 5": 0.1208, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4429, + "hfopenllm_v2/MMLU-PRO": 0.4016 + } + }, + { + "id": "brgx53/3Blarenegv2-ECE-PRYMMAL-Martial", + "name": "3Blarenegv2-ECE-PRYMMAL-Martial", + "developer": "brgx53", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5662, + "hfopenllm_v2/BBH": 0.5607, + "hfopenllm_v2/MATH Level 5": 0.3497, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4356, + "hfopenllm_v2/MMLU-PRO": 0.4505 + } + }, + { + "id": "brgx53/Barracuda-PRYMMAL-ECE-TW3", + "name": "Barracuda-PRYMMAL-ECE-TW3", + "developer": "brgx53", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.164, + "hfopenllm_v2/BBH": 0.3002, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3609, + "hfopenllm_v2/MMLU-PRO": 0.1093 + } + }, + { + "id": "brgx53/LaConfiance-PRYMMAL-ECE-TW3", + "name": "LaConfiance-PRYMMAL-ECE-TW3", + "developer": "brgx53", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1579, + "hfopenllm_v2/BBH": 0.2962, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3846, + "hfopenllm_v2/MMLU-PRO": 0.1146 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/bunnycore.json b/data/developers/bunnycore.json new file mode 100644 index 0000000000000000000000000000000000000000..99fe58df58a117598126b277ff0e780c1f1338e7 --- /dev/null +++ b/data/developers/bunnycore.json @@ -0,0 +1,1195 @@ +{ + "developer": "bunnycore", + "models": [ + { + "id": "bunnycore/Best-Mix-Llama-3.1-8B", + "name": "Best-Mix-Llama-3.1-8B", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2067, + "hfopenllm_v2/BBH": 0.3432, + "hfopenllm_v2/MATH Level 5": 0.2054, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.2929, + "hfopenllm_v2/MMLU-PRO": 0.1565 + } + }, + { + "id": "bunnycore/Blabbertron-1.0", + "name": "Blabbertron-1.0", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7433, + "hfopenllm_v2/BBH": 0.5497, + "hfopenllm_v2/MATH Level 5": 0.4924, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4337, + "hfopenllm_v2/MMLU-PRO": 0.4354 + } + }, + { + "id": "bunnycore/Blabbertron-1.1", + "name": "Blabbertron-1.1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7265, + "hfopenllm_v2/BBH": 0.5534, + "hfopenllm_v2/MATH Level 5": 0.4804, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4416, + "hfopenllm_v2/MMLU-PRO": 0.4431 + } + }, + { + "id": "bunnycore/CyberCore-Qwen-2.1-7B", + "name": "CyberCore-Qwen-2.1-7B", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5766, + "hfopenllm_v2/BBH": 0.5572, + "hfopenllm_v2/MATH Level 5": 0.3588, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4145, + "hfopenllm_v2/MMLU-PRO": 0.4445 + } + }, + { + "id": "bunnycore/DeepQwen-3B-LCoT-SCE", + "name": "DeepQwen-3B-LCoT-SCE", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.449, + "hfopenllm_v2/BBH": 0.4512, + "hfopenllm_v2/MATH Level 5": 0.247, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3514, + "hfopenllm_v2/MMLU-PRO": 0.329 + } + }, + { + "id": "bunnycore/DeepSeek-R1-Distill-Qwen-7B-RRP-Ex", + "name": "DeepSeek-R1-Distill-Qwen-7B-RRP-Ex", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3901, + "hfopenllm_v2/BBH": 0.3494, + "hfopenllm_v2/MATH Level 5": 0.1654, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.3663, + "hfopenllm_v2/MMLU-PRO": 0.2508 + } + }, + { + "id": "bunnycore/DeepThinker-7B-Sce-v1", + "name": "DeepThinker-7B-Sce-v1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1218, + "hfopenllm_v2/BBH": 0.3018, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.4194, + "hfopenllm_v2/MMLU-PRO": 0.1123 + } + }, + { + "id": "bunnycore/DeepThinker-7B-Sce-v2", + "name": "DeepThinker-7B-Sce-v2", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1631, + "hfopenllm_v2/BBH": 0.3057, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.4101, + "hfopenllm_v2/MMLU-PRO": 0.1146 + } + }, + { + "id": "bunnycore/FuseCyberMix-Qwen-2.5-7B-Instruct", + "name": "FuseCyberMix-Qwen-2.5-7B-Instruct", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7019, + "hfopenllm_v2/BBH": 0.5518, + "hfopenllm_v2/MATH Level 5": 0.4841, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.402, + "hfopenllm_v2/MMLU-PRO": 0.4337 + } + }, + { + "id": "bunnycore/FuseQwQen-7B", + "name": "FuseQwQen-7B", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7275, + "hfopenllm_v2/BBH": 0.5504, + "hfopenllm_v2/MATH Level 5": 0.4366, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4217, + "hfopenllm_v2/MMLU-PRO": 0.4407 + } + }, + { + "id": "bunnycore/FwF-Qwen-7B-0.1", + "name": "FwF-Qwen-7B-0.1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3005, + "hfopenllm_v2/BBH": 0.5019, + "hfopenllm_v2/MATH Level 5": 0.2764, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3952, + "hfopenllm_v2/MMLU-PRO": 0.4061 + } + }, + { + "id": "bunnycore/FwF-Qwen-7B-0.2", + "name": "FwF-Qwen-7B-0.2", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4479, + "hfopenllm_v2/BBH": 0.5596, + "hfopenllm_v2/MATH Level 5": 0.426, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4218, + "hfopenllm_v2/MMLU-PRO": 0.4382 + } + }, + { + "id": "bunnycore/Gemma-2-2B-Smart", + "name": "Gemma-2-2B-Smart", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1321, + "hfopenllm_v2/BBH": 0.3974, + "hfopenllm_v2/MATH Level 5": 0.0332, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4249, + "hfopenllm_v2/MMLU-PRO": 0.2426 + } + }, + { + "id": "bunnycore/Gemma2-9B-TitanFusion", + "name": "Gemma2-9B-TitanFusion", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1618, + "hfopenllm_v2/BBH": 0.5712, + "hfopenllm_v2/MATH Level 5": 0.077, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.4136, + "hfopenllm_v2/MMLU-PRO": 0.396 + } + }, + { + "id": "bunnycore/HyperLlama-3.1-8B", + "name": "HyperLlama-3.1-8B", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7883, + "hfopenllm_v2/BBH": 0.5103, + "hfopenllm_v2/MATH Level 5": 0.1828, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.3829, + "hfopenllm_v2/MMLU-PRO": 0.3783 + } + }, + { + "id": "bunnycore/Llama-3.1-8B-TitanFusion-Mix", + "name": "Llama-3.1-8B-TitanFusion-Mix", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4925, + "hfopenllm_v2/BBH": 0.5756, + "hfopenllm_v2/MATH Level 5": 0.1284, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4317, + "hfopenllm_v2/MMLU-PRO": 0.3695 + } + }, + { + "id": "bunnycore/Llama-3.1-8B-TitanFusion-v3", + "name": "Llama-3.1-8B-TitanFusion-v3", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.481, + "hfopenllm_v2/BBH": 0.5262, + "hfopenllm_v2/MATH Level 5": 0.142, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4302, + "hfopenllm_v2/MMLU-PRO": 0.3806 + } + }, + { + "id": "bunnycore/Llama-3.2-3B-All-Mix", + "name": "Llama-3.2-3B-All-Mix", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7226, + "hfopenllm_v2/BBH": 0.4508, + "hfopenllm_v2/MATH Level 5": 0.1503, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3287, + "hfopenllm_v2/MMLU-PRO": 0.316 + } + }, + { + "id": "bunnycore/Llama-3.2-3B-Bespoke-Thought", + "name": "Llama-3.2-3B-Bespoke-Thought", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4113, + "hfopenllm_v2/BBH": 0.4522, + "hfopenllm_v2/MATH Level 5": 0.1647, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3302, + "hfopenllm_v2/MMLU-PRO": 0.311 + } + }, + { + "id": "bunnycore/Llama-3.2-3B-Booval", + "name": "Llama-3.2-3B-Booval", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6669, + "hfopenllm_v2/BBH": 0.4514, + "hfopenllm_v2/MATH Level 5": 0.1269, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3394, + "hfopenllm_v2/MMLU-PRO": 0.3058 + } + }, + { + "id": "bunnycore/Llama-3.2-3B-Deep-Test", + "name": "Llama-3.2-3B-Deep-Test", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4652, + "hfopenllm_v2/BBH": 0.4531, + "hfopenllm_v2/MATH Level 5": 0.1284, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3394, + "hfopenllm_v2/MMLU-PRO": 0.3152 + } + }, + { + "id": "bunnycore/Llama-3.2-3B-Della", + "name": "Llama-3.2-3B-Della", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3561, + "hfopenllm_v2/BBH": 0.3683, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3902, + "hfopenllm_v2/MMLU-PRO": 0.2128 + } + }, + { + "id": "bunnycore/Llama-3.2-3B-Long-Think", + "name": "Llama-3.2-3B-Long-Think", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5473, + "hfopenllm_v2/BBH": 0.461, + "hfopenllm_v2/MATH Level 5": 0.1458, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3396, + "hfopenllm_v2/MMLU-PRO": 0.3048 + } + }, + { + "id": "bunnycore/Llama-3.2-3B-Mix-Skill", + "name": "Llama-3.2-3B-Mix-Skill", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6404, + "hfopenllm_v2/BBH": 0.4582, + "hfopenllm_v2/MATH Level 5": 0.1473, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3396, + "hfopenllm_v2/MMLU-PRO": 0.3121 + } + }, + { + "id": "bunnycore/Llama-3.2-3B-ProdigyPlus", + "name": "Llama-3.2-3B-ProdigyPlus", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4015, + "hfopenllm_v2/BBH": 0.4392, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.358, + "hfopenllm_v2/MMLU-PRO": 0.2817 + } + }, + { + "id": "bunnycore/Llama-3.2-3B-ProdigyPlusPlus", + "name": "Llama-3.2-3B-ProdigyPlusPlus", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1645, + "hfopenllm_v2/BBH": 0.369, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3541, + "hfopenllm_v2/MMLU-PRO": 0.15 + } + }, + { + "id": "bunnycore/Llama-3.2-3B-RP-DeepThink", + "name": "Llama-3.2-3B-RP-DeepThink", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7144, + "hfopenllm_v2/BBH": 0.4563, + "hfopenllm_v2/MATH Level 5": 0.1609, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3302, + "hfopenllm_v2/MMLU-PRO": 0.3242 + } + }, + { + "id": "bunnycore/Llama-3.2-3B-RRStock", + "name": "Llama-3.2-3B-RRStock", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6657, + "hfopenllm_v2/BBH": 0.4568, + "hfopenllm_v2/MATH Level 5": 0.1699, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3314, + "hfopenllm_v2/MMLU-PRO": 0.3236 + } + }, + { + "id": "bunnycore/Llama-3.2-3B-ToxicKod", + "name": "Llama-3.2-3B-ToxicKod", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6319, + "hfopenllm_v2/BBH": 0.4525, + "hfopenllm_v2/MATH Level 5": 0.1699, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3475, + "hfopenllm_v2/MMLU-PRO": 0.288 + } + }, + { + "id": "bunnycore/Llama-3.2-3b-RP-Toxic-Fuse", + "name": "Llama-3.2-3b-RP-Toxic-Fuse", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6834, + "hfopenllm_v2/BBH": 0.465, + "hfopenllm_v2/MATH Level 5": 0.2402, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3954, + "hfopenllm_v2/MMLU-PRO": 0.3106 + } + }, + { + "id": "bunnycore/Maestro-S1k-7B-Sce", + "name": "Maestro-S1k-7B-Sce", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2523, + "hfopenllm_v2/BBH": 0.3104, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3768, + "hfopenllm_v2/MMLU-PRO": 0.117 + } + }, + { + "id": "bunnycore/Phi-3.5-mini-TitanFusion-0.1", + "name": "Phi-3.5-mini-TitanFusion-0.1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5228, + "hfopenllm_v2/BBH": 0.5374, + "hfopenllm_v2/MATH Level 5": 0.1186, + "hfopenllm_v2/GPQA": 0.3314, + "hfopenllm_v2/MUSR": 0.4453, + "hfopenllm_v2/MMLU-PRO": 0.3807 + } + }, + { + "id": "bunnycore/Phi-4-Model-Stock", + "name": "Phi-4-Model-Stock", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6879, + "hfopenllm_v2/BBH": 0.689, + "hfopenllm_v2/MATH Level 5": 0.4298, + "hfopenllm_v2/GPQA": 0.3549, + "hfopenllm_v2/MUSR": 0.4441, + "hfopenllm_v2/MMLU-PRO": 0.5368 + } + }, + { + "id": "bunnycore/Phi-4-Model-Stock-v2", + "name": "Phi-4-Model-Stock-v2", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6375, + "hfopenllm_v2/BBH": 0.6825, + "hfopenllm_v2/MATH Level 5": 0.3754, + "hfopenllm_v2/GPQA": 0.349, + "hfopenllm_v2/MUSR": 0.4662, + "hfopenllm_v2/MMLU-PRO": 0.5331 + } + }, + { + "id": "bunnycore/Phi-4-Model-Stock-v3", + "name": "Phi-4-Model-Stock-v3", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5912, + "hfopenllm_v2/BBH": 0.6726, + "hfopenllm_v2/MATH Level 5": 0.4902, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4166, + "hfopenllm_v2/MMLU-PRO": 0.5381 + } + }, + { + "id": "bunnycore/Phi-4-Model-Stock-v4", + "name": "Phi-4-Model-Stock-v4", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.711, + "hfopenllm_v2/BBH": 0.6924, + "hfopenllm_v2/MATH Level 5": 0.3829, + "hfopenllm_v2/GPQA": 0.3691, + "hfopenllm_v2/MUSR": 0.4611, + "hfopenllm_v2/MMLU-PRO": 0.5394 + } + }, + { + "id": "bunnycore/Phi-4-RP-v0", + "name": "Phi-4-RP-v0", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6827, + "hfopenllm_v2/BBH": 0.6856, + "hfopenllm_v2/MATH Level 5": 0.3316, + "hfopenllm_v2/GPQA": 0.3523, + "hfopenllm_v2/MUSR": 0.4141, + "hfopenllm_v2/MMLU-PRO": 0.5364 + } + }, + { + "id": "bunnycore/Phi-4-RR-Shoup", + "name": "Phi-4-RR-Shoup", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6587, + "hfopenllm_v2/BBH": 0.6947, + "hfopenllm_v2/MATH Level 5": 0.4992, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.444, + "hfopenllm_v2/MMLU-PRO": 0.5429 + } + }, + { + "id": "bunnycore/Phi-4-RStock-v0.1", + "name": "Phi-4-RStock-v0.1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7019, + "hfopenllm_v2/BBH": 0.6928, + "hfopenllm_v2/MATH Level 5": 0.395, + "hfopenllm_v2/GPQA": 0.3649, + "hfopenllm_v2/MUSR": 0.4584, + "hfopenllm_v2/MMLU-PRO": 0.5401 + } + }, + { + "id": "bunnycore/Phi-4-ReasoningRP", + "name": "Phi-4-ReasoningRP", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6736, + "hfopenllm_v2/BBH": 0.6922, + "hfopenllm_v2/MATH Level 5": 0.4569, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.4491, + "hfopenllm_v2/MMLU-PRO": 0.5421 + } + }, + { + "id": "bunnycore/Phi-4-Sce-exp-v0.1", + "name": "Phi-4-Sce-exp-v0.1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6595, + "hfopenllm_v2/BBH": 0.6943, + "hfopenllm_v2/MATH Level 5": 0.503, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.4441, + "hfopenllm_v2/MMLU-PRO": 0.5423 + } + }, + { + "id": "bunnycore/Phi-4-Stock-Ex", + "name": "Phi-4-Stock-Ex", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6575, + "hfopenllm_v2/BBH": 0.6864, + "hfopenllm_v2/MATH Level 5": 0.4086, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4624, + "hfopenllm_v2/MMLU-PRO": 0.5375 + } + }, + { + "id": "bunnycore/Phi-4-Stock-RP", + "name": "Phi-4-Stock-RP", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6399, + "hfopenllm_v2/BBH": 0.686, + "hfopenllm_v2/MATH Level 5": 0.3414, + "hfopenllm_v2/GPQA": 0.3582, + "hfopenllm_v2/MUSR": 0.4715, + "hfopenllm_v2/MMLU-PRO": 0.5317 + } + }, + { + "id": "bunnycore/Phi-4-Trim-Exp1", + "name": "Phi-4-Trim-Exp1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1219, + "hfopenllm_v2/BBH": 0.2852, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.255, + "hfopenllm_v2/MUSR": 0.4177, + "hfopenllm_v2/MMLU-PRO": 0.1147 + } + }, + { + "id": "bunnycore/Phi-Seek-4-Sce-V1", + "name": "Phi-Seek-4-Sce-V1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2935, + "hfopenllm_v2/BBH": 0.6459, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3982, + "hfopenllm_v2/MMLU-PRO": 0.5123 + } + }, + { + "id": "bunnycore/Qandora-2.5-7B-Creative", + "name": "Qandora-2.5-7B-Creative", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6803, + "hfopenllm_v2/BBH": 0.5542, + "hfopenllm_v2/MATH Level 5": 0.3059, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4212, + "hfopenllm_v2/MMLU-PRO": 0.448 + } + }, + { + "id": "bunnycore/QandoraExp-7B", + "name": "QandoraExp-7B", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7509, + "hfopenllm_v2/BBH": 0.5478, + "hfopenllm_v2/MATH Level 5": 0.4743, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4312, + "hfopenllm_v2/MMLU-PRO": 0.441 + } + }, + { + "id": "bunnycore/QandoraExp-7B-Persona", + "name": "QandoraExp-7B-Persona", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6247, + "hfopenllm_v2/BBH": 0.5558, + "hfopenllm_v2/MATH Level 5": 0.3104, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4372, + "hfopenllm_v2/MMLU-PRO": 0.4407 + } + }, + { + "id": "bunnycore/QandoraExp-7B-v2", + "name": "QandoraExp-7B-v2", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5607, + "hfopenllm_v2/BBH": 0.5445, + "hfopenllm_v2/MATH Level 5": 0.4713, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4045, + "hfopenllm_v2/MMLU-PRO": 0.3909 + } + }, + { + "id": "bunnycore/QwQen-3B-LCoT", + "name": "QwQen-3B-LCoT", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6025, + "hfopenllm_v2/BBH": 0.4899, + "hfopenllm_v2/MATH Level 5": 0.3618, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.4178, + "hfopenllm_v2/MMLU-PRO": 0.3699 + } + }, + { + "id": "bunnycore/QwQen-3B-LCoT-R1", + "name": "QwQen-3B-LCoT-R1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5342, + "hfopenllm_v2/BBH": 0.4799, + "hfopenllm_v2/MATH Level 5": 0.3353, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.4138, + "hfopenllm_v2/MMLU-PRO": 0.3723 + } + }, + { + "id": "bunnycore/Qwen-2.5-7B-Deep-Sky-T1", + "name": "Qwen-2.5-7B-Deep-Sky-T1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4208, + "hfopenllm_v2/BBH": 0.414, + "hfopenllm_v2/MATH Level 5": 0.0551, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4018, + "hfopenllm_v2/MMLU-PRO": 0.2104 + } + }, + { + "id": "bunnycore/Qwen-2.5-7B-Deep-Stock-v1", + "name": "Qwen-2.5-7B-Deep-Stock-v1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5695, + "hfopenllm_v2/BBH": 0.5361, + "hfopenllm_v2/MATH Level 5": 0.2644, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4109, + "hfopenllm_v2/MMLU-PRO": 0.4066 + } + }, + { + "id": "bunnycore/Qwen-2.5-7B-Deep-Stock-v4", + "name": "Qwen-2.5-7B-Deep-Stock-v4", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7753, + "hfopenllm_v2/BBH": 0.5453, + "hfopenllm_v2/MATH Level 5": 0.4894, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4127, + "hfopenllm_v2/MMLU-PRO": 0.4342 + } + }, + { + "id": "bunnycore/Qwen-2.5-7B-Deep-Stock-v5", + "name": "Qwen-2.5-7B-Deep-Stock-v5", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4509, + "hfopenllm_v2/BBH": 0.4672, + "hfopenllm_v2/MATH Level 5": 0.1473, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3648, + "hfopenllm_v2/MMLU-PRO": 0.2832 + } + }, + { + "id": "bunnycore/Qwen-2.5-7B-Exp-Sce", + "name": "Qwen-2.5-7B-Exp-Sce", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7652, + "hfopenllm_v2/BBH": 0.5506, + "hfopenllm_v2/MATH Level 5": 0.3255, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.443, + "hfopenllm_v2/MMLU-PRO": 0.4259 + } + }, + { + "id": "bunnycore/Qwen-2.5-7B-R1-Stock", + "name": "Qwen-2.5-7B-R1-Stock", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7573, + "hfopenllm_v2/BBH": 0.5393, + "hfopenllm_v2/MATH Level 5": 0.5008, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.3994, + "hfopenllm_v2/MMLU-PRO": 0.4294 + } + }, + { + "id": "bunnycore/Qwen-2.5-7B-Stock-Deep-Bespoke", + "name": "Qwen-2.5-7B-Stock-Deep-Bespoke", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5206, + "hfopenllm_v2/BBH": 0.492, + "hfopenllm_v2/MATH Level 5": 0.1888, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4068, + "hfopenllm_v2/MMLU-PRO": 0.358 + } + }, + { + "id": "bunnycore/Qwen-2.5-7b-S1k", + "name": "Qwen-2.5-7b-S1k", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7162, + "hfopenllm_v2/BBH": 0.5563, + "hfopenllm_v2/MATH Level 5": 0.4781, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4071, + "hfopenllm_v2/MMLU-PRO": 0.4382 + } + }, + { + "id": "bunnycore/Qwen2.5-1.5B-Model-Stock", + "name": "Qwen2.5-1.5B-Model-Stock", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1829, + "hfopenllm_v2/BBH": 0.2874, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3674, + "hfopenllm_v2/MMLU-PRO": 0.11 + } + }, + { + "id": "bunnycore/Qwen2.5-3B-Model-Stock", + "name": "Qwen2.5-3B-Model-Stock", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6381, + "hfopenllm_v2/BBH": 0.4712, + "hfopenllm_v2/MATH Level 5": 0.3799, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3942, + "hfopenllm_v2/MMLU-PRO": 0.325 + } + }, + { + "id": "bunnycore/Qwen2.5-3B-Model-Stock-v2", + "name": "Qwen2.5-3B-Model-Stock-v2", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.649, + "hfopenllm_v2/BBH": 0.4677, + "hfopenllm_v2/MATH Level 5": 0.3867, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.3915, + "hfopenllm_v2/MMLU-PRO": 0.327 + } + }, + { + "id": "bunnycore/Qwen2.5-3B-Model-Stock-v3.1", + "name": "Qwen2.5-3B-Model-Stock-v3.1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6481, + "hfopenllm_v2/BBH": 0.4737, + "hfopenllm_v2/MATH Level 5": 0.3897, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.3968, + "hfopenllm_v2/MMLU-PRO": 0.329 + } + }, + { + "id": "bunnycore/Qwen2.5-3B-Model-Stock-v3.2", + "name": "Qwen2.5-3B-Model-Stock-v3.2", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6353, + "hfopenllm_v2/BBH": 0.4727, + "hfopenllm_v2/MATH Level 5": 0.3754, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3928, + "hfopenllm_v2/MMLU-PRO": 0.3294 + } + }, + { + "id": "bunnycore/Qwen2.5-3B-Model-Stock-v4.1", + "name": "Qwen2.5-3B-Model-Stock-v4.1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6381, + "hfopenllm_v2/BBH": 0.482, + "hfopenllm_v2/MATH Level 5": 0.3769, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3941, + "hfopenllm_v2/MMLU-PRO": 0.3387 + } + }, + { + "id": "bunnycore/Qwen2.5-3B-RP-Mix", + "name": "Qwen2.5-3B-RP-Mix", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5721, + "hfopenllm_v2/BBH": 0.4894, + "hfopenllm_v2/MATH Level 5": 0.2153, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.4284, + "hfopenllm_v2/MMLU-PRO": 0.3728 + } + }, + { + "id": "bunnycore/Qwen2.5-3B-RP-Thinker", + "name": "Qwen2.5-3B-RP-Thinker", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5894, + "hfopenllm_v2/BBH": 0.4164, + "hfopenllm_v2/MATH Level 5": 0.3353, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3287, + "hfopenllm_v2/MMLU-PRO": 0.315 + } + }, + { + "id": "bunnycore/Qwen2.5-3B-RP-Thinker-V2", + "name": "Qwen2.5-3B-RP-Thinker-V2", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.642, + "hfopenllm_v2/BBH": 0.4678, + "hfopenllm_v2/MATH Level 5": 0.3829, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.3981, + "hfopenllm_v2/MMLU-PRO": 0.3271 + } + }, + { + "id": "bunnycore/Qwen2.5-7B-CyberRombos", + "name": "Qwen2.5-7B-CyberRombos", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7518, + "hfopenllm_v2/BBH": 0.5465, + "hfopenllm_v2/MATH Level 5": 0.4962, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4125, + "hfopenllm_v2/MMLU-PRO": 0.4391 + } + }, + { + "id": "bunnycore/Qwen2.5-7B-Fuse-Exp", + "name": "Qwen2.5-7B-Fuse-Exp", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5469, + "hfopenllm_v2/BBH": 0.5109, + "hfopenllm_v2/MATH Level 5": 0.3142, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.4573, + "hfopenllm_v2/MMLU-PRO": 0.3309 + } + }, + { + "id": "bunnycore/Qwen2.5-7B-Instruct-Fusion", + "name": "Qwen2.5-7B-Instruct-Fusion", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6962, + "hfopenllm_v2/BBH": 0.5492, + "hfopenllm_v2/MATH Level 5": 0.3406, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4297, + "hfopenllm_v2/MMLU-PRO": 0.4467 + } + }, + { + "id": "bunnycore/Qwen2.5-7B-Instruct-Merge-Stock-v0.1", + "name": "Qwen2.5-7B-Instruct-Merge-Stock-v0.1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7509, + "hfopenllm_v2/BBH": 0.5529, + "hfopenllm_v2/MATH Level 5": 0.4894, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4231, + "hfopenllm_v2/MMLU-PRO": 0.4383 + } + }, + { + "id": "bunnycore/Qwen2.5-7B-MixStock-Sce-V0.3", + "name": "Qwen2.5-7B-MixStock-Sce-V0.3", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.212, + "hfopenllm_v2/BBH": 0.3479, + "hfopenllm_v2/MATH Level 5": 0.2576, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3714, + "hfopenllm_v2/MMLU-PRO": 0.1779 + } + }, + { + "id": "bunnycore/Qwen2.5-7B-MixStock-V0.1", + "name": "Qwen2.5-7B-MixStock-V0.1", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7673, + "hfopenllm_v2/BBH": 0.5479, + "hfopenllm_v2/MATH Level 5": 0.3172, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4416, + "hfopenllm_v2/MMLU-PRO": 0.4256 + } + }, + { + "id": "bunnycore/Qwen2.5-7B-R1-Bespoke-Stock", + "name": "Qwen2.5-7B-R1-Bespoke-Stock", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3726, + "hfopenllm_v2/BBH": 0.4822, + "hfopenllm_v2/MATH Level 5": 0.2047, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.3926, + "hfopenllm_v2/MMLU-PRO": 0.3472 + } + }, + { + "id": "bunnycore/Qwen2.5-7B-R1-Bespoke-Task", + "name": "Qwen2.5-7B-R1-Bespoke-Task", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3787, + "hfopenllm_v2/BBH": 0.415, + "hfopenllm_v2/MATH Level 5": 0.1782, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3569, + "hfopenllm_v2/MMLU-PRO": 0.2688 + } + }, + { + "id": "bunnycore/Qwen2.5-7B-RRP-1M", + "name": "Qwen2.5-7B-RRP-1M", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7481, + "hfopenllm_v2/BBH": 0.5452, + "hfopenllm_v2/MATH Level 5": 0.3248, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4483, + "hfopenllm_v2/MMLU-PRO": 0.4266 + } + }, + { + "id": "bunnycore/Qwen2.5-7B-RRP-1M-Thinker", + "name": "Qwen2.5-7B-RRP-1M-Thinker", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2308, + "hfopenllm_v2/BBH": 0.3482, + "hfopenllm_v2/MATH Level 5": 0.2719, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3767, + "hfopenllm_v2/MMLU-PRO": 0.1769 + } + }, + { + "id": "bunnycore/Qwen2.5-7B-RRP-ID", + "name": "Qwen2.5-7B-RRP-ID", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7473, + "hfopenllm_v2/BBH": 0.548, + "hfopenllm_v2/MATH Level 5": 0.4864, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.418, + "hfopenllm_v2/MMLU-PRO": 0.4387 + } + }, + { + "id": "bunnycore/Qwen2.5-7B-Sky-R1-Mini", + "name": "Qwen2.5-7B-Sky-R1-Mini", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2305, + "hfopenllm_v2/BBH": 0.3503, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3448, + "hfopenllm_v2/MMLU-PRO": 0.1253 + } + }, + { + "id": "bunnycore/QwenMosaic-7B", + "name": "QwenMosaic-7B", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5819, + "hfopenllm_v2/BBH": 0.5564, + "hfopenllm_v2/MATH Level 5": 0.4441, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.4164, + "hfopenllm_v2/MMLU-PRO": 0.431 + } + }, + { + "id": "bunnycore/Smol-Llama-3.2-3B", + "name": "Smol-Llama-3.2-3B", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6679, + "hfopenllm_v2/BBH": 0.4539, + "hfopenllm_v2/MATH Level 5": 0.1382, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.346, + "hfopenllm_v2/MMLU-PRO": 0.3228 + } + }, + { + "id": "bunnycore/SmolLM2-1.7-Persona", + "name": "SmolLM2-1.7-Persona", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5465, + "hfopenllm_v2/BBH": 0.3623, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3341, + "hfopenllm_v2/MMLU-PRO": 0.1974 + } + }, + { + "id": "bunnycore/SmolLM2-1.7B-roleplay-lora", + "name": "SmolLM2-1.7B-roleplay-lora", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5382, + "hfopenllm_v2/BBH": 0.361, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3395, + "hfopenllm_v2/MMLU-PRO": 0.1966 + } + }, + { + "id": "bunnycore/Tulu-3.1-8B-SuperNova", + "name": "Tulu-3.1-8B-SuperNova", + "developer": "bunnycore", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8194, + "hfopenllm_v2/BBH": 0.5254, + "hfopenllm_v2/MATH Level 5": 0.2462, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.3935, + "hfopenllm_v2/MMLU-PRO": 0.3814 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/byroneverson.json b/data/developers/byroneverson.json new file mode 100644 index 0000000000000000000000000000000000000000..4b05f10e836206ccd27fd635e0336e8df718a806 --- /dev/null +++ b/data/developers/byroneverson.json @@ -0,0 +1,47 @@ +{ + "developer": "byroneverson", + "models": [ + { + "id": "byroneverson/Mistral-Small-Instruct-2409-abliterated", + "name": "Mistral-Small-Instruct-2409-abliterated", + "developer": "byroneverson", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6971, + "hfopenllm_v2/BBH": 0.5238, + "hfopenllm_v2/MATH Level 5": 0.2477, + "hfopenllm_v2/GPQA": 0.3331, + "hfopenllm_v2/MUSR": 0.3697, + "hfopenllm_v2/MMLU-PRO": 0.3923 + } + }, + { + "id": "byroneverson/Yi-1.5-9B-Chat-16K-abliterated", + "name": "Yi-1.5-9B-Chat-16K-abliterated", + "developer": "byroneverson", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5528, + "hfopenllm_v2/BBH": 0.5282, + "hfopenllm_v2/MATH Level 5": 0.1412, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4734, + "hfopenllm_v2/MMLU-PRO": 0.3823 + } + }, + { + "id": "byroneverson/Yi-1.5-9B-Chat-abliterated", + "name": "Yi-1.5-9B-Chat-abliterated", + "developer": "byroneverson", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5723, + "hfopenllm_v2/BBH": 0.5401, + "hfopenllm_v2/MATH Level 5": 0.1662, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4389, + "hfopenllm_v2/MMLU-PRO": 0.3715 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/c10x.json b/data/developers/c10x.json new file mode 100644 index 0000000000000000000000000000000000000000..cbbb9c520970bb1bab8afbfc9a39af2da05bb4b5 --- /dev/null +++ b/data/developers/c10x.json @@ -0,0 +1,33 @@ +{ + "developer": "c10x", + "models": [ + { + "id": "c10x/Q-Pluse", + "name": "Q-Pluse", + "developer": "c10x", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1123, + "hfopenllm_v2/BBH": 0.2875, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2466, + "hfopenllm_v2/MUSR": 0.3938, + "hfopenllm_v2/MMLU-PRO": 0.1135 + } + }, + { + "id": "c10x/longthinker", + "name": "longthinker", + "developer": "c10x", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3609, + "hfopenllm_v2/BBH": 0.4927, + "hfopenllm_v2/MATH Level 5": 0.2319, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.391, + "hfopenllm_v2/MMLU-PRO": 0.3527 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/carsenk.json b/data/developers/carsenk.json new file mode 100644 index 0000000000000000000000000000000000000000..e237086068941314940bb10d97f3842615a22d3d --- /dev/null +++ b/data/developers/carsenk.json @@ -0,0 +1,33 @@ +{ + "developer": "carsenk", + "models": [ + { + "id": "carsenk/flippa-v6", + "name": "flippa-v6", + "developer": "carsenk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3439, + "hfopenllm_v2/BBH": 0.5047, + "hfopenllm_v2/MATH Level 5": 0.1405, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4089, + "hfopenllm_v2/MMLU-PRO": 0.3668 + } + }, + { + "id": "carsenk/phi3.5_mini_exp_825_uncensored", + "name": "phi3.5_mini_exp_825_uncensored", + "developer": "carsenk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1364, + "hfopenllm_v2/BBH": 0.2965, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.3644, + "hfopenllm_v2/MMLU-PRO": 0.1175 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/cat-searcher.json b/data/developers/cat-searcher.json new file mode 100644 index 0000000000000000000000000000000000000000..1f3a47c36f373a2cecaea2380dd4d878e56fa04b --- /dev/null +++ b/data/developers/cat-searcher.json @@ -0,0 +1,33 @@ +{ + "developer": "cat-searcher", + "models": [ + { + "id": "cat-searcher/gemma-2-9b-it-sppo-iter-1", + "name": "gemma-2-9b-it-sppo-iter-1", + "developer": "cat-searcher", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3015, + "hfopenllm_v2/BBH": 0.5972, + "hfopenllm_v2/MATH Level 5": 0.0831, + "hfopenllm_v2/GPQA": 0.3448, + "hfopenllm_v2/MUSR": 0.3927, + "hfopenllm_v2/MMLU-PRO": 0.3854 + } + }, + { + "id": "cat-searcher/gemma-2-9b-it-sppo-iter-1-evol-1", + "name": "gemma-2-9b-it-sppo-iter-1-evol-1", + "developer": "cat-searcher", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2942, + "hfopenllm_v2/BBH": 0.5939, + "hfopenllm_v2/MATH Level 5": 0.0853, + "hfopenllm_v2/GPQA": 0.3406, + "hfopenllm_v2/MUSR": 0.3926, + "hfopenllm_v2/MMLU-PRO": 0.38 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/cckm.json b/data/developers/cckm.json new file mode 100644 index 0000000000000000000000000000000000000000..430066aff4bd7a50bac4899a6fc850677db7bcd4 --- /dev/null +++ b/data/developers/cckm.json @@ -0,0 +1,19 @@ +{ + "developer": "cckm", + "models": [ + { + "id": "cckm/tinymistral_950m", + "name": "tinymistral_950m", + "developer": "cckm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2395, + "hfopenllm_v2/BBH": 0.2969, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3554, + "hfopenllm_v2/MMLU-PRO": 0.1096 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/cgato.json b/data/developers/cgato.json new file mode 100644 index 0000000000000000000000000000000000000000..d6b97e4a6c18cc9b58fe1a44e92090c5d1ddaf6e --- /dev/null +++ b/data/developers/cgato.json @@ -0,0 +1,19 @@ +{ + "developer": "cgato", + "models": [ + { + "id": "cgato/TheSalt-L3-8b-v0.3.2", + "name": "TheSalt-L3-8b-v0.3.2", + "developer": "cgato", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2705, + "hfopenllm_v2/BBH": 0.2968, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3896, + "hfopenllm_v2/MMLU-PRO": 0.1139 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/chargoddard.json b/data/developers/chargoddard.json new file mode 100644 index 0000000000000000000000000000000000000000..9a102b01665825b5e9fb3ab649b4e74980fd10fe --- /dev/null +++ b/data/developers/chargoddard.json @@ -0,0 +1,19 @@ +{ + "developer": "chargoddard", + "models": [ + { + "id": "chargoddard/prometheus-2-llama-3-8b", + "name": "prometheus-2-llama-3-8b", + "developer": "chargoddard", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5289, + "hfopenllm_v2/BBH": 0.4931, + "hfopenllm_v2/MATH Level 5": 0.0823, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3396, + "hfopenllm_v2/MMLU-PRO": 0.3087 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/chujiezheng.json b/data/developers/chujiezheng.json new file mode 100644 index 0000000000000000000000000000000000000000..2412580a2c434d737cc73ba56d1639f7493622c3 --- /dev/null +++ b/data/developers/chujiezheng.json @@ -0,0 +1,33 @@ +{ + "developer": "chujiezheng", + "models": [ + { + "id": "chujiezheng/Llama-3-Instruct-8B-SimPO-ExPO", + "name": "Llama-3-Instruct-8B-SimPO-ExPO", + "developer": "chujiezheng", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6434, + "hfopenllm_v2/BBH": 0.4765, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.392, + "hfopenllm_v2/MMLU-PRO": 0.3401 + } + }, + { + "id": "chujiezheng/Mistral7B-PairRM-SPPO-ExPO", + "name": "Mistral7B-PairRM-SPPO-ExPO", + "developer": "chujiezheng", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3673, + "hfopenllm_v2/BBH": 0.3882, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.4055, + "hfopenllm_v2/MMLU-PRO": 0.2552 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/cjvt.json b/data/developers/cjvt.json new file mode 100644 index 0000000000000000000000000000000000000000..5c34b1e5dcdc6d70822aaae14d19f1a36f506730 --- /dev/null +++ b/data/developers/cjvt.json @@ -0,0 +1,19 @@ +{ + "developer": "cjvt", + "models": [ + { + "id": "cjvt/GaMS-1B", + "name": "GaMS-1B", + "developer": "cjvt", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1635, + "hfopenllm_v2/BBH": 0.3075, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3684, + "hfopenllm_v2/MMLU-PRO": 0.1149 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/cloudyu.json b/data/developers/cloudyu.json new file mode 100644 index 0000000000000000000000000000000000000000..68e7805d1e3407eeb7307ca844c0c82c3c4c6d1b --- /dev/null +++ b/data/developers/cloudyu.json @@ -0,0 +1,103 @@ +{ + "developer": "cloudyu", + "models": [ + { + "id": "cloudyu/Llama-3-70Bx2-MOE", + "name": "Llama-3-70Bx2-MOE", + "developer": "cloudyu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5482, + "hfopenllm_v2/BBH": 0.6636, + "hfopenllm_v2/MATH Level 5": 0.2175, + "hfopenllm_v2/GPQA": 0.3935, + "hfopenllm_v2/MUSR": 0.4812, + "hfopenllm_v2/MMLU-PRO": 0.5142 + } + }, + { + "id": "cloudyu/Llama-3.2-3Bx4", + "name": "Llama-3.2-3Bx4", + "developer": "cloudyu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5069, + "hfopenllm_v2/BBH": 0.4332, + "hfopenllm_v2/MATH Level 5": 0.1073, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3496, + "hfopenllm_v2/MMLU-PRO": 0.2985 + } + }, + { + "id": "cloudyu/Mixtral_11Bx2_MoE_19B", + "name": "Mixtral_11Bx2_MoE_19B", + "developer": "cloudyu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3851, + "hfopenllm_v2/BBH": 0.5209, + "hfopenllm_v2/MATH Level 5": 0.0672, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4297, + "hfopenllm_v2/MMLU-PRO": 0.3311 + } + }, + { + "id": "cloudyu/Mixtral_34Bx2_MoE_60B", + "name": "Mixtral_34Bx2_MoE_60B", + "developer": "cloudyu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4538, + "hfopenllm_v2/BBH": 0.587, + "hfopenllm_v2/MATH Level 5": 0.077, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4625, + "hfopenllm_v2/MMLU-PRO": 0.4766 + } + }, + { + "id": "cloudyu/Mixtral_7Bx2_MoE", + "name": "Mixtral_7Bx2_MoE", + "developer": "cloudyu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.448, + "hfopenllm_v2/BBH": 0.516, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4473, + "hfopenllm_v2/MMLU-PRO": 0.3044 + } + }, + { + "id": "cloudyu/S1-Llama-3.2-3Bx4-MoE", + "name": "S1-Llama-3.2-3Bx4-MoE", + "developer": "cloudyu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5302, + "hfopenllm_v2/BBH": 0.4358, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.3456, + "hfopenllm_v2/MMLU-PRO": 0.3044 + } + }, + { + "id": "cloudyu/Yi-34Bx2-MoE-60B-DPO", + "name": "Yi-34Bx2-MoE-60B-DPO", + "developer": "cloudyu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5319, + "hfopenllm_v2/BBH": 0.5168, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4375, + "hfopenllm_v2/MMLU-PRO": 0.4677 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/cluebbers.json b/data/developers/cluebbers.json new file mode 100644 index 0000000000000000000000000000000000000000..6f44b35eb07aa37d3d9f261199745296092d2cf1 --- /dev/null +++ b/data/developers/cluebbers.json @@ -0,0 +1,47 @@ +{ + "developer": "cluebbers", + "models": [ + { + "id": "cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-ipo", + "name": "Llama-3.1-8B-paraphrase-type-generation-apty-ipo", + "developer": "cluebbers", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1327, + "hfopenllm_v2/BBH": 0.38, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.4332, + "hfopenllm_v2/MMLU-PRO": 0.2591 + } + }, + { + "id": "cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid", + "name": "Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid", + "developer": "cluebbers", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1318, + "hfopenllm_v2/BBH": 0.3789, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.4306, + "hfopenllm_v2/MMLU-PRO": 0.2562 + } + }, + { + "id": "cluebbers/Llama-3.1-8B-paraphrase-type-generation-etpc", + "name": "Llama-3.1-8B-paraphrase-type-generation-etpc", + "developer": "cluebbers", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1209, + "hfopenllm_v2/BBH": 0.3781, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.4319, + "hfopenllm_v2/MMLU-PRO": 0.2556 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/cognitivecomputations.json b/data/developers/cognitivecomputations.json new file mode 100644 index 0000000000000000000000000000000000000000..82caa40acc22796b971b12d47ba02bdfcb85f10c --- /dev/null +++ b/data/developers/cognitivecomputations.json @@ -0,0 +1,243 @@ +{ + "developer": "cognitivecomputations", + "models": [ + { + "id": "cognitivecomputations/Dolphin3.0-Llama3.1-8B", + "name": "Dolphin3.0-Llama3.1-8B", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7621, + "hfopenllm_v2/BBH": 0.4916, + "hfopenllm_v2/MATH Level 5": 0.1231, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3653, + "hfopenllm_v2/MMLU-PRO": 0.2992 + } + }, + { + "id": "cognitivecomputations/Dolphin3.0-Llama3.2-1B", + "name": "Dolphin3.0-Llama3.2-1B", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5428, + "hfopenllm_v2/BBH": 0.3122, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.2299, + "hfopenllm_v2/MUSR": 0.3249, + "hfopenllm_v2/MMLU-PRO": 0.1375 + } + }, + { + "id": "cognitivecomputations/Dolphin3.0-Qwen2.5-0.5B", + "name": "Dolphin3.0-Qwen2.5-0.5B", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4697, + "hfopenllm_v2/BBH": 0.3114, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.2349, + "hfopenllm_v2/MUSR": 0.3555, + "hfopenllm_v2/MMLU-PRO": 0.1413 + } + }, + { + "id": "cognitivecomputations/Dolphin3.0-R1-Mistral-24B", + "name": "Dolphin3.0-R1-Mistral-24B", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4068, + "hfopenllm_v2/BBH": 0.536, + "hfopenllm_v2/MATH Level 5": 0.3119, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3952, + "hfopenllm_v2/MMLU-PRO": 0.3005 + } + }, + { + "id": "cognitivecomputations/dolphin-2.9-llama3-8b", + "name": "dolphin-2.9-llama3-8b", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.385, + "hfopenllm_v2/BBH": 0.495, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4375, + "hfopenllm_v2/MMLU-PRO": 0.2771 + } + }, + { + "id": "cognitivecomputations/dolphin-2.9.1-llama-3-70b", + "name": "dolphin-2.9.1-llama-3-70b", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.376, + "hfopenllm_v2/BBH": 0.5205, + "hfopenllm_v2/MATH Level 5": 0.182, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4976, + "hfopenllm_v2/MMLU-PRO": 0.413 + } + }, + { + "id": "cognitivecomputations/dolphin-2.9.1-yi-1.5-34b", + "name": "dolphin-2.9.1-yi-1.5-34b", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3853, + "hfopenllm_v2/BBH": 0.6076, + "hfopenllm_v2/MATH Level 5": 0.1866, + "hfopenllm_v2/GPQA": 0.3431, + "hfopenllm_v2/MUSR": 0.4598, + "hfopenllm_v2/MMLU-PRO": 0.4519 + } + }, + { + "id": "cognitivecomputations/dolphin-2.9.1-yi-1.5-9b", + "name": "dolphin-2.9.1-yi-1.5-9b", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4465, + "hfopenllm_v2/BBH": 0.5484, + "hfopenllm_v2/MATH Level 5": 0.1518, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4348, + "hfopenllm_v2/MMLU-PRO": 0.3967 + } + }, + { + "id": "cognitivecomputations/dolphin-2.9.2-Phi-3-Medium", + "name": "dolphin-2.9.2-Phi-3-Medium", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4248, + "hfopenllm_v2/BBH": 0.6457, + "hfopenllm_v2/MATH Level 5": 0.1828, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.4191, + "hfopenllm_v2/MMLU-PRO": 0.4555 + } + }, + { + "id": "cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated", + "name": "dolphin-2.9.2-Phi-3-Medium-abliterated", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4124, + "hfopenllm_v2/BBH": 0.6383, + "hfopenllm_v2/MATH Level 5": 0.182, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4349, + "hfopenllm_v2/MMLU-PRO": 0.4525 + } + }, + { + "id": "cognitivecomputations/dolphin-2.9.2-qwen2-72b", + "name": "dolphin-2.9.2-qwen2-72b", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6344, + "hfopenllm_v2/BBH": 0.6296, + "hfopenllm_v2/MATH Level 5": 0.2802, + "hfopenllm_v2/GPQA": 0.37, + "hfopenllm_v2/MUSR": 0.4521, + "hfopenllm_v2/MMLU-PRO": 0.5471 + } + }, + { + "id": "cognitivecomputations/dolphin-2.9.2-qwen2-7b", + "name": "dolphin-2.9.2-qwen2-7b", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3535, + "hfopenllm_v2/BBH": 0.4894, + "hfopenllm_v2/MATH Level 5": 0.1344, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4191, + "hfopenllm_v2/MMLU-PRO": 0.4051 + } + }, + { + "id": "cognitivecomputations/dolphin-2.9.3-Yi-1.5-34B-32k", + "name": "dolphin-2.9.3-Yi-1.5-34B-32k", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3639, + "hfopenllm_v2/BBH": 0.6047, + "hfopenllm_v2/MATH Level 5": 0.1669, + "hfopenllm_v2/GPQA": 0.3431, + "hfopenllm_v2/MUSR": 0.4311, + "hfopenllm_v2/MMLU-PRO": 0.463 + } + }, + { + "id": "cognitivecomputations/dolphin-2.9.3-mistral-7B-32k", + "name": "dolphin-2.9.3-mistral-7B-32k", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4126, + "hfopenllm_v2/BBH": 0.4813, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4643, + "hfopenllm_v2/MMLU-PRO": 0.2821 + } + }, + { + "id": "cognitivecomputations/dolphin-2.9.3-mistral-nemo-12b", + "name": "dolphin-2.9.3-mistral-nemo-12b", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5601, + "hfopenllm_v2/BBH": 0.548, + "hfopenllm_v2/MATH Level 5": 0.074, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.443, + "hfopenllm_v2/MMLU-PRO": 0.3377 + } + }, + { + "id": "cognitivecomputations/dolphin-2.9.4-gemma2-2b", + "name": "dolphin-2.9.4-gemma2-2b", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0896, + "hfopenllm_v2/BBH": 0.4081, + "hfopenllm_v2/MATH Level 5": 0.0491, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.418, + "hfopenllm_v2/MMLU-PRO": 0.2105 + } + }, + { + "id": "cognitivecomputations/dolphin-2.9.4-llama3.1-8b", + "name": "dolphin-2.9.4-llama3.1-8b", + "developer": "cognitivecomputations", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2757, + "hfopenllm_v2/BBH": 0.3524, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3236, + "hfopenllm_v2/MMLU-PRO": 0.1237 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/cohere.json b/data/developers/cohere.json new file mode 100644 index 0000000000000000000000000000000000000000..e15f766c124103dbd5ef18992684b9a461688bcf --- /dev/null +++ b/data/developers/cohere.json @@ -0,0 +1,402 @@ +{ + "developer": "cohere", + "models": [ + { + "id": "cohere/Cohere-Command-beta-52.4B", + "name": "Cohere Command beta 52.4B", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.874, + "helm_classic/MMLU": 0.452, + "helm_classic/BoolQ": 0.856, + "helm_classic/NarrativeQA": 0.752, + "helm_classic/NaturalQuestions (open-book)": 0.76, + "helm_classic/QuAC": 0.432, + "helm_classic/HellaSwag": 0.811, + "helm_classic/OpenbookQA": 0.582, + "helm_classic/TruthfulQA": 0.269, + "helm_classic/MS MARCO (TREC)": 0.762, + "helm_classic/CNN/DailyMail": 0.161, + "helm_classic/XSUM": 0.152, + "helm_classic/IMDB": 0.96, + "helm_classic/CivilComments": 0.601, + "helm_classic/RAFT": 0.667 + } + }, + { + "id": "cohere/Cohere-Command-beta-6.1B", + "name": "Cohere Command beta 6.1B", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.675, + "helm_classic/MMLU": 0.406, + "helm_classic/BoolQ": 0.798, + "helm_classic/NarrativeQA": 0.709, + "helm_classic/NaturalQuestions (open-book)": 0.717, + "helm_classic/QuAC": 0.375, + "helm_classic/HellaSwag": 0.752, + "helm_classic/OpenbookQA": 0.55, + "helm_classic/TruthfulQA": 0.203, + "helm_classic/MS MARCO (TREC)": 0.709, + "helm_classic/CNN/DailyMail": 0.153, + "helm_classic/XSUM": 0.122, + "helm_classic/IMDB": 0.961, + "helm_classic/CivilComments": 0.54, + "helm_classic/RAFT": 0.634 + } + }, + { + "id": "cohere/Cohere-large-v20220720-13.1B", + "name": "Cohere large v20220720 13.1B", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.372, + "helm_classic/MMLU": 0.324, + "helm_classic/BoolQ": 0.725, + "helm_classic/NarrativeQA": 0.625, + "helm_classic/NaturalQuestions (open-book)": 0.573, + "helm_classic/QuAC": 0.338, + "helm_classic/HellaSwag": 0.736, + "helm_classic/OpenbookQA": 0.542, + "helm_classic/TruthfulQA": 0.181, + "helm_classic/MS MARCO (TREC)": 0.33, + "helm_classic/CNN/DailyMail": 0.126, + "helm_classic/XSUM": 0.108, + "helm_classic/IMDB": 0.933, + "helm_classic/CivilComments": 0.507, + "helm_classic/RAFT": 0.596 + } + }, + { + "id": "cohere/Cohere-medium-v20220720-6.1B", + "name": "Cohere medium v20220720 6.1B", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.23, + "helm_classic/MMLU": 0.279, + "helm_classic/BoolQ": 0.659, + "helm_classic/NarrativeQA": 0.559, + "helm_classic/NaturalQuestions (open-book)": 0.504, + "helm_classic/QuAC": 0.279, + "helm_classic/HellaSwag": 0.706, + "helm_classic/OpenbookQA": 0.496, + "helm_classic/TruthfulQA": 0.19, + "helm_classic/MS MARCO (TREC)": 0.374, + "helm_classic/CNN/DailyMail": 0.077, + "helm_classic/XSUM": 0.087, + "helm_classic/IMDB": 0.935, + "helm_classic/CivilComments": 0.504, + "helm_classic/RAFT": 0.52 + } + }, + { + "id": "cohere/Cohere-medium-v20221108-6.1B", + "name": "Cohere medium v20221108 6.1B", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.312, + "helm_classic/MMLU": 0.254, + "helm_classic/BoolQ": 0.7, + "helm_classic/NarrativeQA": 0.61, + "helm_classic/NaturalQuestions (open-book)": 0.517, + "helm_classic/QuAC": 0.314, + "helm_classic/HellaSwag": 0.726, + "helm_classic/OpenbookQA": 0.538, + "helm_classic/TruthfulQA": 0.215, + "helm_classic/MS MARCO (TREC)": 0.373, + "helm_classic/CNN/DailyMail": 0.121, + "helm_classic/XSUM": 0.099, + "helm_classic/IMDB": 0.935, + "helm_classic/CivilComments": 0.5, + "helm_classic/RAFT": 0.591 + } + }, + { + "id": "cohere/Cohere-small-v20220720-410M", + "name": "Cohere small v20220720 410M", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.109, + "helm_classic/MMLU": 0.264, + "helm_classic/BoolQ": 0.457, + "helm_classic/NarrativeQA": 0.294, + "helm_classic/NaturalQuestions (open-book)": 0.309, + "helm_classic/QuAC": 0.219, + "helm_classic/HellaSwag": 0.483, + "helm_classic/OpenbookQA": 0.348, + "helm_classic/TruthfulQA": 0.217, + "helm_classic/MS MARCO (TREC)": 0.304, + "helm_classic/CNN/DailyMail": 0.063, + "helm_classic/XSUM": 0.033, + "helm_classic/IMDB": 0.578, + "helm_classic/CivilComments": 0.501, + "helm_classic/RAFT": 0.492 + } + }, + { + "id": "cohere/Cohere-xlarge-v20220609-52.4B", + "name": "Cohere xlarge v20220609 52.4B", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.56, + "helm_classic/MMLU": 0.353, + "helm_classic/BoolQ": 0.718, + "helm_classic/NarrativeQA": 0.65, + "helm_classic/NaturalQuestions (open-book)": 0.595, + "helm_classic/QuAC": 0.361, + "helm_classic/HellaSwag": 0.811, + "helm_classic/OpenbookQA": 0.55, + "helm_classic/TruthfulQA": 0.198, + "helm_classic/MS MARCO (TREC)": 0.459, + "helm_classic/CNN/DailyMail": 0.144, + "helm_classic/XSUM": 0.129, + "helm_classic/IMDB": 0.956, + "helm_classic/CivilComments": 0.532, + "helm_classic/RAFT": 0.633 + } + }, + { + "id": "cohere/Cohere-xlarge-v20221108-52.4B", + "name": "Cohere xlarge v20221108 52.4B", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.664, + "helm_classic/MMLU": 0.382, + "helm_classic/BoolQ": 0.762, + "helm_classic/NarrativeQA": 0.672, + "helm_classic/NaturalQuestions (open-book)": 0.628, + "helm_classic/QuAC": 0.374, + "helm_classic/HellaSwag": 0.81, + "helm_classic/OpenbookQA": 0.588, + "helm_classic/TruthfulQA": 0.169, + "helm_classic/MS MARCO (TREC)": 0.55, + "helm_classic/CNN/DailyMail": 0.153, + "helm_classic/XSUM": 0.153, + "helm_classic/IMDB": 0.956, + "helm_classic/CivilComments": 0.524, + "helm_classic/RAFT": 0.624 + } + }, + { + "id": "cohere/aya-expanse-32b", + "name": "aya-expanse-32b", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.7353, + "global-mmlu-lite/Culturally Sensitive": 0.6891, + "global-mmlu-lite/Culturally Agnostic": 0.7815, + "global-mmlu-lite/Arabic": 0.7425, + "global-mmlu-lite/English": 0.7544, + "global-mmlu-lite/Bengali": 0.7343, + "global-mmlu-lite/German": 0.7425, + "global-mmlu-lite/French": 0.7325, + "global-mmlu-lite/Hindi": 0.7375, + "global-mmlu-lite/Indonesian": 0.7594, + "global-mmlu-lite/Italian": 0.7305, + "global-mmlu-lite/Japanese": 0.7419, + "global-mmlu-lite/Korean": 0.7525, + "global-mmlu-lite/Portuguese": 0.7544, + "global-mmlu-lite/Spanish": 0.7362, + "global-mmlu-lite/Swahili": 0.7071, + "global-mmlu-lite/Yoruba": 0.6942, + "global-mmlu-lite/Chinese": 0.743, + "global-mmlu-lite/Burmese": 0.7025 + } + }, + { + "id": "cohere/command", + "name": "Command", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.327, + "helm_lite/NarrativeQA": 0.749, + "helm_lite/NaturalQuestions (closed-book)": 0.391, + "helm_lite/OpenbookQA": 0.774, + "helm_lite/MMLU": 0.525, + "helm_lite/MATH": 0.236, + "helm_lite/GSM8K": 0.452, + "helm_lite/LegalBench": 0.578, + "helm_lite/MedQA": 0.445, + "helm_lite/WMT 2014": 0.088 + } + }, + { + "id": "cohere/command-a-03-2025", + "name": "command-a-03-2025", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.8385, + "global-mmlu-lite/Culturally Sensitive": 0.7993, + "global-mmlu-lite/Culturally Agnostic": 0.8778, + "global-mmlu-lite/Arabic": 0.8425, + "global-mmlu-lite/English": 0.855, + "global-mmlu-lite/Bengali": 0.8225, + "global-mmlu-lite/German": 0.8425, + "global-mmlu-lite/French": 0.8375, + "global-mmlu-lite/Hindi": 0.8421, + "global-mmlu-lite/Indonesian": 0.8546, + "global-mmlu-lite/Italian": 0.8375, + "global-mmlu-lite/Japanese": 0.845, + "global-mmlu-lite/Korean": 0.85, + "global-mmlu-lite/Portuguese": 0.84, + "global-mmlu-lite/Spanish": 0.8525, + "global-mmlu-lite/Swahili": 0.8275, + "global-mmlu-lite/Yoruba": 0.815, + "global-mmlu-lite/Chinese": 0.835, + "global-mmlu-lite/Burmese": 0.8175 + } + }, + { + "id": "cohere/command-light", + "name": "Command Light", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.105, + "helm_lite/NarrativeQA": 0.629, + "helm_lite/NaturalQuestions (closed-book)": 0.195, + "helm_lite/OpenbookQA": 0.398, + "helm_lite/MMLU": 0.386, + "helm_lite/MATH": 0.098, + "helm_lite/GSM8K": 0.149, + "helm_lite/LegalBench": 0.397, + "helm_lite/MedQA": 0.312, + "helm_lite/WMT 2014": 0.023 + } + }, + { + "id": "cohere/command-r", + "name": "Command R", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.299, + "helm_lite/NarrativeQA": 0.742, + "helm_lite/NaturalQuestions (closed-book)": 0.352, + "helm_lite/OpenbookQA": 0.782, + "helm_lite/MMLU": 0.567, + "helm_lite/MATH": 0.266, + "helm_lite/GSM8K": 0.551, + "helm_lite/LegalBench": 0.507, + "helm_lite/MedQA": 0.555, + "helm_lite/WMT 2014": 0.149, + "helm_mmlu/MMLU All Subjects": 0.652, + "helm_mmlu/Abstract Algebra": 0.33, + "helm_mmlu/Anatomy": 0.615, + "helm_mmlu/College Physics": 0.382, + "helm_mmlu/Computer Security": 0.78, + "helm_mmlu/Econometrics": 0.456, + "helm_mmlu/Global Facts": 0.42, + "helm_mmlu/Jurisprudence": 0.796, + "helm_mmlu/Philosophy": 0.685, + "helm_mmlu/Professional Psychology": 0.681, + "helm_mmlu/Us Foreign Policy": 0.82, + "helm_mmlu/Astronomy": 0.743, + "helm_mmlu/Business Ethics": 0.63, + "helm_mmlu/Clinical Knowledge": 0.751, + "helm_mmlu/Conceptual Physics": 0.528, + "helm_mmlu/Electrical Engineering": 0.593, + "helm_mmlu/Elementary Mathematics": 0.437, + "helm_mmlu/Formal Logic": 0.405, + "helm_mmlu/High School World History": 0.84, + "helm_mmlu/Human Sexuality": 0.763, + "helm_mmlu/International Law": 0.802, + "helm_mmlu/Logical Fallacies": 0.798, + "helm_mmlu/Machine Learning": 0.446, + "helm_mmlu/Management": 0.796, + "helm_mmlu/Marketing": 0.872, + "helm_mmlu/Medical Genetics": 0.81, + "helm_mmlu/Miscellaneous": 0.848, + "helm_mmlu/Moral Scenarios": 0.451, + "helm_mmlu/Nutrition": 0.703, + "helm_mmlu/Prehistory": 0.728, + "helm_mmlu/Public Relations": 0.7, + "helm_mmlu/Security Studies": 0.714, + "helm_mmlu/Sociology": 0.866, + "helm_mmlu/Virology": 0.542, + "helm_mmlu/World Religions": 0.813, + "helm_mmlu/Mean win rate": 0.959 + } + }, + { + "id": "cohere/command-r-plus", + "name": "Command R Plus", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.441, + "helm_lite/NarrativeQA": 0.735, + "helm_lite/NaturalQuestions (closed-book)": 0.343, + "helm_lite/OpenbookQA": 0.828, + "helm_lite/MMLU": 0.59, + "helm_lite/MATH": 0.403, + "helm_lite/GSM8K": 0.738, + "helm_lite/LegalBench": 0.672, + "helm_lite/MedQA": 0.567, + "helm_lite/WMT 2014": 0.203, + "helm_mmlu/MMLU All Subjects": 0.694, + "helm_mmlu/Abstract Algebra": 0.21, + "helm_mmlu/Anatomy": 0.644, + "helm_mmlu/College Physics": 0.52, + "helm_mmlu/Computer Security": 0.74, + "helm_mmlu/Econometrics": 0.561, + "helm_mmlu/Global Facts": 0.5, + "helm_mmlu/Jurisprudence": 0.806, + "helm_mmlu/Philosophy": 0.695, + "helm_mmlu/Professional Psychology": 0.735, + "helm_mmlu/Us Foreign Policy": 0.89, + "helm_mmlu/Astronomy": 0.783, + "helm_mmlu/Business Ethics": 0.77, + "helm_mmlu/Clinical Knowledge": 0.743, + "helm_mmlu/Conceptual Physics": 0.591, + "helm_mmlu/Electrical Engineering": 0.71, + "helm_mmlu/Elementary Mathematics": 0.474, + "helm_mmlu/Formal Logic": 0.484, + "helm_mmlu/High School World History": 0.827, + "helm_mmlu/Human Sexuality": 0.786, + "helm_mmlu/International Law": 0.835, + "helm_mmlu/Logical Fallacies": 0.791, + "helm_mmlu/Machine Learning": 0.518, + "helm_mmlu/Management": 0.835, + "helm_mmlu/Marketing": 0.927, + "helm_mmlu/Medical Genetics": 0.77, + "helm_mmlu/Miscellaneous": 0.844, + "helm_mmlu/Moral Scenarios": 0.585, + "helm_mmlu/Nutrition": 0.742, + "helm_mmlu/Prehistory": 0.821, + "helm_mmlu/Public Relations": 0.709, + "helm_mmlu/Security Studies": 0.751, + "helm_mmlu/Sociology": 0.876, + "helm_mmlu/Virology": 0.56, + "helm_mmlu/World Religions": 0.842, + "helm_mmlu/Mean win rate": 0.825 + } + }, + { + "id": "cohere/command-xlarge-beta", + "name": "Cohere Command beta 52.4B", + "developer": "cohere", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_instruct/Mean win rate": 0.089, + "helm_instruct/Anthropic RLHF dataset": 4.214, + "helm_instruct/Best ChatGPT Prompts": 4.988, + "helm_instruct/Koala test dataset": 4.969, + "helm_instruct/Open Assistant": 4.967, + "helm_instruct/Self Instruct": 4.971, + "helm_instruct/Vicuna": 4.995 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/collaiborateorg.json b/data/developers/collaiborateorg.json new file mode 100644 index 0000000000000000000000000000000000000000..8d07059b317dd152a933e0ac5d8e5dfa6bb61c35 --- /dev/null +++ b/data/developers/collaiborateorg.json @@ -0,0 +1,19 @@ +{ + "developer": "collaiborateorg", + "models": [ + { + "id": "collaiborateorg/Collaiborator-MEDLLM-Llama-3-8B-v2", + "name": "Collaiborator-MEDLLM-Llama-3-8B-v2", + "developer": "collaiborateorg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3809, + "hfopenllm_v2/BBH": 0.4648, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.3331, + "hfopenllm_v2/MUSR": 0.3434, + "hfopenllm_v2/MMLU-PRO": 0.3481 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/cpayne1303.json b/data/developers/cpayne1303.json new file mode 100644 index 0000000000000000000000000000000000000000..6d735bd94a67b9fc86d407e5a74d4ec119a21a01 --- /dev/null +++ b/data/developers/cpayne1303.json @@ -0,0 +1,61 @@ +{ + "developer": "cpayne1303", + "models": [ + { + "id": "cpayne1303/cp2024", + "name": "cp2024", + "developer": "cpayne1303", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1658, + "hfopenllm_v2/BBH": 0.2985, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3383, + "hfopenllm_v2/MMLU-PRO": 0.1101 + } + }, + { + "id": "cpayne1303/cp2024-instruct", + "name": "cp2024-instruct", + "developer": "cpayne1303", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1706, + "hfopenllm_v2/BBH": 0.2947, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3686, + "hfopenllm_v2/MMLU-PRO": 0.1167 + } + }, + { + "id": "cpayne1303/llama-43m-beta", + "name": "llama-43m-beta", + "developer": "cpayne1303", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1916, + "hfopenllm_v2/BBH": 0.2977, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3872, + "hfopenllm_v2/MMLU-PRO": 0.1132 + } + }, + { + "id": "cpayne1303/smallcp2024", + "name": "smallcp2024", + "developer": "cpayne1303", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1582, + "hfopenllm_v2/BBH": 0.3027, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2307, + "hfopenllm_v2/MUSR": 0.3425, + "hfopenllm_v2/MMLU-PRO": 0.1114 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/crestf411.json b/data/developers/crestf411.json new file mode 100644 index 0000000000000000000000000000000000000000..79a1922450759ce563b5462cc77575548591305f --- /dev/null +++ b/data/developers/crestf411.json @@ -0,0 +1,19 @@ +{ + "developer": "crestf411", + "models": [ + { + "id": "crestf411/MN-Slush", + "name": "MN-Slush", + "developer": "crestf411", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4077, + "hfopenllm_v2/BBH": 0.534, + "hfopenllm_v2/MATH Level 5": 0.1269, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.3933, + "hfopenllm_v2/MMLU-PRO": 0.3508 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/cstr.json b/data/developers/cstr.json new file mode 100644 index 0000000000000000000000000000000000000000..0276c7cc7f9c8121a4528a183bb6747c330f08a7 --- /dev/null +++ b/data/developers/cstr.json @@ -0,0 +1,19 @@ +{ + "developer": "cstr", + "models": [ + { + "id": "cstr/llama3.1-8b-spaetzle-v90", + "name": "llama3.1-8b-spaetzle-v90", + "developer": "cstr", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7356, + "hfopenllm_v2/BBH": 0.5303, + "hfopenllm_v2/MATH Level 5": 0.1495, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4134, + "hfopenllm_v2/MMLU-PRO": 0.3731 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/cyberagent.json b/data/developers/cyberagent.json new file mode 100644 index 0000000000000000000000000000000000000000..671c959996d724a7093cd5abbf2aaff36891acab --- /dev/null +++ b/data/developers/cyberagent.json @@ -0,0 +1,19 @@ +{ + "developer": "cyberagent", + "models": [ + { + "id": "cyberagent/calm3-22b-chat", + "name": "calm3-22b-chat", + "developer": "cyberagent", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5091, + "hfopenllm_v2/BBH": 0.4992, + "hfopenllm_v2/MATH Level 5": 0.0695, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.4553, + "hfopenllm_v2/MMLU-PRO": 0.295 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/darkc0de.json b/data/developers/darkc0de.json new file mode 100644 index 0000000000000000000000000000000000000000..7dd0da82af4337f656650e431893af9909dcab80 --- /dev/null +++ b/data/developers/darkc0de.json @@ -0,0 +1,47 @@ +{ + "developer": "darkc0de", + "models": [ + { + "id": "darkc0de/BuddyGlassNeverSleeps", + "name": "BuddyGlassNeverSleeps", + "developer": "darkc0de", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4239, + "hfopenllm_v2/BBH": 0.4977, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3993, + "hfopenllm_v2/MMLU-PRO": 0.3452 + } + }, + { + "id": "darkc0de/BuddyGlassUncensored2025.2", + "name": "BuddyGlassUncensored2025.2", + "developer": "darkc0de", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7731, + "hfopenllm_v2/BBH": 0.6095, + "hfopenllm_v2/MATH Level 5": 0.2402, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4071, + "hfopenllm_v2/MMLU-PRO": 0.4336 + } + }, + { + "id": "darkc0de/BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp", + "name": "BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp", + "developer": "darkc0de", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4358, + "hfopenllm_v2/BBH": 0.5243, + "hfopenllm_v2/MATH Level 5": 0.1284, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4143, + "hfopenllm_v2/MMLU-PRO": 0.3673 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/databricks.json b/data/developers/databricks.json new file mode 100644 index 0000000000000000000000000000000000000000..baed75a2fae8ea193778c5b2817219afa46d4f73 --- /dev/null +++ b/data/developers/databricks.json @@ -0,0 +1,135 @@ +{ + "developer": "databricks", + "models": [ + { + "id": "databricks/dbrx-base", + "name": "dbrx-base", + "developer": "databricks", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0821, + "hfopenllm_v2/BBH": 0.5196, + "hfopenllm_v2/MATH Level 5": 0.1, + "hfopenllm_v2/GPQA": 0.3267, + "hfopenllm_v2/MUSR": 0.4067, + "hfopenllm_v2/MMLU-PRO": 0.35 + } + }, + { + "id": "databricks/dbrx-instruct", + "name": "DBRX Instruct", + "developer": "databricks", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.289, + "helm_lite/NarrativeQA": 0.488, + "helm_lite/NaturalQuestions (closed-book)": 0.284, + "helm_lite/OpenbookQA": 0.91, + "helm_lite/MMLU": 0.643, + "helm_lite/MATH": 0.358, + "helm_lite/GSM8K": 0.671, + "helm_lite/LegalBench": 0.426, + "helm_lite/MedQA": 0.694, + "helm_lite/WMT 2014": 0.131, + "helm_mmlu/MMLU All Subjects": 0.741, + "helm_mmlu/Abstract Algebra": 0.34, + "helm_mmlu/Anatomy": 0.667, + "helm_mmlu/College Physics": 0.539, + "helm_mmlu/Computer Security": 0.83, + "helm_mmlu/Econometrics": 0.605, + "helm_mmlu/Global Facts": 0.46, + "helm_mmlu/Jurisprudence": 0.843, + "helm_mmlu/Philosophy": 0.804, + "helm_mmlu/Professional Psychology": 0.801, + "helm_mmlu/Us Foreign Policy": 0.93, + "helm_mmlu/Astronomy": 0.836, + "helm_mmlu/Business Ethics": 0.78, + "helm_mmlu/Clinical Knowledge": 0.789, + "helm_mmlu/Conceptual Physics": 0.74, + "helm_mmlu/Electrical Engineering": 0.71, + "helm_mmlu/Elementary Mathematics": 0.563, + "helm_mmlu/Formal Logic": 0.563, + "helm_mmlu/High School World History": 0.903, + "helm_mmlu/Human Sexuality": 0.878, + "helm_mmlu/International Law": 0.884, + "helm_mmlu/Logical Fallacies": 0.847, + "helm_mmlu/Machine Learning": 0.625, + "helm_mmlu/Management": 0.854, + "helm_mmlu/Marketing": 0.94, + "helm_mmlu/Medical Genetics": 0.85, + "helm_mmlu/Miscellaneous": 0.911, + "helm_mmlu/Moral Scenarios": 0.465, + "helm_mmlu/Nutrition": 0.814, + "helm_mmlu/Prehistory": 0.84, + "helm_mmlu/Public Relations": 0.691, + "helm_mmlu/Security Studies": 0.804, + "helm_mmlu/Sociology": 0.896, + "helm_mmlu/Virology": 0.566, + "helm_mmlu/World Religions": 0.871, + "helm_mmlu/Mean win rate": 0.537, + "hfopenllm_v2/IFEval": 0.5416, + "hfopenllm_v2/BBH": 0.5429, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.4269, + "hfopenllm_v2/MMLU-PRO": 0.3683 + } + }, + { + "id": "databricks/dolly-v1-6b", + "name": "dolly-v1-6b", + "developer": "databricks", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2224, + "hfopenllm_v2/BBH": 0.3172, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.4004, + "hfopenllm_v2/MMLU-PRO": 0.1266 + } + }, + { + "id": "databricks/dolly-v2-12b", + "name": "dolly-v2-12b", + "developer": "databricks", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2355, + "hfopenllm_v2/BBH": 0.332, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2408, + "hfopenllm_v2/MUSR": 0.3739, + "hfopenllm_v2/MMLU-PRO": 0.1129 + } + }, + { + "id": "databricks/dolly-v2-3b", + "name": "dolly-v2-3b", + "developer": "databricks", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2247, + "hfopenllm_v2/BBH": 0.3079, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3338, + "hfopenllm_v2/MMLU-PRO": 0.1145 + } + }, + { + "id": "databricks/dolly-v2-7b", + "name": "dolly-v2-7b", + "developer": "databricks", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.201, + "hfopenllm_v2/BBH": 0.3173, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3553, + "hfopenllm_v2/MMLU-PRO": 0.1149 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/davidkim205.json b/data/developers/davidkim205.json new file mode 100644 index 0000000000000000000000000000000000000000..199e5bb163ace1df2e604e2c6cb8efc790198f50 --- /dev/null +++ b/data/developers/davidkim205.json @@ -0,0 +1,33 @@ +{ + "developer": "davidkim205", + "models": [ + { + "id": "davidkim205/Rhea-72b-v0.5", + "name": "Rhea-72b-v0.5", + "developer": "davidkim205", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0145, + "hfopenllm_v2/BBH": 0.3078, + "hfopenllm_v2/MATH Level 5": 0.1737, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.4241, + "hfopenllm_v2/MMLU-PRO": 0.1166 + } + }, + { + "id": "davidkim205/nox-solar-10.7b-v4", + "name": "nox-solar-10.7b-v4", + "developer": "davidkim205", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3753, + "hfopenllm_v2/BBH": 0.4814, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4298, + "hfopenllm_v2/MMLU-PRO": 0.3333 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/deepseek-ai.json b/data/developers/deepseek-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..af3b17d36c12abff247e266120f68eba05a1a99f --- /dev/null +++ b/data/developers/deepseek-ai.json @@ -0,0 +1,279 @@ +{ + "developer": "deepseek-ai", + "models": [ + { + "id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", + "name": "DeepSeek-R1-Distill-Llama-70B", + "developer": "deepseek-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4336, + "hfopenllm_v2/BBH": 0.5635, + "hfopenllm_v2/MATH Level 5": 0.3074, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.4342, + "hfopenllm_v2/MMLU-PRO": 0.4748 + } + }, + { + "id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "name": "DeepSeek-R1-Distill-Llama-8B", + "developer": "deepseek-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3782, + "hfopenllm_v2/BBH": 0.3239, + "hfopenllm_v2/MATH Level 5": 0.2198, + "hfopenllm_v2/GPQA": 0.255, + "hfopenllm_v2/MUSR": 0.325, + "hfopenllm_v2/MMLU-PRO": 0.2089 + } + }, + { + "id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "name": "DeepSeek-R1-Distill-Qwen-1.5B", + "developer": "deepseek-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3463, + "hfopenllm_v2/BBH": 0.3241, + "hfopenllm_v2/MATH Level 5": 0.1692, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3635, + "hfopenllm_v2/MMLU-PRO": 0.1187 + } + }, + { + "id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", + "name": "DeepSeek-R1-Distill-Qwen-14B", + "developer": "deepseek-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4382, + "hfopenllm_v2/BBH": 0.5906, + "hfopenllm_v2/MATH Level 5": 0.5702, + "hfopenllm_v2/GPQA": 0.3876, + "hfopenllm_v2/MUSR": 0.5366, + "hfopenllm_v2/MMLU-PRO": 0.4667 + } + }, + { + "id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "name": "DeepSeek-R1-Distill-Qwen-32B", + "developer": "deepseek-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4186, + "hfopenllm_v2/BBH": 0.4197, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4526, + "hfopenllm_v2/MMLU-PRO": 0.4687 + } + }, + { + "id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", + "name": "DeepSeek-R1-Distill-Qwen-7B", + "developer": "deepseek-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4038, + "hfopenllm_v2/BBH": 0.3443, + "hfopenllm_v2/MATH Level 5": 0.1956, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3663, + "hfopenllm_v2/MMLU-PRO": 0.2321 + } + }, + { + "id": "deepseek-ai/deepseek-llm-67b-chat", + "name": "DeepSeek LLM Chat 67B", + "developer": "deepseek-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.488, + "helm_lite/NarrativeQA": 0.581, + "helm_lite/NaturalQuestions (closed-book)": 0.412, + "helm_lite/OpenbookQA": 0.88, + "helm_lite/MMLU": 0.641, + "helm_lite/MATH": 0.615, + "helm_lite/GSM8K": 0.795, + "helm_lite/LegalBench": 0.637, + "helm_lite/MedQA": 0.628, + "helm_lite/WMT 2014": 0.186, + "helm_mmlu/MMLU All Subjects": 0.725, + "helm_mmlu/Abstract Algebra": 0.44, + "helm_mmlu/Anatomy": 0.667, + "helm_mmlu/College Physics": 0.363, + "helm_mmlu/Computer Security": 0.79, + "helm_mmlu/Econometrics": 0.553, + "helm_mmlu/Global Facts": 0.46, + "helm_mmlu/Jurisprudence": 0.852, + "helm_mmlu/Philosophy": 0.801, + "helm_mmlu/Professional Psychology": 0.809, + "helm_mmlu/Us Foreign Policy": 0.91, + "helm_mmlu/Astronomy": 0.822, + "helm_mmlu/Business Ethics": 0.86, + "helm_mmlu/Clinical Knowledge": 0.785, + "helm_mmlu/Conceptual Physics": 0.723, + "helm_mmlu/Electrical Engineering": 0.669, + "helm_mmlu/Elementary Mathematics": 0.548, + "helm_mmlu/Formal Logic": 0.548, + "helm_mmlu/High School World History": 0.911, + "helm_mmlu/Human Sexuality": 0.84, + "helm_mmlu/International Law": 0.851, + "helm_mmlu/Logical Fallacies": 0.847, + "helm_mmlu/Machine Learning": 0.562, + "helm_mmlu/Management": 0.903, + "helm_mmlu/Marketing": 0.923, + "helm_mmlu/Medical Genetics": 0.73, + "helm_mmlu/Miscellaneous": 0.904, + "helm_mmlu/Moral Scenarios": 0.544, + "helm_mmlu/Nutrition": 0.781, + "helm_mmlu/Prehistory": 0.858, + "helm_mmlu/Public Relations": 0.7, + "helm_mmlu/Security Studies": 0.796, + "helm_mmlu/Sociology": 0.876, + "helm_mmlu/Virology": 0.554, + "helm_mmlu/World Religions": 0.865, + "helm_mmlu/Mean win rate": 0.387, + "hfopenllm_v2/IFEval": 0.5587, + "hfopenllm_v2/BBH": 0.5243, + "hfopenllm_v2/MATH Level 5": 0.0929, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.5059, + "hfopenllm_v2/MMLU-PRO": 0.3944 + } + }, + { + "id": "deepseek-ai/deepseek-llm-7b-base", + "name": "deepseek-llm-7b-base", + "developer": "deepseek-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2179, + "hfopenllm_v2/BBH": 0.3503, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3738, + "hfopenllm_v2/MMLU-PRO": 0.1806 + } + }, + { + "id": "deepseek-ai/deepseek-llm-7b-chat", + "name": "deepseek-llm-7b-chat", + "developer": "deepseek-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4171, + "hfopenllm_v2/BBH": 0.3632, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.4668, + "hfopenllm_v2/MMLU-PRO": 0.2133 + } + }, + { + "id": "deepseek-ai/deepseek-moe-16b-base", + "name": "deepseek-moe-16b-base", + "developer": "deepseek-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.245, + "hfopenllm_v2/BBH": 0.3409, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3658, + "hfopenllm_v2/MMLU-PRO": 0.1505 + } + }, + { + "id": "deepseek-ai/deepseek-moe-16b-chat", + "name": "deepseek-moe-16b-chat", + "developer": "deepseek-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3663, + "hfopenllm_v2/BBH": 0.3275, + "hfopenllm_v2/MATH Level 5": 0.0257, + "hfopenllm_v2/GPQA": 0.2248, + "hfopenllm_v2/MUSR": 0.3808, + "hfopenllm_v2/MMLU-PRO": 0.1964 + } + }, + { + "id": "deepseek-ai/deepseek-r1-0528", + "name": "DeepSeek-R1-0528", + "developer": "deepseek-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.699, + "helm_capabilities/MMLU-Pro": 0.793, + "helm_capabilities/GPQA": 0.666, + "helm_capabilities/IFEval": 0.784, + "helm_capabilities/WildBench": 0.828, + "helm_capabilities/Omni-MATH": 0.424 + } + }, + { + "id": "deepseek-ai/deepseek-v3", + "name": "DeepSeek v3", + "developer": "deepseek-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.665, + "helm_capabilities/MMLU-Pro": 0.723, + "helm_capabilities/GPQA": 0.538, + "helm_capabilities/IFEval": 0.832, + "helm_capabilities/WildBench": 0.831, + "helm_capabilities/Omni-MATH": 0.403, + "helm_lite/Mean win rate": 0.908, + "helm_lite/NarrativeQA": 0.796, + "helm_lite/NaturalQuestions (closed-book)": 0.467, + "helm_lite/OpenbookQA": 0.954, + "helm_lite/MMLU": 0.803, + "helm_lite/MATH": 0.912, + "helm_lite/GSM8K": 0.94, + "helm_lite/LegalBench": 0.718, + "helm_lite/MedQA": 0.809, + "helm_lite/WMT 2014": 0.209, + "helm_mmlu/MMLU All Subjects": 0.872, + "helm_mmlu/Abstract Algebra": 0.84, + "helm_mmlu/Anatomy": 0.867, + "helm_mmlu/College Physics": 0.814, + "helm_mmlu/Computer Security": 0.86, + "helm_mmlu/Econometrics": 0.746, + "helm_mmlu/Global Facts": 0.68, + "helm_mmlu/Jurisprudence": 0.898, + "helm_mmlu/Philosophy": 0.9, + "helm_mmlu/Professional Psychology": 0.887, + "helm_mmlu/Us Foreign Policy": 0.92, + "helm_mmlu/Astronomy": 0.921, + "helm_mmlu/Business Ethics": 0.89, + "helm_mmlu/Clinical Knowledge": 0.913, + "helm_mmlu/Conceptual Physics": 0.94, + "helm_mmlu/Electrical Engineering": 0.869, + "helm_mmlu/Elementary Mathematics": 0.942, + "helm_mmlu/Formal Logic": 0.77, + "helm_mmlu/High School World History": 0.928, + "helm_mmlu/Human Sexuality": 0.924, + "helm_mmlu/International Law": 0.95, + "helm_mmlu/Logical Fallacies": 0.914, + "helm_mmlu/Machine Learning": 0.786, + "helm_mmlu/Management": 0.903, + "helm_mmlu/Marketing": 0.949, + "helm_mmlu/Medical Genetics": 0.96, + "helm_mmlu/Miscellaneous": 0.949, + "helm_mmlu/Moral Scenarios": 0.808, + "helm_mmlu/Nutrition": 0.918, + "helm_mmlu/Prehistory": 0.923, + "helm_mmlu/Public Relations": 0.809, + "helm_mmlu/Security Studies": 0.837, + "helm_mmlu/Sociology": 0.955, + "helm_mmlu/Virology": 0.596, + "helm_mmlu/World Religions": 0.912, + "helm_mmlu/Mean win rate": 0.215 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/deepseek.json b/data/developers/deepseek.json new file mode 100644 index 0000000000000000000000000000000000000000..1c17d618529a54c25c88a53026be3b63a072fc2c --- /dev/null +++ b/data/developers/deepseek.json @@ -0,0 +1,59 @@ +{ + "developer": "deepseek", + "models": [ + { + "id": "deepseek/deepseek-r1-0528", + "name": "deepseek-r1-0528", + "developer": "deepseek", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.6744, + "global-mmlu-lite/Culturally Sensitive": 0.6672, + "global-mmlu-lite/Culturally Agnostic": 0.6816, + "global-mmlu-lite/Arabic": 0.6825, + "global-mmlu-lite/English": 0.715, + "global-mmlu-lite/Bengali": 0.655, + "global-mmlu-lite/German": 0.6375, + "global-mmlu-lite/French": 0.6925, + "global-mmlu-lite/Hindi": 0.6475, + "global-mmlu-lite/Indonesian": 0.655, + "global-mmlu-lite/Italian": 0.6775, + "global-mmlu-lite/Japanese": 0.7725, + "global-mmlu-lite/Korean": 0.6575, + "global-mmlu-lite/Portuguese": 0.635, + "global-mmlu-lite/Spanish": 0.7175, + "global-mmlu-lite/Swahili": 0.6775, + "global-mmlu-lite/Yoruba": 0.77, + "global-mmlu-lite/Chinese": 0.5075, + "global-mmlu-lite/Burmese": 0.69 + } + }, + { + "id": "deepseek/deepseek-v3.1", + "name": "deepseek-v3.1", + "developer": "deepseek", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.8044, + "global-mmlu-lite/Culturally Sensitive": 0.7793, + "global-mmlu-lite/Culturally Agnostic": 0.8295, + "global-mmlu-lite/Arabic": 0.805, + "global-mmlu-lite/English": 0.825, + "global-mmlu-lite/Bengali": 0.8157, + "global-mmlu-lite/German": 0.7925, + "global-mmlu-lite/French": 0.8175, + "global-mmlu-lite/Hindi": 0.7569, + "global-mmlu-lite/Indonesian": 0.7764, + "global-mmlu-lite/Italian": 0.8075, + "global-mmlu-lite/Japanese": 0.8312, + "global-mmlu-lite/Korean": 0.8125, + "global-mmlu-lite/Portuguese": 0.8246, + "global-mmlu-lite/Spanish": 0.8125, + "global-mmlu-lite/Swahili": 0.801, + "global-mmlu-lite/Yoruba": 0.7831, + "global-mmlu-lite/Chinese": 0.8161, + "global-mmlu-lite/Burmese": 0.7925 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/dfurman.json b/data/developers/dfurman.json new file mode 100644 index 0000000000000000000000000000000000000000..2947dc3ef503295f24886c787e729305dcebb026 --- /dev/null +++ b/data/developers/dfurman.json @@ -0,0 +1,61 @@ +{ + "developer": "dfurman", + "models": [ + { + "id": "dfurman/CalmeRys-78B-Orpo-v0.1", + "name": "CalmeRys-78B-Orpo-v0.1", + "developer": "dfurman", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8163, + "hfopenllm_v2/BBH": 0.7262, + "hfopenllm_v2/MATH Level 5": 0.4063, + "hfopenllm_v2/GPQA": 0.4002, + "hfopenllm_v2/MUSR": 0.5902, + "hfopenllm_v2/MMLU-PRO": 0.7012 + } + }, + { + "id": "dfurman/Llama-3-70B-Orpo-v0.1", + "name": "Llama-3-70B-Orpo-v0.1", + "developer": "dfurman", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2049, + "hfopenllm_v2/BBH": 0.4655, + "hfopenllm_v2/MATH Level 5": 0.1579, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.4534, + "hfopenllm_v2/MMLU-PRO": 0.3893 + } + }, + { + "id": "dfurman/Llama-3-8B-Orpo-v0.1", + "name": "Llama-3-8B-Orpo-v0.1", + "developer": "dfurman", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2835, + "hfopenllm_v2/BBH": 0.3842, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3566, + "hfopenllm_v2/MMLU-PRO": 0.2298 + } + }, + { + "id": "dfurman/Qwen2-72B-Orpo-v0.1", + "name": "Qwen2-72B-Orpo-v0.1", + "developer": "dfurman", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.788, + "hfopenllm_v2/BBH": 0.6969, + "hfopenllm_v2/MATH Level 5": 0.4056, + "hfopenllm_v2/GPQA": 0.3842, + "hfopenllm_v2/MUSR": 0.4784, + "hfopenllm_v2/MMLU-PRO": 0.5455 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/dicta-il.json b/data/developers/dicta-il.json new file mode 100644 index 0000000000000000000000000000000000000000..96d95a28710fe141cefa709d98c71f0591b446ff --- /dev/null +++ b/data/developers/dicta-il.json @@ -0,0 +1,33 @@ +{ + "developer": "dicta-il", + "models": [ + { + "id": "dicta-il/dictalm2.0", + "name": "dictalm2.0", + "developer": "dicta-il", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2413, + "hfopenllm_v2/BBH": 0.4018, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.382, + "hfopenllm_v2/MMLU-PRO": 0.2605 + } + }, + { + "id": "dicta-il/dictalm2.0-instruct", + "name": "dictalm2.0-instruct", + "developer": "dicta-il", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4412, + "hfopenllm_v2/BBH": 0.4256, + "hfopenllm_v2/MATH Level 5": 0.0227, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.3946, + "hfopenllm_v2/MMLU-PRO": 0.2605 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/distilbert.json b/data/developers/distilbert.json new file mode 100644 index 0000000000000000000000000000000000000000..e40493fad80544e55392936d1c3d06d2905b07f6 --- /dev/null +++ b/data/developers/distilbert.json @@ -0,0 +1,19 @@ +{ + "developer": "distilbert", + "models": [ + { + "id": "distilbert/distilgpt2", + "name": "distilgpt2", + "developer": "distilbert", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0611, + "hfopenllm_v2/BBH": 0.3038, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.4207, + "hfopenllm_v2/MMLU-PRO": 0.1187 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/divyanshukunwar.json b/data/developers/divyanshukunwar.json new file mode 100644 index 0000000000000000000000000000000000000000..10b579d03cb68714ba306e97273cc6fd36fd44d7 --- /dev/null +++ b/data/developers/divyanshukunwar.json @@ -0,0 +1,19 @@ +{ + "developer": "divyanshukunwar", + "models": [ + { + "id": "divyanshukunwar/SASTRI_1_9B", + "name": "SASTRI_1_9B", + "developer": "divyanshukunwar", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4207, + "hfopenllm_v2/BBH": 0.468, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.3831, + "hfopenllm_v2/MMLU-PRO": 0.3187 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/djuna-test-lab.json b/data/developers/djuna-test-lab.json new file mode 100644 index 0000000000000000000000000000000000000000..a155d6ff32413d81bbc59a97ce20c2e851ad7bad --- /dev/null +++ b/data/developers/djuna-test-lab.json @@ -0,0 +1,33 @@ +{ + "developer": "djuna-test-lab", + "models": [ + { + "id": "djuna-test-lab/TEST-L3.2-ReWish-3B", + "name": "TEST-L3.2-ReWish-3B", + "developer": "djuna-test-lab", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6368, + "hfopenllm_v2/BBH": 0.4495, + "hfopenllm_v2/MATH Level 5": 0.1367, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3777, + "hfopenllm_v2/MMLU-PRO": 0.3126 + } + }, + { + "id": "djuna-test-lab/TEST-L3.2-ReWish-3B-ties-w-base", + "name": "TEST-L3.2-ReWish-3B-ties-w-base", + "developer": "djuna-test-lab", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6353, + "hfopenllm_v2/BBH": 0.4495, + "hfopenllm_v2/MATH Level 5": 0.1367, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3777, + "hfopenllm_v2/MMLU-PRO": 0.3126 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/djuna.json b/data/developers/djuna.json new file mode 100644 index 0000000000000000000000000000000000000000..722d94d3c6d099c9e6a314b22e61bda73817fd42 --- /dev/null +++ b/data/developers/djuna.json @@ -0,0 +1,215 @@ +{ + "developer": "djuna", + "models": [ + { + "id": "djuna/G2-BigGSHT-27B-2", + "name": "G2-BigGSHT-27B-2", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7974, + "hfopenllm_v2/BBH": 0.6415, + "hfopenllm_v2/MATH Level 5": 0.2349, + "hfopenllm_v2/GPQA": 0.3633, + "hfopenllm_v2/MUSR": 0.4072, + "hfopenllm_v2/MMLU-PRO": 0.4528 + } + }, + { + "id": "djuna/G2-GSHT", + "name": "G2-GSHT", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.563, + "hfopenllm_v2/BBH": 0.527, + "hfopenllm_v2/MATH Level 5": 0.1926, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.4006, + "hfopenllm_v2/MMLU-PRO": 0.307 + } + }, + { + "id": "djuna/Gemma-2-gemmama-9b", + "name": "Gemma-2-gemmama-9b", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7703, + "hfopenllm_v2/BBH": 0.542, + "hfopenllm_v2/MATH Level 5": 0.1926, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.4031, + "hfopenllm_v2/MMLU-PRO": 0.3109 + } + }, + { + "id": "djuna/L3.1-ForStHS", + "name": "L3.1-ForStHS", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7813, + "hfopenllm_v2/BBH": 0.5203, + "hfopenllm_v2/MATH Level 5": 0.1503, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4026, + "hfopenllm_v2/MMLU-PRO": 0.3735 + } + }, + { + "id": "djuna/L3.1-Promissum_Mane-8B-Della-1.5-calc", + "name": "L3.1-Promissum_Mane-8B-Della-1.5-calc", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7235, + "hfopenllm_v2/BBH": 0.5433, + "hfopenllm_v2/MATH Level 5": 0.1639, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4253, + "hfopenllm_v2/MMLU-PRO": 0.3904 + } + }, + { + "id": "djuna/L3.1-Promissum_Mane-8B-Della-calc", + "name": "L3.1-Promissum_Mane-8B-Della-calc", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5442, + "hfopenllm_v2/BBH": 0.5486, + "hfopenllm_v2/MATH Level 5": 0.1843, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.423, + "hfopenllm_v2/MMLU-PRO": 0.3802 + } + }, + { + "id": "djuna/L3.1-Purosani-2-8B", + "name": "L3.1-Purosani-2-8B", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4988, + "hfopenllm_v2/BBH": 0.5182, + "hfopenllm_v2/MATH Level 5": 0.1171, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.3816, + "hfopenllm_v2/MMLU-PRO": 0.3752 + } + }, + { + "id": "djuna/L3.1-Suze-Vume-calc", + "name": "L3.1-Suze-Vume-calc", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7297, + "hfopenllm_v2/BBH": 0.5164, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.3843, + "hfopenllm_v2/MMLU-PRO": 0.3515 + } + }, + { + "id": "djuna/MN-Chinofun", + "name": "MN-Chinofun", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.611, + "hfopenllm_v2/BBH": 0.4953, + "hfopenllm_v2/MATH Level 5": 0.1307, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4084, + "hfopenllm_v2/MMLU-PRO": 0.3603 + } + }, + { + "id": "djuna/MN-Chinofun-12B-2", + "name": "MN-Chinofun-12B-2", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6171, + "hfopenllm_v2/BBH": 0.5037, + "hfopenllm_v2/MATH Level 5": 0.1307, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4268, + "hfopenllm_v2/MMLU-PRO": 0.3615 + } + }, + { + "id": "djuna/MN-Chinofun-12B-3", + "name": "MN-Chinofun-12B-3", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3053, + "hfopenllm_v2/BBH": 0.5348, + "hfopenllm_v2/MATH Level 5": 0.1005, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.4198, + "hfopenllm_v2/MMLU-PRO": 0.3026 + } + }, + { + "id": "djuna/MN-Chinofun-12B-4", + "name": "MN-Chinofun-12B-4", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5404, + "hfopenllm_v2/BBH": 0.5348, + "hfopenllm_v2/MATH Level 5": 0.1118, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4307, + "hfopenllm_v2/MMLU-PRO": 0.3497 + } + }, + { + "id": "djuna/Q2.5-Partron-7B", + "name": "Q2.5-Partron-7B", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7321, + "hfopenllm_v2/BBH": 0.5418, + "hfopenllm_v2/MATH Level 5": 0.4826, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4165, + "hfopenllm_v2/MMLU-PRO": 0.4283 + } + }, + { + "id": "djuna/Q2.5-Veltha-14B", + "name": "Q2.5-Veltha-14B", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8292, + "hfopenllm_v2/BBH": 0.6484, + "hfopenllm_v2/MATH Level 5": 0.4789, + "hfopenllm_v2/GPQA": 0.3591, + "hfopenllm_v2/MUSR": 0.4194, + "hfopenllm_v2/MMLU-PRO": 0.5298 + } + }, + { + "id": "djuna/Q2.5-Veltha-14B-0.5", + "name": "Q2.5-Veltha-14B-0.5", + "developer": "djuna", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7796, + "hfopenllm_v2/BBH": 0.6523, + "hfopenllm_v2/MATH Level 5": 0.4373, + "hfopenllm_v2/GPQA": 0.3683, + "hfopenllm_v2/MUSR": 0.4339, + "hfopenllm_v2/MMLU-PRO": 0.5295 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/dnhkng.json b/data/developers/dnhkng.json new file mode 100644 index 0000000000000000000000000000000000000000..487ace7edbb1d45e7e3210bde7770cc18e628272 --- /dev/null +++ b/data/developers/dnhkng.json @@ -0,0 +1,145 @@ +{ + "developer": "dnhkng", + "models": [ + { + "id": "dnhkng/RYS-Llama-3-8B-Instruct", + "name": "RYS-Llama-3-8B-Instruct", + "developer": "dnhkng", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6958, + "hfopenllm_v2/BBH": 0.4809, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3383, + "hfopenllm_v2/MMLU-PRO": 0.3557 + } + }, + { + "id": "dnhkng/RYS-Llama-3-Huge-Instruct", + "name": "RYS-Llama-3-Huge-Instruct", + "developer": "dnhkng", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7686, + "hfopenllm_v2/BBH": 0.6481, + "hfopenllm_v2/MATH Level 5": 0.2289, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.4208, + "hfopenllm_v2/MMLU-PRO": 0.511 + } + }, + { + "id": "dnhkng/RYS-Llama-3-Large-Instruct", + "name": "RYS-Llama-3-Large-Instruct", + "developer": "dnhkng", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8051, + "hfopenllm_v2/BBH": 0.6525, + "hfopenllm_v2/MATH Level 5": 0.2304, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.418, + "hfopenllm_v2/MMLU-PRO": 0.5137 + } + }, + { + "id": "dnhkng/RYS-Llama-3.1-8B-Instruct", + "name": "RYS-Llama-3.1-8B-Instruct", + "developer": "dnhkng", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7685, + "hfopenllm_v2/BBH": 0.5164, + "hfopenllm_v2/MATH Level 5": 0.1329, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3681, + "hfopenllm_v2/MMLU-PRO": 0.3639 + } + }, + { + "id": "dnhkng/RYS-Llama3.1-Large", + "name": "RYS-Llama3.1-Large", + "developer": "dnhkng", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8492, + "hfopenllm_v2/BBH": 0.6899, + "hfopenllm_v2/MATH Level 5": 0.3505, + "hfopenllm_v2/GPQA": 0.3742, + "hfopenllm_v2/MUSR": 0.4554, + "hfopenllm_v2/MMLU-PRO": 0.5249 + } + }, + { + "id": "dnhkng/RYS-Medium", + "name": "RYS-Medium", + "developer": "dnhkng", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4406, + "hfopenllm_v2/BBH": 0.6285, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4069, + "hfopenllm_v2/MMLU-PRO": 0.4326 + } + }, + { + "id": "dnhkng/RYS-Phi-3-medium-4k-instruct", + "name": "RYS-Phi-3-medium-4k-instruct", + "developer": "dnhkng", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4391, + "hfopenllm_v2/BBH": 0.6226, + "hfopenllm_v2/MATH Level 5": 0.1609, + "hfopenllm_v2/GPQA": 0.3549, + "hfopenllm_v2/MUSR": 0.4253, + "hfopenllm_v2/MMLU-PRO": 0.4846 + } + }, + { + "id": "dnhkng/RYS-XLarge", + "name": "RYS-XLarge", + "developer": "dnhkng", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7996, + "hfopenllm_v2/BBH": 0.705, + "hfopenllm_v2/MATH Level 5": 0.4252, + "hfopenllm_v2/GPQA": 0.3842, + "hfopenllm_v2/MUSR": 0.497, + "hfopenllm_v2/MMLU-PRO": 0.5428 + } + }, + { + "id": "dnhkng/RYS-XLarge-base", + "name": "RYS-XLarge-base", + "developer": "dnhkng", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.791, + "hfopenllm_v2/BBH": 0.7047, + "hfopenllm_v2/MATH Level 5": 0.3792, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.4903, + "hfopenllm_v2/MMLU-PRO": 0.5431 + } + }, + { + "id": "dnhkng/RYS-XLarge2", + "name": "RYS-XLarge2", + "developer": "dnhkng", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4902, + "hfopenllm_v2/BBH": 0.6574, + "hfopenllm_v2/MATH Level 5": 0.2749, + "hfopenllm_v2/GPQA": 0.3742, + "hfopenllm_v2/MUSR": 0.4508, + "hfopenllm_v2/MMLU-PRO": 0.5378 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/dreamgen.json b/data/developers/dreamgen.json new file mode 100644 index 0000000000000000000000000000000000000000..05126ced46ac0037000c9a12a2d2e993b783fadd --- /dev/null +++ b/data/developers/dreamgen.json @@ -0,0 +1,19 @@ +{ + "developer": "dreamgen", + "models": [ + { + "id": "dreamgen/WizardLM-2-7B", + "name": "WizardLM-2-7B", + "developer": "dreamgen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4583, + "hfopenllm_v2/BBH": 0.3487, + "hfopenllm_v2/MATH Level 5": 0.0332, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.3941, + "hfopenllm_v2/MMLU-PRO": 0.266 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/dustinwloring1988.json b/data/developers/dustinwloring1988.json new file mode 100644 index 0000000000000000000000000000000000000000..95c00e3d31f713456520ba502375660b59ba6c39 --- /dev/null +++ b/data/developers/dustinwloring1988.json @@ -0,0 +1,103 @@ +{ + "developer": "dustinwloring1988", + "models": [ + { + "id": "dustinwloring1988/Reflexis-8b-chat-v1", + "name": "Reflexis-8b-chat-v1", + "developer": "dustinwloring1988", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3658, + "hfopenllm_v2/BBH": 0.4664, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3754, + "hfopenllm_v2/MMLU-PRO": 0.3384 + } + }, + { + "id": "dustinwloring1988/Reflexis-8b-chat-v2", + "name": "Reflexis-8b-chat-v2", + "developer": "dustinwloring1988", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3912, + "hfopenllm_v2/BBH": 0.4724, + "hfopenllm_v2/MATH Level 5": 0.1163, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3526, + "hfopenllm_v2/MMLU-PRO": 0.3378 + } + }, + { + "id": "dustinwloring1988/Reflexis-8b-chat-v3", + "name": "Reflexis-8b-chat-v3", + "developer": "dustinwloring1988", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5367, + "hfopenllm_v2/BBH": 0.4658, + "hfopenllm_v2/MATH Level 5": 0.1224, + "hfopenllm_v2/GPQA": 0.2424, + "hfopenllm_v2/MUSR": 0.3512, + "hfopenllm_v2/MMLU-PRO": 0.3548 + } + }, + { + "id": "dustinwloring1988/Reflexis-8b-chat-v4", + "name": "Reflexis-8b-chat-v4", + "developer": "dustinwloring1988", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4698, + "hfopenllm_v2/BBH": 0.4686, + "hfopenllm_v2/MATH Level 5": 0.1027, + "hfopenllm_v2/GPQA": 0.2341, + "hfopenllm_v2/MUSR": 0.3393, + "hfopenllm_v2/MMLU-PRO": 0.339 + } + }, + { + "id": "dustinwloring1988/Reflexis-8b-chat-v5", + "name": "Reflexis-8b-chat-v5", + "developer": "dustinwloring1988", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4238, + "hfopenllm_v2/BBH": 0.4782, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3354, + "hfopenllm_v2/MMLU-PRO": 0.3217 + } + }, + { + "id": "dustinwloring1988/Reflexis-8b-chat-v6", + "name": "Reflexis-8b-chat-v6", + "developer": "dustinwloring1988", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4939, + "hfopenllm_v2/BBH": 0.481, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3753, + "hfopenllm_v2/MMLU-PRO": 0.3479 + } + }, + { + "id": "dustinwloring1988/Reflexis-8b-chat-v7", + "name": "Reflexis-8b-chat-v7", + "developer": "dustinwloring1988", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.398, + "hfopenllm_v2/BBH": 0.481, + "hfopenllm_v2/MATH Level 5": 0.1631, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3222, + "hfopenllm_v2/MMLU-PRO": 0.3643 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/duyhv1411.json b/data/developers/duyhv1411.json new file mode 100644 index 0000000000000000000000000000000000000000..9a83aa2bbbe16700ebdbb1ef8cd4508cced5dc60 --- /dev/null +++ b/data/developers/duyhv1411.json @@ -0,0 +1,33 @@ +{ + "developer": "duyhv1411", + "models": [ + { + "id": "duyhv1411/Llama-3.2-1B-en-vi", + "name": "Llama-3.2-1B-en-vi", + "developer": "duyhv1411", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4788, + "hfopenllm_v2/BBH": 0.3291, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.3197, + "hfopenllm_v2/MMLU-PRO": 0.1341 + } + }, + { + "id": "duyhv1411/Llama-3.2-3B-en-vi", + "name": "Llama-3.2-3B-en-vi", + "developer": "duyhv1411", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4852, + "hfopenllm_v2/BBH": 0.3272, + "hfopenllm_v2/MATH Level 5": 0.0227, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.321, + "hfopenllm_v2/MMLU-PRO": 0.1359 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/dwikitheduck.json b/data/developers/dwikitheduck.json new file mode 100644 index 0000000000000000000000000000000000000000..eb99106d7cc5453b2135a747e87d30df5a5952c6 --- /dev/null +++ b/data/developers/dwikitheduck.json @@ -0,0 +1,89 @@ +{ + "developer": "dwikitheduck", + "models": [ + { + "id": "dwikitheduck/gemma-2-2b-id", + "name": "gemma-2-2b-id", + "developer": "dwikitheduck", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3879, + "hfopenllm_v2/BBH": 0.3962, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4154, + "hfopenllm_v2/MMLU-PRO": 0.2173 + } + }, + { + "id": "dwikitheduck/gemma-2-2b-id-inst", + "name": "gemma-2-2b-id-inst", + "developer": "dwikitheduck", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3879, + "hfopenllm_v2/BBH": 0.3962, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4154, + "hfopenllm_v2/MMLU-PRO": 0.2173 + } + }, + { + "id": "dwikitheduck/gemma-2-2b-id-instruct", + "name": "gemma-2-2b-id-instruct", + "developer": "dwikitheduck", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3879, + "hfopenllm_v2/BBH": 0.3962, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4154, + "hfopenllm_v2/MMLU-PRO": 0.2173 + } + }, + { + "id": "dwikitheduck/gen-inst-1", + "name": "gen-inst-1", + "developer": "dwikitheduck", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.775, + "hfopenllm_v2/BBH": 0.642, + "hfopenllm_v2/MATH Level 5": 0.4554, + "hfopenllm_v2/GPQA": 0.3716, + "hfopenllm_v2/MUSR": 0.4205, + "hfopenllm_v2/MMLU-PRO": 0.5089 + } + }, + { + "id": "dwikitheduck/gen-try1", + "name": "gen-try1", + "developer": "dwikitheduck", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7522, + "hfopenllm_v2/BBH": 0.6359, + "hfopenllm_v2/MATH Level 5": 0.4101, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.4416, + "hfopenllm_v2/MMLU-PRO": 0.5111 + } + }, + { + "id": "dwikitheduck/gen-try1-notemp", + "name": "gen-try1-notemp", + "developer": "dwikitheduck", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2627, + "hfopenllm_v2/BBH": 0.6263, + "hfopenllm_v2/MATH Level 5": 0.318, + "hfopenllm_v2/GPQA": 0.354, + "hfopenllm_v2/MUSR": 0.4714, + "hfopenllm_v2/MMLU-PRO": 0.521 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/dzakwan.json b/data/developers/dzakwan.json new file mode 100644 index 0000000000000000000000000000000000000000..7df352cfc70a8a60b8af56b5f8ac2199d97a2a54 --- /dev/null +++ b/data/developers/dzakwan.json @@ -0,0 +1,19 @@ +{ + "developer": "dzakwan", + "models": [ + { + "id": "dzakwan/dzakwan-MoE-4x7b-Beta", + "name": "dzakwan-MoE-4x7b-Beta", + "developer": "dzakwan", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4443, + "hfopenllm_v2/BBH": 0.514, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4267, + "hfopenllm_v2/MMLU-PRO": 0.3108 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ehristoforu.json b/data/developers/ehristoforu.json new file mode 100644 index 0000000000000000000000000000000000000000..7a76d1f8b6441a01a07918f7360c31d78daf2b83 --- /dev/null +++ b/data/developers/ehristoforu.json @@ -0,0 +1,509 @@ +{ + "developer": "ehristoforu", + "models": [ + { + "id": "ehristoforu/Falcon3-8B-Franken-Basestruct", + "name": "Falcon3-8B-Franken-Basestruct", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1715, + "hfopenllm_v2/BBH": 0.5463, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.3406, + "hfopenllm_v2/MUSR": 0.3555, + "hfopenllm_v2/MMLU-PRO": 0.3947 + } + }, + { + "id": "ehristoforu/Falcon3-MoE-2x7B-Insruct", + "name": "Falcon3-MoE-2x7B-Insruct", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7643, + "hfopenllm_v2/BBH": 0.5648, + "hfopenllm_v2/MATH Level 5": 0.4124, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.484, + "hfopenllm_v2/MMLU-PRO": 0.4095 + } + }, + { + "id": "ehristoforu/Gemma2-9B-it-psy10k-mental_health", + "name": "Gemma2-9B-it-psy10k-mental_health", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5887, + "hfopenllm_v2/BBH": 0.5539, + "hfopenllm_v2/MATH Level 5": 0.1631, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.4086, + "hfopenllm_v2/MMLU-PRO": 0.3829 + } + }, + { + "id": "ehristoforu/Gemma2-9b-it-train6", + "name": "Gemma2-9b-it-train6", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7025, + "hfopenllm_v2/BBH": 0.5898, + "hfopenllm_v2/MATH Level 5": 0.1911, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4084, + "hfopenllm_v2/MMLU-PRO": 0.3942 + } + }, + { + "id": "ehristoforu/HappyLlama1", + "name": "HappyLlama1", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7363, + "hfopenllm_v2/BBH": 0.4996, + "hfopenllm_v2/MATH Level 5": 0.1427, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4287, + "hfopenllm_v2/MMLU-PRO": 0.3546 + } + }, + { + "id": "ehristoforu/QwenQwen2.5-7B-IT", + "name": "QwenQwen2.5-7B-IT", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7518, + "hfopenllm_v2/BBH": 0.5398, + "hfopenllm_v2/MATH Level 5": 0.5091, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4034, + "hfopenllm_v2/MMLU-PRO": 0.4289 + } + }, + { + "id": "ehristoforu/QwenQwen2.5-7B-IT-Dare", + "name": "QwenQwen2.5-7B-IT-Dare", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7509, + "hfopenllm_v2/BBH": 0.5398, + "hfopenllm_v2/MATH Level 5": 0.5091, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4034, + "hfopenllm_v2/MMLU-PRO": 0.4289 + } + }, + { + "id": "ehristoforu/RQwen-v0.1", + "name": "RQwen-v0.1", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7625, + "hfopenllm_v2/BBH": 0.6446, + "hfopenllm_v2/MATH Level 5": 0.4645, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.4139, + "hfopenllm_v2/MMLU-PRO": 0.5202 + } + }, + { + "id": "ehristoforu/RQwen-v0.2", + "name": "RQwen-v0.2", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7504, + "hfopenllm_v2/BBH": 0.6427, + "hfopenllm_v2/MATH Level 5": 0.327, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.4207, + "hfopenllm_v2/MMLU-PRO": 0.5159 + } + }, + { + "id": "ehristoforu/SoRu-0009", + "name": "SoRu-0009", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2582, + "hfopenllm_v2/BBH": 0.315, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3369, + "hfopenllm_v2/MMLU-PRO": 0.1239 + } + }, + { + "id": "ehristoforu/coolqwen-3b-it", + "name": "coolqwen-3b-it", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6473, + "hfopenllm_v2/BBH": 0.4851, + "hfopenllm_v2/MATH Level 5": 0.3671, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4125, + "hfopenllm_v2/MMLU-PRO": 0.3601 + } + }, + { + "id": "ehristoforu/della-70b-test-v1", + "name": "della-70b-test-v1", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4979, + "hfopenllm_v2/BBH": 0.3029, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.4555, + "hfopenllm_v2/MMLU-PRO": 0.1575 + } + }, + { + "id": "ehristoforu/falcon3-ultraset", + "name": "falcon3-ultraset", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7135, + "hfopenllm_v2/BBH": 0.5584, + "hfopenllm_v2/MATH Level 5": 0.2122, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.4853, + "hfopenllm_v2/MMLU-PRO": 0.3982 + } + }, + { + "id": "ehristoforu/fd-lora-merged-16x32", + "name": "fd-lora-merged-16x32", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3481, + "hfopenllm_v2/BBH": 0.3308, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3514, + "hfopenllm_v2/MMLU-PRO": 0.1205 + } + }, + { + "id": "ehristoforu/fd-lora-merged-64x128", + "name": "fd-lora-merged-64x128", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3281, + "hfopenllm_v2/BBH": 0.3345, + "hfopenllm_v2/MATH Level 5": 0.1873, + "hfopenllm_v2/GPQA": 0.255, + "hfopenllm_v2/MUSR": 0.3368, + "hfopenllm_v2/MMLU-PRO": 0.1537 + } + }, + { + "id": "ehristoforu/fp4-14b-it-v1", + "name": "fp4-14b-it-v1", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2535, + "hfopenllm_v2/BBH": 0.574, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.3595, + "hfopenllm_v2/MMLU-PRO": 0.4205 + } + }, + { + "id": "ehristoforu/fp4-14b-v1-fix", + "name": "fp4-14b-v1-fix", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6742, + "hfopenllm_v2/BBH": 0.6817, + "hfopenllm_v2/MATH Level 5": 0.4207, + "hfopenllm_v2/GPQA": 0.354, + "hfopenllm_v2/MUSR": 0.4532, + "hfopenllm_v2/MMLU-PRO": 0.5353 + } + }, + { + "id": "ehristoforu/fq2.5-7b-it-normalize_false", + "name": "fq2.5-7b-it-normalize_false", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7399, + "hfopenllm_v2/BBH": 0.552, + "hfopenllm_v2/MATH Level 5": 0.4622, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4612, + "hfopenllm_v2/MMLU-PRO": 0.4413 + } + }, + { + "id": "ehristoforu/fq2.5-7b-it-normalize_true", + "name": "fq2.5-7b-it-normalize_true", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7399, + "hfopenllm_v2/BBH": 0.552, + "hfopenllm_v2/MATH Level 5": 0.4622, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4612, + "hfopenllm_v2/MMLU-PRO": 0.4413 + } + }, + { + "id": "ehristoforu/frqwen2.5-from7b-duable4layers-it", + "name": "frqwen2.5-from7b-duable4layers-it", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7729, + "hfopenllm_v2/BBH": 0.5264, + "hfopenllm_v2/MATH Level 5": 0.4509, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4166, + "hfopenllm_v2/MMLU-PRO": 0.4126 + } + }, + { + "id": "ehristoforu/frqwen2.5-from7b-it", + "name": "frqwen2.5-from7b-it", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6532, + "hfopenllm_v2/BBH": 0.5143, + "hfopenllm_v2/MATH Level 5": 0.2923, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4086, + "hfopenllm_v2/MMLU-PRO": 0.3977 + } + }, + { + "id": "ehristoforu/mllama-3.1-8b-instruct", + "name": "mllama-3.1-8b-instruct", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3458, + "hfopenllm_v2/BBH": 0.4718, + "hfopenllm_v2/MATH Level 5": 0.3776, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.338, + "hfopenllm_v2/MMLU-PRO": 0.2533 + } + }, + { + "id": "ehristoforu/mllama-3.1-8b-it", + "name": "mllama-3.1-8b-it", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3879, + "hfopenllm_v2/BBH": 0.4868, + "hfopenllm_v2/MATH Level 5": 0.3799, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.3349, + "hfopenllm_v2/MMLU-PRO": 0.2622 + } + }, + { + "id": "ehristoforu/moremerge", + "name": "moremerge", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2019, + "hfopenllm_v2/BBH": 0.2868, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3566, + "hfopenllm_v2/MMLU-PRO": 0.1065 + } + }, + { + "id": "ehristoforu/moremerge-upscaled", + "name": "moremerge-upscaled", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1979, + "hfopenllm_v2/BBH": 0.2698, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2466, + "hfopenllm_v2/MUSR": 0.3593, + "hfopenllm_v2/MMLU-PRO": 0.1041 + } + }, + { + "id": "ehristoforu/phi-4-25b", + "name": "phi-4-25b", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6484, + "hfopenllm_v2/BBH": 0.6908, + "hfopenllm_v2/MATH Level 5": 0.4524, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4208, + "hfopenllm_v2/MMLU-PRO": 0.5351 + } + }, + { + "id": "ehristoforu/qwen2.5-test-32b-it", + "name": "qwen2.5-test-32b-it", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7889, + "hfopenllm_v2/BBH": 0.7081, + "hfopenllm_v2/MATH Level 5": 0.5974, + "hfopenllm_v2/GPQA": 0.3641, + "hfopenllm_v2/MUSR": 0.4578, + "hfopenllm_v2/MMLU-PRO": 0.5765 + } + }, + { + "id": "ehristoforu/qwen2.5-with-lora-think-3b-it", + "name": "qwen2.5-with-lora-think-3b-it", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5319, + "hfopenllm_v2/BBH": 0.4687, + "hfopenllm_v2/MATH Level 5": 0.2364, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.431, + "hfopenllm_v2/MMLU-PRO": 0.3403 + } + }, + { + "id": "ehristoforu/rmoe-v1", + "name": "rmoe-v1", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.265, + "hfopenllm_v2/BBH": 0.2929, + "hfopenllm_v2/MATH Level 5": 0.0015, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3663, + "hfopenllm_v2/MMLU-PRO": 0.1125 + } + }, + { + "id": "ehristoforu/rufalcon3-3b-it", + "name": "rufalcon3-3b-it", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5942, + "hfopenllm_v2/BBH": 0.4155, + "hfopenllm_v2/MATH Level 5": 0.1782, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3895, + "hfopenllm_v2/MMLU-PRO": 0.2348 + } + }, + { + "id": "ehristoforu/ruphi-4b", + "name": "ruphi-4b", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1752, + "hfopenllm_v2/BBH": 0.2906, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2399, + "hfopenllm_v2/MUSR": 0.3512, + "hfopenllm_v2/MMLU-PRO": 0.1126 + } + }, + { + "id": "ehristoforu/testq-32b", + "name": "testq-32b", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1876, + "hfopenllm_v2/BBH": 0.2877, + "hfopenllm_v2/MATH Level 5": 0.003, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3715, + "hfopenllm_v2/MMLU-PRO": 0.1166 + } + }, + { + "id": "ehristoforu/tmoe", + "name": "tmoe", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1193, + "hfopenllm_v2/BBH": 0.3073, + "hfopenllm_v2/MATH Level 5": 0.0076, + "hfopenllm_v2/GPQA": 0.2232, + "hfopenllm_v2/MUSR": 0.3699, + "hfopenllm_v2/MMLU-PRO": 0.1191 + } + }, + { + "id": "ehristoforu/tmoe-v2", + "name": "tmoe-v2", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1903, + "hfopenllm_v2/BBH": 0.2897, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.4151, + "hfopenllm_v2/MMLU-PRO": 0.11 + } + }, + { + "id": "ehristoforu/trd-7b-it", + "name": "trd-7b-it", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2185, + "hfopenllm_v2/BBH": 0.299, + "hfopenllm_v2/MATH Level 5": 0.0317, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3794, + "hfopenllm_v2/MMLU-PRO": 0.1179 + } + }, + { + "id": "ehristoforu/ud-14b", + "name": "ud-14b", + "developer": "ehristoforu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4235, + "hfopenllm_v2/BBH": 0.3324, + "hfopenllm_v2/MATH Level 5": 0.1903, + "hfopenllm_v2/GPQA": 0.2374, + "hfopenllm_v2/MUSR": 0.4394, + "hfopenllm_v2/MMLU-PRO": 0.2415 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/eleutherai.json b/data/developers/eleutherai.json new file mode 100644 index 0000000000000000000000000000000000000000..05408b0d99003f1925545d7ec738bb633a752eea --- /dev/null +++ b/data/developers/eleutherai.json @@ -0,0 +1,51 @@ +{ + "developer": "eleutherai", + "models": [ + { + "id": "eleutherai/Pythia-12B", + "name": "Pythia 12B", + "developer": "eleutherai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.257, + "helm_classic/MMLU": 0.274, + "helm_classic/BoolQ": 0.662, + "helm_classic/NarrativeQA": 0.596, + "helm_classic/NaturalQuestions (open-book)": 0.581, + "helm_classic/QuAC": 0.313, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.177, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.931, + "helm_classic/CivilComments": 0.531, + "helm_classic/RAFT": 0.514 + } + }, + { + "id": "eleutherai/Pythia-6.9B", + "name": "Pythia 6.9B", + "developer": "eleutherai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.196, + "helm_classic/MMLU": 0.236, + "helm_classic/BoolQ": 0.631, + "helm_classic/NarrativeQA": 0.528, + "helm_classic/NaturalQuestions (open-book)": 0.539, + "helm_classic/QuAC": 0.296, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.213, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.928, + "helm_classic/CivilComments": 0.511, + "helm_classic/RAFT": 0.502 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/elinas.json b/data/developers/elinas.json new file mode 100644 index 0000000000000000000000000000000000000000..653f6b1374f0460ec8082dd82f66e3a98606d82d --- /dev/null +++ b/data/developers/elinas.json @@ -0,0 +1,19 @@ +{ + "developer": "elinas", + "models": [ + { + "id": "elinas/Chronos-Gold-12B-1.0", + "name": "Chronos-Gold-12B-1.0", + "developer": "elinas", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3166, + "hfopenllm_v2/BBH": 0.5515, + "hfopenllm_v2/MATH Level 5": 0.0695, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.474, + "hfopenllm_v2/MMLU-PRO": 0.3518 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ell44ot.json b/data/developers/ell44ot.json new file mode 100644 index 0000000000000000000000000000000000000000..6cb8e6dc7481b51ca955f70a13a7130fea6e83d3 --- /dev/null +++ b/data/developers/ell44ot.json @@ -0,0 +1,19 @@ +{ + "developer": "ell44ot", + "models": [ + { + "id": "ell44ot/gemma-2b-def", + "name": "gemma-2b-def", + "developer": "ell44ot", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2693, + "hfopenllm_v2/BBH": 0.3159, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.367, + "hfopenllm_v2/MMLU-PRO": 0.1572 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/euclaise.json b/data/developers/euclaise.json new file mode 100644 index 0000000000000000000000000000000000000000..93a139a4ab4cddc7cc9009dc8e0af3539662409f --- /dev/null +++ b/data/developers/euclaise.json @@ -0,0 +1,19 @@ +{ + "developer": "euclaise", + "models": [ + { + "id": "euclaise/ReMask-3B", + "name": "ReMask-3B", + "developer": "euclaise", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2419, + "hfopenllm_v2/BBH": 0.3517, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3341, + "hfopenllm_v2/MMLU-PRO": 0.1357 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/eworojoshua.json b/data/developers/eworojoshua.json new file mode 100644 index 0000000000000000000000000000000000000000..90484cd7279ed512276ecc4e78d37a5c24ae0289 --- /dev/null +++ b/data/developers/eworojoshua.json @@ -0,0 +1,19 @@ +{ + "developer": "eworojoshua", + "models": [ + { + "id": "eworojoshua/vas-01", + "name": "vas-01", + "developer": "eworojoshua", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7612, + "hfopenllm_v2/BBH": 0.5418, + "hfopenllm_v2/MATH Level 5": 0.4736, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4432, + "hfopenllm_v2/MMLU-PRO": 0.4348 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ewre324.json b/data/developers/ewre324.json new file mode 100644 index 0000000000000000000000000000000000000000..9ef5db19b292c6f511170d4ace64ffe317ec21e3 --- /dev/null +++ b/data/developers/ewre324.json @@ -0,0 +1,61 @@ +{ + "developer": "ewre324", + "models": [ + { + "id": "ewre324/Thinker-Llama-3.2-3B-Instruct-Reasoning", + "name": "Thinker-Llama-3.2-3B-Instruct-Reasoning", + "developer": "ewre324", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4439, + "hfopenllm_v2/BBH": 0.4273, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.3655, + "hfopenllm_v2/MMLU-PRO": 0.2886 + } + }, + { + "id": "ewre324/Thinker-Qwen2.5-0.5B-Instruct-Reasoning", + "name": "Thinker-Qwen2.5-0.5B-Instruct-Reasoning", + "developer": "ewre324", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2476, + "hfopenllm_v2/BBH": 0.3292, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.3382, + "hfopenllm_v2/MMLU-PRO": 0.1647 + } + }, + { + "id": "ewre324/Thinker-SmolLM2-135M-Instruct-Reasoning", + "name": "Thinker-SmolLM2-135M-Instruct-Reasoning", + "developer": "ewre324", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2584, + "hfopenllm_v2/BBH": 0.3071, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3661, + "hfopenllm_v2/MMLU-PRO": 0.1094 + } + }, + { + "id": "ewre324/ewre324-R1-SmolLM2-135M-Distill", + "name": "ewre324-R1-SmolLM2-135M-Distill", + "developer": "ewre324", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1649, + "hfopenllm_v2/BBH": 0.3042, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3409, + "hfopenllm_v2/MMLU-PRO": 0.1134 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/experiment-llm.json b/data/developers/experiment-llm.json new file mode 100644 index 0000000000000000000000000000000000000000..fe679ed014b617a49cefb46b00c2228700dca44c --- /dev/null +++ b/data/developers/experiment-llm.json @@ -0,0 +1,19 @@ +{ + "developer": "experiment-llm", + "models": [ + { + "id": "experiment-llm/exp-3-q-r", + "name": "exp-3-q-r", + "developer": "experiment-llm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6036, + "hfopenllm_v2/BBH": 0.5397, + "hfopenllm_v2/MATH Level 5": 0.2787, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4315, + "hfopenllm_v2/MMLU-PRO": 0.4316 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/facebook.json b/data/developers/facebook.json new file mode 100644 index 0000000000000000000000000000000000000000..e8b1799fe5c6c7907f0e84e1d072b046d122b110 --- /dev/null +++ b/data/developers/facebook.json @@ -0,0 +1,59 @@ +{ + "developer": "facebook", + "models": [ + { + "id": "facebook/Self-taught-Llama-3-70B", + "name": "facebook/Self-taught-Llama-3-70B", + "developer": "facebook", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8863, + "reward-bench/Chat": 0.9693, + "reward-bench/Chat Hard": 0.8399, + "reward-bench/Safety": 0.9108, + "reward-bench/Reasoning": 0.8251 + } + }, + { + "id": "facebook/Self-taught-evaluator-llama3.1-70B", + "name": "facebook/Self-taught-evaluator-llama3.1-70B", + "developer": "facebook", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9001, + "reward-bench/Chat": 0.9693, + "reward-bench/Chat Hard": 0.8509, + "reward-bench/Safety": 0.8959, + "reward-bench/Reasoning": 0.8844 + } + }, + { + "id": "facebook/opt-1.3b", + "name": "opt-1.3b", + "developer": "facebook", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2383, + "hfopenllm_v2/BBH": 0.3094, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2424, + "hfopenllm_v2/MUSR": 0.342, + "hfopenllm_v2/MMLU-PRO": 0.1107 + } + }, + { + "id": "facebook/opt-30b", + "name": "opt-30b", + "developer": "facebook", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2453, + "hfopenllm_v2/BBH": 0.307, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3604, + "hfopenllm_v2/MMLU-PRO": 0.1164 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/failspy.json b/data/developers/failspy.json new file mode 100644 index 0000000000000000000000000000000000000000..ec3d856dbe4ae19eb1d5735bcd30eec43a831383 --- /dev/null +++ b/data/developers/failspy.json @@ -0,0 +1,89 @@ +{ + "developer": "failspy", + "models": [ + { + "id": "failspy/Llama-3-8B-Instruct-MopeyMule", + "name": "Llama-3-8B-Instruct-MopeyMule", + "developer": "failspy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.675, + "hfopenllm_v2/BBH": 0.3839, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2391, + "hfopenllm_v2/MUSR": 0.3513, + "hfopenllm_v2/MMLU-PRO": 0.1764 + } + }, + { + "id": "failspy/Llama-3-8B-Instruct-abliterated", + "name": "Llama-3-8B-Instruct-abliterated", + "developer": "failspy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5909, + "hfopenllm_v2/BBH": 0.4354, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.4116, + "hfopenllm_v2/MMLU-PRO": 0.2742 + } + }, + { + "id": "failspy/Meta-Llama-3-70B-Instruct-abliterated-v3.5", + "name": "Meta-Llama-3-70B-Instruct-abliterated-v3.5", + "developer": "failspy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7747, + "hfopenllm_v2/BBH": 0.5747, + "hfopenllm_v2/MATH Level 5": 0.1284, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.3982, + "hfopenllm_v2/MMLU-PRO": 0.4452 + } + }, + { + "id": "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3", + "name": "Meta-Llama-3-8B-Instruct-abliterated-v3", + "developer": "failspy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7245, + "hfopenllm_v2/BBH": 0.4925, + "hfopenllm_v2/MATH Level 5": 0.0959, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3622, + "hfopenllm_v2/MMLU-PRO": 0.3654 + } + }, + { + "id": "failspy/Phi-3-medium-4k-instruct-abliterated-v3", + "name": "Phi-3-medium-4k-instruct-abliterated-v3", + "developer": "failspy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6319, + "hfopenllm_v2/BBH": 0.6305, + "hfopenllm_v2/MATH Level 5": 0.1594, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4604, + "hfopenllm_v2/MMLU-PRO": 0.44 + } + }, + { + "id": "failspy/llama-3-70B-Instruct-abliterated", + "name": "llama-3-70B-Instruct-abliterated", + "developer": "failspy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8023, + "hfopenllm_v2/BBH": 0.6465, + "hfopenllm_v2/MATH Level 5": 0.2432, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4128, + "hfopenllm_v2/MMLU-PRO": 0.5145 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/fblgit.json b/data/developers/fblgit.json new file mode 100644 index 0000000000000000000000000000000000000000..6c66e5b38a834fb39aa5d0ee4cfd84d4996c80a2 --- /dev/null +++ b/data/developers/fblgit.json @@ -0,0 +1,159 @@ +{ + "developer": "fblgit", + "models": [ + { + "id": "fblgit/TheBeagle-v2beta-32B-MGS", + "name": "TheBeagle-v2beta-32B-MGS", + "developer": "fblgit", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4503, + "hfopenllm_v2/BBH": 0.7035, + "hfopenllm_v2/MATH Level 5": 0.3943, + "hfopenllm_v2/GPQA": 0.401, + "hfopenllm_v2/MUSR": 0.5021, + "hfopenllm_v2/MMLU-PRO": 0.5911 + } + }, + { + "id": "fblgit/UNA-SimpleSmaug-34b-v1beta", + "name": "UNA-SimpleSmaug-34b-v1beta", + "developer": "fblgit", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4556, + "hfopenllm_v2/BBH": 0.5287, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4256, + "hfopenllm_v2/MMLU-PRO": 0.454 + } + }, + { + "id": "fblgit/UNA-TheBeagle-7b-v1", + "name": "UNA-TheBeagle-7b-v1", + "developer": "fblgit", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3689, + "hfopenllm_v2/BBH": 0.5029, + "hfopenllm_v2/MATH Level 5": 0.077, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4564, + "hfopenllm_v2/MMLU-PRO": 0.3019 + } + }, + { + "id": "fblgit/UNA-ThePitbull-21.4B-v2", + "name": "UNA-ThePitbull-21.4B-v2", + "developer": "fblgit", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.379, + "hfopenllm_v2/BBH": 0.635, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.3922, + "hfopenllm_v2/MMLU-PRO": 0.3516 + } + }, + { + "id": "fblgit/cybertron-v4-qw7B-MGS", + "name": "cybertron-v4-qw7B-MGS", + "developer": "fblgit", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6264, + "hfopenllm_v2/BBH": 0.5592, + "hfopenllm_v2/MATH Level 5": 0.3489, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4371, + "hfopenllm_v2/MMLU-PRO": 0.4473 + } + }, + { + "id": "fblgit/cybertron-v4-qw7B-UNAMGS", + "name": "cybertron-v4-qw7B-UNAMGS", + "developer": "fblgit", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.609, + "hfopenllm_v2/BBH": 0.5643, + "hfopenllm_v2/MATH Level 5": 0.3731, + "hfopenllm_v2/GPQA": 0.3314, + "hfopenllm_v2/MUSR": 0.4343, + "hfopenllm_v2/MMLU-PRO": 0.45 + } + }, + { + "id": "fblgit/juanako-7b-UNA", + "name": "juanako-7b-UNA", + "developer": "fblgit", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4837, + "hfopenllm_v2/BBH": 0.507, + "hfopenllm_v2/MATH Level 5": 0.034, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4645, + "hfopenllm_v2/MMLU-PRO": 0.2771 + } + }, + { + "id": "fblgit/miniclaus-qw1.5B-UNAMGS", + "name": "miniclaus-qw1.5B-UNAMGS", + "developer": "fblgit", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3348, + "hfopenllm_v2/BBH": 0.4239, + "hfopenllm_v2/MATH Level 5": 0.1088, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4293, + "hfopenllm_v2/MMLU-PRO": 0.2937 + } + }, + { + "id": "fblgit/miniclaus-qw1.5B-UNAMGS-GRPO", + "name": "miniclaus-qw1.5B-UNAMGS-GRPO", + "developer": "fblgit", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3518, + "hfopenllm_v2/BBH": 0.4234, + "hfopenllm_v2/MATH Level 5": 0.1103, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4254, + "hfopenllm_v2/MMLU-PRO": 0.2945 + } + }, + { + "id": "fblgit/pancho-v1-qw25-3B-UNAMGS", + "name": "pancho-v1-qw25-3B-UNAMGS", + "developer": "fblgit", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5361, + "hfopenllm_v2/BBH": 0.4926, + "hfopenllm_v2/MATH Level 5": 0.1571, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4027, + "hfopenllm_v2/MMLU-PRO": 0.3766 + } + }, + { + "id": "fblgit/una-cybertron-7b-v2-bf16", + "name": "una-cybertron-7b-v2-bf16", + "developer": "fblgit", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4737, + "hfopenllm_v2/BBH": 0.3973, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4473, + "hfopenllm_v2/MMLU-PRO": 0.2443 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/fhai50032.json b/data/developers/fhai50032.json new file mode 100644 index 0000000000000000000000000000000000000000..018ad4362c945f75a3b06b06adb8c08393c3d413 --- /dev/null +++ b/data/developers/fhai50032.json @@ -0,0 +1,33 @@ +{ + "developer": "fhai50032", + "models": [ + { + "id": "fhai50032/RolePlayLake-7B", + "name": "RolePlayLake-7B", + "developer": "fhai50032", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5057, + "hfopenllm_v2/BBH": 0.5252, + "hfopenllm_v2/MATH Level 5": 0.0725, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4459, + "hfopenllm_v2/MMLU-PRO": 0.316 + } + }, + { + "id": "fhai50032/Unaligned-Thinker-PHI-4", + "name": "Unaligned-Thinker-PHI-4", + "developer": "fhai50032", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0563, + "hfopenllm_v2/BBH": 0.6643, + "hfopenllm_v2/MATH Level 5": 0.3353, + "hfopenllm_v2/GPQA": 0.3809, + "hfopenllm_v2/MUSR": 0.4679, + "hfopenllm_v2/MMLU-PRO": 0.5147 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/flammenai.json b/data/developers/flammenai.json new file mode 100644 index 0000000000000000000000000000000000000000..a3b8c231f633908cb340169e30bb19361f478bba --- /dev/null +++ b/data/developers/flammenai.json @@ -0,0 +1,89 @@ +{ + "developer": "flammenai", + "models": [ + { + "id": "flammenai/Llama3.1-Flammades-70B", + "name": "Llama3.1-Flammades-70B", + "developer": "flammenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7058, + "hfopenllm_v2/BBH": 0.666, + "hfopenllm_v2/MATH Level 5": 0.2092, + "hfopenllm_v2/GPQA": 0.354, + "hfopenllm_v2/MUSR": 0.4871, + "hfopenllm_v2/MMLU-PRO": 0.4752 + } + }, + { + "id": "flammenai/Mahou-1.2a-llama3-8B", + "name": "Mahou-1.2a-llama3-8B", + "developer": "flammenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5093, + "hfopenllm_v2/BBH": 0.5094, + "hfopenllm_v2/MATH Level 5": 0.0838, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3847, + "hfopenllm_v2/MMLU-PRO": 0.3817 + } + }, + { + "id": "flammenai/Mahou-1.2a-mistral-7B", + "name": "Mahou-1.2a-mistral-7B", + "developer": "flammenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4552, + "hfopenllm_v2/BBH": 0.5118, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3896, + "hfopenllm_v2/MMLU-PRO": 0.3163 + } + }, + { + "id": "flammenai/Mahou-1.5-llama3.1-70B", + "name": "Mahou-1.5-llama3.1-70B", + "developer": "flammenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7147, + "hfopenllm_v2/BBH": 0.6651, + "hfopenllm_v2/MATH Level 5": 0.21, + "hfopenllm_v2/GPQA": 0.354, + "hfopenllm_v2/MUSR": 0.495, + "hfopenllm_v2/MMLU-PRO": 0.4749 + } + }, + { + "id": "flammenai/Mahou-1.5-mistral-nemo-12B", + "name": "Mahou-1.5-mistral-nemo-12B", + "developer": "flammenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6751, + "hfopenllm_v2/BBH": 0.5522, + "hfopenllm_v2/MATH Level 5": 0.0869, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.452, + "hfopenllm_v2/MMLU-PRO": 0.3602 + } + }, + { + "id": "flammenai/flammen15-gutenberg-DPO-v1-7B", + "name": "flammen15-gutenberg-DPO-v1-7B", + "developer": "flammenai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4798, + "hfopenllm_v2/BBH": 0.5203, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4293, + "hfopenllm_v2/MMLU-PRO": 0.3186 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/fluently-lm.json b/data/developers/fluently-lm.json new file mode 100644 index 0000000000000000000000000000000000000000..0e6c9848b32304c6529c8bfaa085da0f02940129 --- /dev/null +++ b/data/developers/fluently-lm.json @@ -0,0 +1,47 @@ +{ + "developer": "fluently-lm", + "models": [ + { + "id": "fluently-lm/FluentlyLM-Prinum", + "name": "FluentlyLM-Prinum", + "developer": "fluently-lm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.809, + "hfopenllm_v2/BBH": 0.7144, + "hfopenllm_v2/MATH Level 5": 0.54, + "hfopenllm_v2/GPQA": 0.3867, + "hfopenllm_v2/MUSR": 0.4471, + "hfopenllm_v2/MMLU-PRO": 0.5808 + } + }, + { + "id": "fluently-lm/Llama-TI-8B", + "name": "Llama-TI-8B", + "developer": "fluently-lm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.288, + "hfopenllm_v2/BBH": 0.5201, + "hfopenllm_v2/MATH Level 5": 0.1964, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4103, + "hfopenllm_v2/MMLU-PRO": 0.344 + } + }, + { + "id": "fluently-lm/Llama-TI-8B-Instruct", + "name": "Llama-TI-8B-Instruct", + "developer": "fluently-lm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7716, + "hfopenllm_v2/BBH": 0.5252, + "hfopenllm_v2/MATH Level 5": 0.2304, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.3813, + "hfopenllm_v2/MMLU-PRO": 0.3726 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/fluently-sets.json b/data/developers/fluently-sets.json new file mode 100644 index 0000000000000000000000000000000000000000..42343f86d625db8cce5578283dd28c1d1ca72350 --- /dev/null +++ b/data/developers/fluently-sets.json @@ -0,0 +1,33 @@ +{ + "developer": "fluently-sets", + "models": [ + { + "id": "fluently-sets/FalconThink3-10B-IT", + "name": "FalconThink3-10B-IT", + "developer": "fluently-sets", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7326, + "hfopenllm_v2/BBH": 0.62, + "hfopenllm_v2/MATH Level 5": 0.2447, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4479, + "hfopenllm_v2/MMLU-PRO": 0.4435 + } + }, + { + "id": "fluently-sets/reasoning-1-1k-demo", + "name": "reasoning-1-1k-demo", + "developer": "fluently-sets", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7525, + "hfopenllm_v2/BBH": 0.6397, + "hfopenllm_v2/MATH Level 5": 0.4282, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.4061, + "hfopenllm_v2/MMLU-PRO": 0.4774 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/formulae.json b/data/developers/formulae.json new file mode 100644 index 0000000000000000000000000000000000000000..a90660346e063775466ae46cd6fd73ac7b92344a --- /dev/null +++ b/data/developers/formulae.json @@ -0,0 +1,145 @@ +{ + "developer": "formulae", + "models": [ + { + "id": "formulae/mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp", + "name": "mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp", + "developer": "formulae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1614, + "hfopenllm_v2/BBH": 0.2976, + "hfopenllm_v2/MATH Level 5": 0.0015, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.4219, + "hfopenllm_v2/MMLU-PRO": 0.1174 + } + }, + { + "id": "formulae/mita-elite-v1.1-7b-2-25-2025", + "name": "mita-elite-v1.1-7b-2-25-2025", + "developer": "formulae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.125, + "hfopenllm_v2/BBH": 0.2867, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.3487, + "hfopenllm_v2/MMLU-PRO": 0.1098 + } + }, + { + "id": "formulae/mita-elite-v1.1-gen2-7b-2-25-2025", + "name": "mita-elite-v1.1-gen2-7b-2-25-2025", + "developer": "formulae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1411, + "hfopenllm_v2/BBH": 0.2924, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3541, + "hfopenllm_v2/MMLU-PRO": 0.1101 + } + }, + { + "id": "formulae/mita-elite-v1.2-7b-2-26-2025", + "name": "mita-elite-v1.2-7b-2-26-2025", + "developer": "formulae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.148, + "hfopenllm_v2/BBH": 0.293, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.4287, + "hfopenllm_v2/MMLU-PRO": 0.1186 + } + }, + { + "id": "formulae/mita-gen3-7b-2-26-2025", + "name": "mita-gen3-7b-2-26-2025", + "developer": "formulae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1964, + "hfopenllm_v2/BBH": 0.2916, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3912, + "hfopenllm_v2/MMLU-PRO": 0.1124 + } + }, + { + "id": "formulae/mita-gen3-v1.2-7b-2-26-2025", + "name": "mita-gen3-v1.2-7b-2-26-2025", + "developer": "formulae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2044, + "hfopenllm_v2/BBH": 0.3058, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.39, + "hfopenllm_v2/MMLU-PRO": 0.1128 + } + }, + { + "id": "formulae/mita-math-v2.3-2-25-2025", + "name": "mita-math-v2.3-2-25-2025", + "developer": "formulae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1373, + "hfopenllm_v2/BBH": 0.2949, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3698, + "hfopenllm_v2/MMLU-PRO": 0.1118 + } + }, + { + "id": "formulae/mita-v1-7b", + "name": "mita-v1-7b", + "developer": "formulae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1972, + "hfopenllm_v2/BBH": 0.3003, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.4152, + "hfopenllm_v2/MMLU-PRO": 0.1147 + } + }, + { + "id": "formulae/mita-v1.1-7b-2-24-2025", + "name": "mita-v1.1-7b-2-24-2025", + "developer": "formulae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3412, + "hfopenllm_v2/BBH": 0.5442, + "hfopenllm_v2/MATH Level 5": 0.435, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4557, + "hfopenllm_v2/MMLU-PRO": 0.4524 + } + }, + { + "id": "formulae/mita-v1.2-7b-2-24-2025", + "name": "mita-v1.2-7b-2-24-2025", + "developer": "formulae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2564, + "hfopenllm_v2/BBH": 0.4919, + "hfopenllm_v2/MATH Level 5": 0.4879, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4344, + "hfopenllm_v2/MMLU-PRO": 0.3359 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/frameai.json b/data/developers/frameai.json new file mode 100644 index 0000000000000000000000000000000000000000..f6cfe0eeaa937f1c2daddcc91f80312567ab07a3 --- /dev/null +++ b/data/developers/frameai.json @@ -0,0 +1,19 @@ +{ + "developer": "frameai", + "models": [ + { + "id": "frameai/Loxa-4B", + "name": "Loxa-4B", + "developer": "frameai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4765, + "hfopenllm_v2/BBH": 0.4217, + "hfopenllm_v2/MATH Level 5": 0.1095, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3377, + "hfopenllm_v2/MMLU-PRO": 0.2802 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/freewheelin.json b/data/developers/freewheelin.json new file mode 100644 index 0000000000000000000000000000000000000000..4cbcbd6d80c381c151916943a05fc6938679b59c --- /dev/null +++ b/data/developers/freewheelin.json @@ -0,0 +1,61 @@ +{ + "developer": "freewheelin", + "models": [ + { + "id": "freewheelin/free-evo-qwen72b-v0.8-re", + "name": "free-evo-qwen72b-v0.8-re", + "developer": "freewheelin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5331, + "hfopenllm_v2/BBH": 0.6127, + "hfopenllm_v2/MATH Level 5": 0.1805, + "hfopenllm_v2/GPQA": 0.3565, + "hfopenllm_v2/MUSR": 0.4872, + "hfopenllm_v2/MMLU-PRO": 0.487 + } + }, + { + "id": "freewheelin/free-solar-evo-v0.1", + "name": "free-solar-evo-v0.1", + "developer": "freewheelin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.205, + "hfopenllm_v2/BBH": 0.4502, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4946, + "hfopenllm_v2/MMLU-PRO": 0.3414 + } + }, + { + "id": "freewheelin/free-solar-evo-v0.11", + "name": "free-solar-evo-v0.11", + "developer": "freewheelin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2027, + "hfopenllm_v2/BBH": 0.4545, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.5052, + "hfopenllm_v2/MMLU-PRO": 0.3467 + } + }, + { + "id": "freewheelin/free-solar-evo-v0.13", + "name": "free-solar-evo-v0.13", + "developer": "freewheelin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2321, + "hfopenllm_v2/BBH": 0.4555, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.5052, + "hfopenllm_v2/MMLU-PRO": 0.347 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/fulim.json b/data/developers/fulim.json new file mode 100644 index 0000000000000000000000000000000000000000..11f6fa5440952305b05a189555b1b36a11ed3c83 --- /dev/null +++ b/data/developers/fulim.json @@ -0,0 +1,19 @@ +{ + "developer": "fulim", + "models": [ + { + "id": "fulim/FineLlama-3.1-8B", + "name": "FineLlama-3.1-8B", + "developer": "fulim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1439, + "hfopenllm_v2/BBH": 0.4569, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3867, + "hfopenllm_v2/MMLU-PRO": 0.3167 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/gabrielmbmb.json b/data/developers/gabrielmbmb.json new file mode 100644 index 0000000000000000000000000000000000000000..dacefab277230c321eb5a6384611464f503150d3 --- /dev/null +++ b/data/developers/gabrielmbmb.json @@ -0,0 +1,19 @@ +{ + "developer": "gabrielmbmb", + "models": [ + { + "id": "gabrielmbmb/SmolLM-1.7B-Instruct-IFEval", + "name": "SmolLM-1.7B-Instruct-IFEval", + "developer": "gabrielmbmb", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2306, + "hfopenllm_v2/BBH": 0.3138, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3328, + "hfopenllm_v2/MMLU-PRO": 0.1156 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/gaverfraxz.json b/data/developers/gaverfraxz.json new file mode 100644 index 0000000000000000000000000000000000000000..0436f043f2d80f7a296880acd9db459d95075740 --- /dev/null +++ b/data/developers/gaverfraxz.json @@ -0,0 +1,33 @@ +{ + "developer": "gaverfraxz", + "models": [ + { + "id": "gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA", + "name": "Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA", + "developer": "gaverfraxz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4009, + "hfopenllm_v2/BBH": 0.3985, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.365, + "hfopenllm_v2/MMLU-PRO": 0.1654 + } + }, + { + "id": "gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES", + "name": "Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES", + "developer": "gaverfraxz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4551, + "hfopenllm_v2/BBH": 0.5044, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3738, + "hfopenllm_v2/MMLU-PRO": 0.3679 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/gbueno86.json b/data/developers/gbueno86.json new file mode 100644 index 0000000000000000000000000000000000000000..5345de43e2ec85769193d3959b03a9fb7cc620e8 --- /dev/null +++ b/data/developers/gbueno86.json @@ -0,0 +1,33 @@ +{ + "developer": "gbueno86", + "models": [ + { + "id": "gbueno86/Brinebreath-Llama-3.1-70B", + "name": "Brinebreath-Llama-3.1-70B", + "developer": "gbueno86", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5533, + "hfopenllm_v2/BBH": 0.6881, + "hfopenllm_v2/MATH Level 5": 0.2976, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.4541, + "hfopenllm_v2/MMLU-PRO": 0.5196 + } + }, + { + "id": "gbueno86/Meta-LLama-3-Cat-Smaug-LLama-70b", + "name": "Meta-LLama-3-Cat-Smaug-LLama-70b", + "developer": "gbueno86", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8072, + "hfopenllm_v2/BBH": 0.6674, + "hfopenllm_v2/MATH Level 5": 0.2938, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.4368, + "hfopenllm_v2/MMLU-PRO": 0.5075 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/general-preference.json b/data/developers/general-preference.json new file mode 100644 index 0000000000000000000000000000000000000000..e991453787cea41b4dba4804ca8afcc0731b4110 --- /dev/null +++ b/data/developers/general-preference.json @@ -0,0 +1,31 @@ +{ + "developer": "general-preference", + "models": [ + { + "id": "general-preference/GPM-Gemma-2B", + "name": "general-preference/GPM-Gemma-2B", + "developer": "general-preference", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7449, + "reward-bench/Chat": 0.7151, + "reward-bench/Chat Hard": 0.6974, + "reward-bench/Safety": 0.8122, + "reward-bench/Reasoning": 0.755 + } + }, + { + "id": "general-preference/GPM-Llama-3.1-8B", + "name": "general-preference/GPM-Llama-3.1-8B", + "developer": "general-preference", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9224, + "reward-bench/Chat": 0.933, + "reward-bench/Chat Hard": 0.886, + "reward-bench/Safety": 0.9108, + "reward-bench/Reasoning": 0.9597 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ghost-x.json b/data/developers/ghost-x.json new file mode 100644 index 0000000000000000000000000000000000000000..d4ed1cc45f217e142b7d757d09fda45f95c70952 --- /dev/null +++ b/data/developers/ghost-x.json @@ -0,0 +1,19 @@ +{ + "developer": "ghost-x", + "models": [ + { + "id": "ghost-x/ghost-8b-beta-1608", + "name": "ghost-8b-beta-1608", + "developer": "ghost-x", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4273, + "hfopenllm_v2/BBH": 0.4517, + "hfopenllm_v2/MATH Level 5": 0.0695, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3516, + "hfopenllm_v2/MMLU-PRO": 0.284 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/glaiveai.json b/data/developers/glaiveai.json new file mode 100644 index 0000000000000000000000000000000000000000..fb54193fe0bd8cc8bd4fe5c9e80f2c0b40da388a --- /dev/null +++ b/data/developers/glaiveai.json @@ -0,0 +1,19 @@ +{ + "developer": "glaiveai", + "models": [ + { + "id": "glaiveai/Reflection-Llama-3.1-70B", + "name": "Reflection-Llama-3.1-70B", + "developer": "glaiveai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5991, + "hfopenllm_v2/BBH": 0.5681, + "hfopenllm_v2/MATH Level 5": 0.2757, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.438, + "hfopenllm_v2/MMLU-PRO": 0.6341 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/gmonsoon.json b/data/developers/gmonsoon.json new file mode 100644 index 0000000000000000000000000000000000000000..92f7e6f70cb598798b7960f00689d501f1387128 --- /dev/null +++ b/data/developers/gmonsoon.json @@ -0,0 +1,75 @@ +{ + "developer": "gmonsoon", + "models": [ + { + "id": "gmonsoon/SahabatAI-Llama-11B-Test", + "name": "SahabatAI-Llama-11B-Test", + "developer": "gmonsoon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3376, + "hfopenllm_v2/BBH": 0.4728, + "hfopenllm_v2/MATH Level 5": 0.031, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4001, + "hfopenllm_v2/MMLU-PRO": 0.3182 + } + }, + { + "id": "gmonsoon/SahabatAI-MediChatIndo-8B-v1", + "name": "SahabatAI-MediChatIndo-8B-v1", + "developer": "gmonsoon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4163, + "hfopenllm_v2/BBH": 0.4509, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3754, + "hfopenllm_v2/MMLU-PRO": 0.3108 + } + }, + { + "id": "gmonsoon/SahabatAI-Rebase-8B-Test", + "name": "SahabatAI-Rebase-8B-Test", + "developer": "gmonsoon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5156, + "hfopenllm_v2/BBH": 0.523, + "hfopenllm_v2/MATH Level 5": 0.1148, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.4133, + "hfopenllm_v2/MMLU-PRO": 0.3664 + } + }, + { + "id": "gmonsoon/StockSeaLLMs-7B-v1", + "name": "StockSeaLLMs-7B-v1", + "developer": "gmonsoon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4599, + "hfopenllm_v2/BBH": 0.5271, + "hfopenllm_v2/MATH Level 5": 0.1964, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4214, + "hfopenllm_v2/MMLU-PRO": 0.3952 + } + }, + { + "id": "gmonsoon/gemma2-9b-sahabatai-v1-instruct-BaseTIES", + "name": "gemma2-9b-sahabatai-v1-instruct-BaseTIES", + "developer": "gmonsoon", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7378, + "hfopenllm_v2/BBH": 0.6077, + "hfopenllm_v2/MATH Level 5": 0.1994, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4778, + "hfopenllm_v2/MMLU-PRO": 0.4347 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/godlikehhd.json b/data/developers/godlikehhd.json new file mode 100644 index 0000000000000000000000000000000000000000..3c08839703f026452326e3a7d0fcccd609482c94 --- /dev/null +++ b/data/developers/godlikehhd.json @@ -0,0 +1,369 @@ +{ + "developer": "godlikehhd", + "models": [ + { + "id": "godlikehhd/alpaca_data_full_2", + "name": "alpaca_data_full_2", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3178, + "hfopenllm_v2/BBH": 0.4217, + "hfopenllm_v2/MATH Level 5": 0.0929, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4052, + "hfopenllm_v2/MMLU-PRO": 0.2854 + } + }, + { + "id": "godlikehhd/alpaca_data_full_3B", + "name": "alpaca_data_full_3B", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3696, + "hfopenllm_v2/BBH": 0.4684, + "hfopenllm_v2/MATH Level 5": 0.1337, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4955, + "hfopenllm_v2/MMLU-PRO": 0.3357 + } + }, + { + "id": "godlikehhd/alpaca_data_ifd_max_2600", + "name": "alpaca_data_ifd_max_2600", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3043, + "hfopenllm_v2/BBH": 0.4029, + "hfopenllm_v2/MATH Level 5": 0.0989, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.3509, + "hfopenllm_v2/MMLU-PRO": 0.2916 + } + }, + { + "id": "godlikehhd/alpaca_data_ifd_max_2600_3B", + "name": "alpaca_data_ifd_max_2600_3B", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2982, + "hfopenllm_v2/BBH": 0.4626, + "hfopenllm_v2/MATH Level 5": 0.1594, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.4346, + "hfopenllm_v2/MMLU-PRO": 0.3288 + } + }, + { + "id": "godlikehhd/alpaca_data_ifd_me_max_5200", + "name": "alpaca_data_ifd_me_max_5200", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3683, + "hfopenllm_v2/BBH": 0.4153, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.3483, + "hfopenllm_v2/MMLU-PRO": 0.2982 + } + }, + { + "id": "godlikehhd/alpaca_data_ifd_min_2600", + "name": "alpaca_data_ifd_min_2600", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.375, + "hfopenllm_v2/BBH": 0.4219, + "hfopenllm_v2/MATH Level 5": 0.0967, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3656, + "hfopenllm_v2/MMLU-PRO": 0.2893 + } + }, + { + "id": "godlikehhd/alpaca_data_ins_ans_max_5200", + "name": "alpaca_data_ins_ans_max_5200", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3479, + "hfopenllm_v2/BBH": 0.4098, + "hfopenllm_v2/MATH Level 5": 0.1027, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.3602, + "hfopenllm_v2/MMLU-PRO": 0.2901 + } + }, + { + "id": "godlikehhd/alpaca_data_ins_max_5200", + "name": "alpaca_data_ins_max_5200", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3275, + "hfopenllm_v2/BBH": 0.4155, + "hfopenllm_v2/MATH Level 5": 0.0997, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.3614, + "hfopenllm_v2/MMLU-PRO": 0.2916 + } + }, + { + "id": "godlikehhd/alpaca_data_ins_min_2600", + "name": "alpaca_data_ins_min_2600", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.333, + "hfopenllm_v2/BBH": 0.4187, + "hfopenllm_v2/MATH Level 5": 0.111, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.3853, + "hfopenllm_v2/MMLU-PRO": 0.288 + } + }, + { + "id": "godlikehhd/alpaca_data_ins_min_5200", + "name": "alpaca_data_ins_min_5200", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.336, + "hfopenllm_v2/BBH": 0.4289, + "hfopenllm_v2/MATH Level 5": 0.1035, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.3906, + "hfopenllm_v2/MMLU-PRO": 0.2949 + } + }, + { + "id": "godlikehhd/alpaca_data_sampled_ifd_5200", + "name": "alpaca_data_sampled_ifd_5200", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2924, + "hfopenllm_v2/BBH": 0.4033, + "hfopenllm_v2/MATH Level 5": 0.1254, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.3521, + "hfopenllm_v2/MMLU-PRO": 0.2896 + } + }, + { + "id": "godlikehhd/alpaca_data_sampled_ifd_new_5200", + "name": "alpaca_data_sampled_ifd_new_5200", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3663, + "hfopenllm_v2/BBH": 0.4178, + "hfopenllm_v2/MATH Level 5": 0.0944, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.3613, + "hfopenllm_v2/MMLU-PRO": 0.2925 + } + }, + { + "id": "godlikehhd/alpaca_data_score_max_0.1_2600", + "name": "alpaca_data_score_max_0.1_2600", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3288, + "hfopenllm_v2/BBH": 0.4252, + "hfopenllm_v2/MATH Level 5": 0.0989, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.3706, + "hfopenllm_v2/MMLU-PRO": 0.2923 + } + }, + { + "id": "godlikehhd/alpaca_data_score_max_0.3_2600", + "name": "alpaca_data_score_max_0.3_2600", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3375, + "hfopenllm_v2/BBH": 0.4151, + "hfopenllm_v2/MATH Level 5": 0.1035, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3759, + "hfopenllm_v2/MMLU-PRO": 0.2913 + } + }, + { + "id": "godlikehhd/alpaca_data_score_max_0.7_2600", + "name": "alpaca_data_score_max_0.7_2600", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.364, + "hfopenllm_v2/BBH": 0.4185, + "hfopenllm_v2/MATH Level 5": 0.1073, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.3469, + "hfopenllm_v2/MMLU-PRO": 0.2983 + } + }, + { + "id": "godlikehhd/alpaca_data_score_max_2500", + "name": "alpaca_data_score_max_2500", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3564, + "hfopenllm_v2/BBH": 0.418, + "hfopenllm_v2/MATH Level 5": 0.0952, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.3627, + "hfopenllm_v2/MMLU-PRO": 0.294 + } + }, + { + "id": "godlikehhd/alpaca_data_score_max_2600_3B", + "name": "alpaca_data_score_max_2600_3B", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3358, + "hfopenllm_v2/BBH": 0.4716, + "hfopenllm_v2/MATH Level 5": 0.1548, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.4474, + "hfopenllm_v2/MMLU-PRO": 0.3342 + } + }, + { + "id": "godlikehhd/alpaca_data_score_max_5200", + "name": "alpaca_data_score_max_5200", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3445, + "hfopenllm_v2/BBH": 0.4242, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.3878, + "hfopenllm_v2/MMLU-PRO": 0.2945 + } + }, + { + "id": "godlikehhd/ifd_2500_qwen", + "name": "ifd_2500_qwen", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3365, + "hfopenllm_v2/BBH": 0.4298, + "hfopenllm_v2/MATH Level 5": 0.0982, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.3615, + "hfopenllm_v2/MMLU-PRO": 0.2921 + } + }, + { + "id": "godlikehhd/ifd_new_correct_all_sample_2500_qwen", + "name": "ifd_new_correct_all_sample_2500_qwen", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3376, + "hfopenllm_v2/BBH": 0.402, + "hfopenllm_v2/MATH Level 5": 0.0959, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3562, + "hfopenllm_v2/MMLU-PRO": 0.2889 + } + }, + { + "id": "godlikehhd/ifd_new_correct_sample_2500_qwen", + "name": "ifd_new_correct_sample_2500_qwen", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3397, + "hfopenllm_v2/BBH": 0.411, + "hfopenllm_v2/MATH Level 5": 0.1042, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.3627, + "hfopenllm_v2/MMLU-PRO": 0.2932 + } + }, + { + "id": "godlikehhd/ifd_new_qwen_2500", + "name": "ifd_new_qwen_2500", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.324, + "hfopenllm_v2/BBH": 0.416, + "hfopenllm_v2/MATH Level 5": 0.1118, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.359, + "hfopenllm_v2/MMLU-PRO": 0.2911 + } + }, + { + "id": "godlikehhd/qwen-2.5-1.5b-cherry", + "name": "qwen-2.5-1.5b-cherry", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2893, + "hfopenllm_v2/BBH": 0.4036, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.3456, + "hfopenllm_v2/MMLU-PRO": 0.2923 + } + }, + { + "id": "godlikehhd/qwen_2.5-1.5b-cherry_new", + "name": "qwen_2.5-1.5b-cherry_new", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.312, + "hfopenllm_v2/BBH": 0.415, + "hfopenllm_v2/MATH Level 5": 0.0967, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.3496, + "hfopenllm_v2/MMLU-PRO": 0.2894 + } + }, + { + "id": "godlikehhd/qwen_full_data_alpaca", + "name": "qwen_full_data_alpaca", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3136, + "hfopenllm_v2/BBH": 0.4229, + "hfopenllm_v2/MATH Level 5": 0.0921, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4052, + "hfopenllm_v2/MMLU-PRO": 0.2851 + } + }, + { + "id": "godlikehhd/qwen_ins_ans_2500", + "name": "qwen_ins_ans_2500", + "developer": "godlikehhd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2698, + "hfopenllm_v2/BBH": 0.4074, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3589, + "hfopenllm_v2/MMLU-PRO": 0.2809 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/google.json b/data/developers/google.json new file mode 100644 index 0000000000000000000000000000000000000000..516a0117d41131ccc1cc975646218628591986ec --- /dev/null +++ b/data/developers/google.json @@ -0,0 +1,1555 @@ +{ + "developer": "google", + "models": [ + { + "id": "google/Gemini 2.5 Flash", + "name": "Gemini 2.5 Flash", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "ace/Overall Score": 0.38, + "ace/Gaming Score": 0.284, + "apex-v1/Overall Score": 0.604 + } + }, + { + "id": "google/Gemini 2.5 Pro", + "name": "Gemini 2.5 Pro", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "ace/Overall Score": 0.4, + "ace/Gaming Score": 0.285 + } + }, + { + "id": "google/Gemini 3 Flash", + "name": "Gemini 3 Flash", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Overall Pass@1": 0.24, + "apex-agents/Overall Pass@8": 0.367, + "apex-agents/Overall Mean Score": 0.395, + "apex-agents/Investment Banking Pass@1": 0.267, + "apex-agents/Management Consulting Pass@1": 0.193, + "apex-agents/Corporate Law Pass@1": 0.259, + "apex-agents/Corporate Lawyer Mean Score": 0.524, + "ace/Gaming Score": 0.415, + "apex-v1/Overall Score": 0.64, + "apex-v1/Consulting Score": 0.64 + } + }, + { + "id": "google/Gemini 3 Pro", + "name": "Gemini 3 Pro", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "ace/Overall Score": 0.47, + "ace/Gaming Score": 0.509, + "apex-agents/Overall Pass@1": 0.184, + "apex-agents/Overall Pass@8": 0.373, + "apex-agents/Overall Mean Score": 0.341, + "apex-agents/Investment Banking Pass@1": 0.188, + "apex-agents/Management Consulting Pass@1": 0.124, + "apex-agents/Corporate Law Pass@1": 0.239, + "apex-agents/Corporate Lawyer Mean Score": 0.487, + "apex-v1/Overall Score": 0.643, + "apex-v1/Consulting Score": 0.64, + "apex-v1/Investment Banking Score": 0.63 + } + }, + { + "id": "google/Gemini 3.1 Pro", + "name": "Gemini 3.1 Pro", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Overall Pass@1": 0.335, + "apex-agents/Corporate Lawyer Mean Score": 0.494 + } + }, + { + "id": "google/Palmyra-X-43B", + "name": "Palmyra X 43B", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.732, + "helm_classic/MMLU": 0.609, + "helm_classic/BoolQ": 0.896, + "helm_classic/NarrativeQA": 0.742, + "helm_classic/NaturalQuestions (open-book)": -1.0, + "helm_classic/QuAC": 0.473, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.616, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": 0.049, + "helm_classic/XSUM": 0.149, + "helm_classic/IMDB": 0.935, + "helm_classic/CivilComments": 0.008, + "helm_classic/RAFT": 0.701 + } + }, + { + "id": "google/T5-11B", + "name": "T5 11B", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.131, + "helm_classic/MMLU": 0.29, + "helm_classic/BoolQ": 0.761, + "helm_classic/NarrativeQA": 0.086, + "helm_classic/NaturalQuestions (open-book)": 0.477, + "helm_classic/QuAC": 0.116, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.133, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": 0.043, + "helm_classic/XSUM": 0.015, + "helm_classic/IMDB": 0.379, + "helm_classic/CivilComments": 0.509, + "helm_classic/RAFT": 0.37 + } + }, + { + "id": "google/UL2-20B", + "name": "UL2 20B", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.167, + "helm_classic/MMLU": 0.291, + "helm_classic/BoolQ": 0.746, + "helm_classic/NarrativeQA": 0.083, + "helm_classic/NaturalQuestions (open-book)": 0.349, + "helm_classic/QuAC": 0.144, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.193, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": 0.03, + "helm_classic/XSUM": 0.058, + "helm_classic/IMDB": 0.337, + "helm_classic/CivilComments": 0.521, + "helm_classic/RAFT": 0.404 + } + }, + { + "id": "google/codegemma-1.1-2b", + "name": "codegemma-1.1-2b", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2294, + "hfopenllm_v2/BBH": 0.3353, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3871, + "hfopenllm_v2/MMLU-PRO": 0.1278 + } + }, + { + "id": "google/flame-1.0-24B-july-2024", + "name": "google/flame-1.0-24B-july-2024", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8781, + "reward-bench/Chat": 0.9218, + "reward-bench/Chat Hard": 0.7566, + "reward-bench/Safety": 0.8959, + "reward-bench/Reasoning": 0.938 + } + }, + { + "id": "google/flan-t5-base", + "name": "flan-t5-base", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1891, + "hfopenllm_v2/BBH": 0.3526, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2383, + "hfopenllm_v2/MUSR": 0.3671, + "hfopenllm_v2/MMLU-PRO": 0.1357 + } + }, + { + "id": "google/flan-t5-large", + "name": "flan-t5-large", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2201, + "hfopenllm_v2/BBH": 0.4153, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.4083, + "hfopenllm_v2/MMLU-PRO": 0.1709 + } + }, + { + "id": "google/flan-t5-small", + "name": "flan-t5-small", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1524, + "hfopenllm_v2/BBH": 0.3283, + "hfopenllm_v2/MATH Level 5": 0.0076, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.4123, + "hfopenllm_v2/MMLU-PRO": 0.1233 + } + }, + { + "id": "google/flan-t5-xl", + "name": "flan-t5-xl", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2207, + "hfopenllm_v2/BBH": 0.4537, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.422, + "hfopenllm_v2/MMLU-PRO": 0.2142 + } + }, + { + "id": "google/flan-t5-xxl", + "name": "flan-t5-xxl", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.22, + "hfopenllm_v2/BBH": 0.5066, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.4218, + "hfopenllm_v2/MMLU-PRO": 0.2343 + } + }, + { + "id": "google/flan-ul2", + "name": "flan-ul2", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2393, + "hfopenllm_v2/BBH": 0.5054, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3844, + "hfopenllm_v2/MMLU-PRO": 0.2493 + } + }, + { + "id": "google/gemini-1.0-pro-001", + "name": "Gemini 1.0 Pro 001", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_mmlu/MMLU All Subjects": 0.7, + "helm_mmlu/Abstract Algebra": 0.34, + "helm_mmlu/Anatomy": 0.652, + "helm_mmlu/College Physics": 0.333, + "helm_mmlu/Computer Security": 0.84, + "helm_mmlu/Econometrics": 0.553, + "helm_mmlu/Global Facts": 0.49, + "helm_mmlu/Jurisprudence": 0.861, + "helm_mmlu/Philosophy": 0.762, + "helm_mmlu/Professional Psychology": 0.752, + "helm_mmlu/Us Foreign Policy": 0.89, + "helm_mmlu/Astronomy": 0.796, + "helm_mmlu/Business Ethics": 0.69, + "helm_mmlu/Clinical Knowledge": 0.758, + "helm_mmlu/Conceptual Physics": 0.706, + "helm_mmlu/Electrical Engineering": 0.69, + "helm_mmlu/Elementary Mathematics": 0.476, + "helm_mmlu/Formal Logic": 0.468, + "helm_mmlu/High School World History": 0.865, + "helm_mmlu/Human Sexuality": 0.618, + "helm_mmlu/International Law": 0.876, + "helm_mmlu/Logical Fallacies": 0.804, + "helm_mmlu/Machine Learning": 0.527, + "helm_mmlu/Management": 0.845, + "helm_mmlu/Marketing": 0.91, + "helm_mmlu/Medical Genetics": 0.8, + "helm_mmlu/Miscellaneous": 0.851, + "helm_mmlu/Moral Scenarios": 0.46, + "helm_mmlu/Nutrition": 0.788, + "helm_mmlu/Prehistory": 0.802, + "helm_mmlu/Public Relations": 0.691, + "helm_mmlu/Security Studies": 0.804, + "helm_mmlu/Sociology": 0.9, + "helm_mmlu/Virology": 0.536, + "helm_mmlu/World Religions": 0.86, + "helm_mmlu/Mean win rate": 0.677 + } + }, + { + "id": "google/gemini-1.0-pro-002", + "name": "Gemini 1.0 Pro 002", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.422, + "helm_lite/NarrativeQA": 0.751, + "helm_lite/NaturalQuestions (closed-book)": 0.391, + "helm_lite/OpenbookQA": 0.788, + "helm_lite/MMLU": 0.534, + "helm_lite/MATH": 0.665, + "helm_lite/GSM8K": 0.816, + "helm_lite/LegalBench": 0.475, + "helm_lite/MedQA": 0.483, + "helm_lite/WMT 2014": 0.194 + } + }, + { + "id": "google/gemini-1.5-flash-001", + "name": "Gemini 1.5 Flash 001", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.667, + "helm_lite/NarrativeQA": 0.783, + "helm_lite/NaturalQuestions (closed-book)": 0.332, + "helm_lite/OpenbookQA": 0.928, + "helm_lite/MMLU": 0.703, + "helm_lite/MATH": 0.753, + "helm_lite/GSM8K": 0.785, + "helm_lite/LegalBench": 0.661, + "helm_lite/MedQA": 0.68, + "helm_lite/WMT 2014": 0.225, + "helm_mmlu/MMLU All Subjects": 0.779, + "helm_mmlu/Abstract Algebra": 0.58, + "helm_mmlu/Anatomy": 0.8, + "helm_mmlu/College Physics": 0.696, + "helm_mmlu/Computer Security": 0.79, + "helm_mmlu/Econometrics": 0.614, + "helm_mmlu/Global Facts": 0.53, + "helm_mmlu/Jurisprudence": 0.889, + "helm_mmlu/Philosophy": 0.791, + "helm_mmlu/Professional Psychology": 0.828, + "helm_mmlu/Us Foreign Policy": 0.93, + "helm_mmlu/Astronomy": 0.882, + "helm_mmlu/Business Ethics": 0.81, + "helm_mmlu/Clinical Knowledge": 0.834, + "helm_mmlu/Conceptual Physics": 0.851, + "helm_mmlu/Electrical Engineering": 0.8, + "helm_mmlu/Elementary Mathematics": 0.754, + "helm_mmlu/Formal Logic": 0.627, + "helm_mmlu/High School World History": 0.907, + "helm_mmlu/Human Sexuality": 0.374, + "helm_mmlu/International Law": 0.901, + "helm_mmlu/Logical Fallacies": 0.853, + "helm_mmlu/Machine Learning": 0.571, + "helm_mmlu/Management": 0.864, + "helm_mmlu/Marketing": 0.94, + "helm_mmlu/Medical Genetics": 0.86, + "helm_mmlu/Miscellaneous": 0.886, + "helm_mmlu/Moral Scenarios": 0.637, + "helm_mmlu/Nutrition": 0.82, + "helm_mmlu/Prehistory": 0.867, + "helm_mmlu/Public Relations": 0.764, + "helm_mmlu/Security Studies": 0.808, + "helm_mmlu/Sociology": 0.915, + "helm_mmlu/Virology": 0.566, + "helm_mmlu/World Religions": 0.883, + "helm_mmlu/Mean win rate": 0.47, + "reward-bench/Score": 0.8054, + "reward-bench/Chat": 0.9218, + "reward-bench/Chat Hard": 0.6349, + "reward-bench/Safety": 0.8696, + "reward-bench/Reasoning": 0.8512, + "reward-bench/Prior Sets (0.5 weight)": 0.6937 + } + }, + { + "id": "google/gemini-1.5-flash-002", + "name": "Gemini 1.5 Flash 002", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.609, + "helm_capabilities/MMLU-Pro": 0.678, + "helm_capabilities/GPQA": 0.437, + "helm_capabilities/IFEval": 0.831, + "helm_capabilities/WildBench": 0.792, + "helm_capabilities/Omni-MATH": 0.305, + "helm_lite/Mean win rate": 0.573, + "helm_lite/NarrativeQA": 0.746, + "helm_lite/NaturalQuestions (closed-book)": 0.323, + "helm_lite/OpenbookQA": 0.914, + "helm_lite/MMLU": 0.679, + "helm_lite/MATH": 0.908, + "helm_lite/GSM8K": 0.328, + "helm_lite/LegalBench": 0.67, + "helm_lite/MedQA": 0.656, + "helm_lite/WMT 2014": 0.212, + "helm_mmlu/MMLU All Subjects": 0.739, + "helm_mmlu/Abstract Algebra": 0.63, + "helm_mmlu/Anatomy": 0.793, + "helm_mmlu/College Physics": 0.637, + "helm_mmlu/Computer Security": 0.72, + "helm_mmlu/Econometrics": 0.675, + "helm_mmlu/Global Facts": 0.47, + "helm_mmlu/Jurisprudence": 0.852, + "helm_mmlu/Philosophy": 0.797, + "helm_mmlu/Professional Psychology": 0.806, + "helm_mmlu/Us Foreign Policy": 0.81, + "helm_mmlu/Astronomy": 0.895, + "helm_mmlu/Business Ethics": 0.27, + "helm_mmlu/Clinical Knowledge": 0.792, + "helm_mmlu/Conceptual Physics": 0.851, + "helm_mmlu/Electrical Engineering": 0.772, + "helm_mmlu/Elementary Mathematics": 0.704, + "helm_mmlu/Formal Logic": 0.595, + "helm_mmlu/High School World History": 0.869, + "helm_mmlu/Human Sexuality": 0.847, + "helm_mmlu/International Law": 0.752, + "helm_mmlu/Logical Fallacies": 0.859, + "helm_mmlu/Machine Learning": 0.616, + "helm_mmlu/Management": 0.893, + "helm_mmlu/Marketing": 0.953, + "helm_mmlu/Medical Genetics": 0.89, + "helm_mmlu/Miscellaneous": 0.9, + "helm_mmlu/Moral Scenarios": 0.676, + "helm_mmlu/Nutrition": 0.588, + "helm_mmlu/Prehistory": 0.762, + "helm_mmlu/Public Relations": 0.7, + "helm_mmlu/Security Studies": 0.547, + "helm_mmlu/Sociology": 0.851, + "helm_mmlu/Virology": 0.524, + "helm_mmlu/World Religions": 0.865, + "helm_mmlu/Mean win rate": 0.817 + } + }, + { + "id": "google/gemini-1.5-flash-8b", + "name": "google/gemini-1.5-flash-8b", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.4851, + "reward-bench/Factuality": 0.4611, + "reward-bench/Precise IF": 0.3625, + "reward-bench/Math": 0.5082, + "reward-bench/Safety": 0.6622, + "reward-bench/Focus": 0.6747, + "reward-bench/Ties": 0.2421 + } + }, + { + "id": "google/gemini-1.5-flash-preview-0514", + "name": "Gemini 1.5 Flash 0514 preview", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_mmlu/MMLU All Subjects": 0.778, + "helm_mmlu/Abstract Algebra": 0.56, + "helm_mmlu/Anatomy": 0.807, + "helm_mmlu/College Physics": 0.667, + "helm_mmlu/Computer Security": 0.77, + "helm_mmlu/Econometrics": 0.64, + "helm_mmlu/Global Facts": 0.55, + "helm_mmlu/Jurisprudence": 0.889, + "helm_mmlu/Philosophy": 0.807, + "helm_mmlu/Professional Psychology": 0.825, + "helm_mmlu/Us Foreign Policy": 0.93, + "helm_mmlu/Astronomy": 0.868, + "helm_mmlu/Business Ethics": 0.82, + "helm_mmlu/Clinical Knowledge": 0.838, + "helm_mmlu/Conceptual Physics": 0.855, + "helm_mmlu/Electrical Engineering": 0.814, + "helm_mmlu/Elementary Mathematics": 0.778, + "helm_mmlu/Formal Logic": 0.611, + "helm_mmlu/High School World History": 0.907, + "helm_mmlu/Human Sexuality": 0.374, + "helm_mmlu/International Law": 0.876, + "helm_mmlu/Logical Fallacies": 0.853, + "helm_mmlu/Machine Learning": 0.562, + "helm_mmlu/Management": 0.854, + "helm_mmlu/Marketing": 0.936, + "helm_mmlu/Medical Genetics": 0.86, + "helm_mmlu/Miscellaneous": 0.884, + "helm_mmlu/Moral Scenarios": 0.631, + "helm_mmlu/Nutrition": 0.801, + "helm_mmlu/Prehistory": 0.867, + "helm_mmlu/Public Relations": 0.773, + "helm_mmlu/Security Studies": 0.812, + "helm_mmlu/Sociology": 0.9, + "helm_mmlu/Virology": 0.566, + "helm_mmlu/World Religions": 0.871, + "helm_mmlu/Mean win rate": 0.713 + } + }, + { + "id": "google/gemini-1.5-pro-001", + "name": "Gemini 1.5 Pro 001", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.739, + "helm_lite/NarrativeQA": 0.783, + "helm_lite/NaturalQuestions (closed-book)": 0.378, + "helm_lite/OpenbookQA": 0.902, + "helm_lite/MMLU": 0.772, + "helm_lite/MATH": 0.825, + "helm_lite/GSM8K": 0.836, + "helm_lite/LegalBench": 0.757, + "helm_lite/MedQA": 0.692, + "helm_lite/WMT 2014": 0.189, + "helm_mmlu/MMLU All Subjects": 0.827, + "helm_mmlu/Abstract Algebra": 0.75, + "helm_mmlu/Anatomy": 0.83, + "helm_mmlu/College Physics": 0.745, + "helm_mmlu/Computer Security": 0.83, + "helm_mmlu/Econometrics": 0.728, + "helm_mmlu/Global Facts": 0.66, + "helm_mmlu/Jurisprudence": 0.889, + "helm_mmlu/Philosophy": 0.871, + "helm_mmlu/Professional Psychology": 0.894, + "helm_mmlu/Us Foreign Policy": 0.93, + "helm_mmlu/Astronomy": 0.914, + "helm_mmlu/Business Ethics": 0.8, + "helm_mmlu/Clinical Knowledge": 0.853, + "helm_mmlu/Conceptual Physics": 0.949, + "helm_mmlu/Electrical Engineering": 0.745, + "helm_mmlu/Elementary Mathematics": 0.939, + "helm_mmlu/Formal Logic": 0.706, + "helm_mmlu/High School World History": 0.924, + "helm_mmlu/Human Sexuality": 0.374, + "helm_mmlu/International Law": 0.917, + "helm_mmlu/Logical Fallacies": 0.896, + "helm_mmlu/Machine Learning": 0.652, + "helm_mmlu/Management": 0.922, + "helm_mmlu/Marketing": 0.932, + "helm_mmlu/Medical Genetics": 0.91, + "helm_mmlu/Miscellaneous": 0.958, + "helm_mmlu/Moral Scenarios": 0.739, + "helm_mmlu/Nutrition": 0.879, + "helm_mmlu/Prehistory": 0.87, + "helm_mmlu/Public Relations": 0.818, + "helm_mmlu/Security Studies": 0.873, + "helm_mmlu/Sociology": 0.92, + "helm_mmlu/Virology": 0.554, + "helm_mmlu/World Religions": 0.854, + "helm_mmlu/Mean win rate": 0.349 + } + }, + { + "id": "google/gemini-1.5-pro-002", + "name": "Gemini 1.5 Pro 002", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.657, + "helm_capabilities/MMLU-Pro": 0.737, + "helm_capabilities/GPQA": 0.534, + "helm_capabilities/IFEval": 0.837, + "helm_capabilities/WildBench": 0.813, + "helm_capabilities/Omni-MATH": 0.364, + "helm_lite/Mean win rate": 0.842, + "helm_lite/NarrativeQA": 0.756, + "helm_lite/NaturalQuestions (closed-book)": 0.455, + "helm_lite/OpenbookQA": 0.952, + "helm_lite/MMLU": 0.795, + "helm_lite/MATH": 0.92, + "helm_lite/GSM8K": 0.817, + "helm_lite/LegalBench": 0.747, + "helm_lite/MedQA": 0.771, + "helm_lite/WMT 2014": 0.231, + "helm_mmlu/MMLU All Subjects": 0.869, + "helm_mmlu/Abstract Algebra": 0.82, + "helm_mmlu/Anatomy": 0.83, + "helm_mmlu/College Physics": 0.863, + "helm_mmlu/Computer Security": 0.85, + "helm_mmlu/Econometrics": 0.693, + "helm_mmlu/Global Facts": 0.77, + "helm_mmlu/Jurisprudence": 0.898, + "helm_mmlu/Philosophy": 0.887, + "helm_mmlu/Professional Psychology": 0.912, + "helm_mmlu/Us Foreign Policy": 0.94, + "helm_mmlu/Astronomy": 0.934, + "helm_mmlu/Business Ethics": 0.84, + "helm_mmlu/Clinical Knowledge": 0.906, + "helm_mmlu/Conceptual Physics": 0.945, + "helm_mmlu/Electrical Engineering": 0.855, + "helm_mmlu/Elementary Mathematics": 0.942, + "helm_mmlu/Formal Logic": 0.754, + "helm_mmlu/High School World History": 0.937, + "helm_mmlu/Human Sexuality": 0.878, + "helm_mmlu/International Law": 0.917, + "helm_mmlu/Logical Fallacies": 0.902, + "helm_mmlu/Machine Learning": 0.83, + "helm_mmlu/Management": 0.903, + "helm_mmlu/Marketing": 0.962, + "helm_mmlu/Medical Genetics": 0.92, + "helm_mmlu/Miscellaneous": 0.959, + "helm_mmlu/Moral Scenarios": 0.792, + "helm_mmlu/Nutrition": 0.886, + "helm_mmlu/Prehistory": 0.926, + "helm_mmlu/Public Relations": 0.809, + "helm_mmlu/Security Studies": 0.857, + "helm_mmlu/Sociology": 0.95, + "helm_mmlu/Virology": 0.566, + "helm_mmlu/World Religions": 0.889, + "helm_mmlu/Mean win rate": 0.334 + } + }, + { + "id": "google/gemini-1.5-pro-0514", + "name": "google/gemini-1.5-pro-0514", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.882, + "reward-bench/Chat": 0.9232, + "reward-bench/Chat Hard": 0.8059, + "reward-bench/Safety": 0.8791, + "reward-bench/Reasoning": 0.9199 + } + }, + { + "id": "google/gemini-1.5-pro-0924", + "name": "google/gemini-1.5-pro-0924", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8678, + "reward-bench/Chat": 0.9413, + "reward-bench/Chat Hard": 0.7697, + "reward-bench/Safety": 0.8581, + "reward-bench/Reasoning": 0.9022 + } + }, + { + "id": "google/gemini-1.5-pro-preview-0409", + "name": "Gemini 1.5 Pro 0409 preview", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_mmlu/MMLU All Subjects": 0.81, + "helm_mmlu/Abstract Algebra": 0.6, + "helm_mmlu/Anatomy": 0.77, + "helm_mmlu/College Physics": 0.804, + "helm_mmlu/Computer Security": 0.81, + "helm_mmlu/Econometrics": 0.737, + "helm_mmlu/Global Facts": 0.66, + "helm_mmlu/Jurisprudence": 0.87, + "helm_mmlu/Philosophy": 0.846, + "helm_mmlu/Professional Psychology": 0.866, + "helm_mmlu/Us Foreign Policy": 0.94, + "helm_mmlu/Astronomy": 0.914, + "helm_mmlu/Business Ethics": 0.8, + "helm_mmlu/Clinical Knowledge": 0.868, + "helm_mmlu/Conceptual Physics": 0.915, + "helm_mmlu/Electrical Engineering": 0.772, + "helm_mmlu/Elementary Mathematics": 0.884, + "helm_mmlu/Formal Logic": 0.643, + "helm_mmlu/High School World History": 0.924, + "helm_mmlu/Human Sexuality": 0.397, + "helm_mmlu/International Law": 0.917, + "helm_mmlu/Logical Fallacies": 0.859, + "helm_mmlu/Machine Learning": 0.67, + "helm_mmlu/Management": 0.874, + "helm_mmlu/Marketing": 0.953, + "helm_mmlu/Medical Genetics": 0.91, + "helm_mmlu/Miscellaneous": 0.928, + "helm_mmlu/Moral Scenarios": 0.696, + "helm_mmlu/Nutrition": 0.846, + "helm_mmlu/Prehistory": 0.886, + "helm_mmlu/Public Relations": 0.755, + "helm_mmlu/Security Studies": 0.849, + "helm_mmlu/Sociology": 0.925, + "helm_mmlu/Virology": 0.584, + "helm_mmlu/World Religions": 0.877, + "helm_mmlu/Mean win rate": 0.118 + } + }, + { + "id": "google/gemini-2.0-flash-001", + "name": "Gemini 2.0 Flash", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.679, + "helm_capabilities/MMLU-Pro": 0.737, + "helm_capabilities/GPQA": 0.556, + "helm_capabilities/IFEval": 0.841, + "helm_capabilities/WildBench": 0.8, + "helm_capabilities/Omni-MATH": 0.459 + } + }, + { + "id": "google/gemini-2.0-flash-exp", + "name": "Gemini 2.0 Flash Experimental", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.813, + "helm_lite/NarrativeQA": 0.783, + "helm_lite/NaturalQuestions (closed-book)": 0.443, + "helm_lite/OpenbookQA": 0.946, + "helm_lite/MMLU": 0.717, + "helm_lite/MATH": 0.901, + "helm_lite/GSM8K": 0.946, + "helm_lite/LegalBench": 0.674, + "helm_lite/MedQA": 0.73, + "helm_lite/WMT 2014": 0.212, + "helm_mmlu/MMLU All Subjects": 0.797, + "helm_mmlu/Abstract Algebra": 0.72, + "helm_mmlu/Anatomy": 0.807, + "helm_mmlu/College Physics": 0.696, + "helm_mmlu/Computer Security": 0.83, + "helm_mmlu/Econometrics": 0.693, + "helm_mmlu/Global Facts": 0.66, + "helm_mmlu/Jurisprudence": 0.898, + "helm_mmlu/Philosophy": 0.887, + "helm_mmlu/Professional Psychology": 0.876, + "helm_mmlu/Us Foreign Policy": 0.78, + "helm_mmlu/Astronomy": 0.928, + "helm_mmlu/Business Ethics": 0.73, + "helm_mmlu/Clinical Knowledge": 0.879, + "helm_mmlu/Conceptual Physics": 0.813, + "helm_mmlu/Electrical Engineering": 0.834, + "helm_mmlu/Elementary Mathematics": 0.857, + "helm_mmlu/Formal Logic": 0.571, + "helm_mmlu/High School World History": 0.743, + "helm_mmlu/Human Sexuality": 0.901, + "helm_mmlu/International Law": 0.645, + "helm_mmlu/Logical Fallacies": 0.914, + "helm_mmlu/Machine Learning": 0.759, + "helm_mmlu/Management": 0.718, + "helm_mmlu/Marketing": 0.944, + "helm_mmlu/Medical Genetics": 0.89, + "helm_mmlu/Miscellaneous": 0.939, + "helm_mmlu/Moral Scenarios": 0.815, + "helm_mmlu/Nutrition": 0.856, + "helm_mmlu/Prehistory": 0.898, + "helm_mmlu/Public Relations": 0.791, + "helm_mmlu/Security Studies": 0.69, + "helm_mmlu/Sociology": 0.786, + "helm_mmlu/Virology": 0.554, + "helm_mmlu/World Religions": 0.731, + "helm_mmlu/Mean win rate": 0.567 + } + }, + { + "id": "google/gemini-2.0-flash-lite-preview-02-05", + "name": "Gemini 2.0 Flash Lite 02-05 preview", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.642, + "helm_capabilities/MMLU-Pro": 0.72, + "helm_capabilities/GPQA": 0.5, + "helm_capabilities/IFEval": 0.824, + "helm_capabilities/WildBench": 0.79, + "helm_capabilities/Omni-MATH": 0.374 + } + }, + { + "id": "google/gemini-2.5-flash", + "name": "gemini-2.5-flash", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.9145, + "global-mmlu-lite/Culturally Sensitive": 0.9, + "global-mmlu-lite/Culturally Agnostic": 0.9291, + "global-mmlu-lite/Arabic": 0.9125, + "global-mmlu-lite/English": 0.9325, + "global-mmlu-lite/Bengali": 0.91, + "global-mmlu-lite/German": 0.9025, + "global-mmlu-lite/French": 0.91, + "global-mmlu-lite/Hindi": 0.925, + "global-mmlu-lite/Indonesian": 0.9075, + "global-mmlu-lite/Italian": 0.9225, + "global-mmlu-lite/Japanese": 0.9125, + "global-mmlu-lite/Korean": 0.915, + "global-mmlu-lite/Portuguese": 0.9125, + "global-mmlu-lite/Spanish": 0.9175, + "global-mmlu-lite/Swahili": 0.915, + "global-mmlu-lite/Yoruba": 0.9075, + "global-mmlu-lite/Chinese": 0.915, + "global-mmlu-lite/Burmese": 0.915, + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.028169014084507043, + "livecodebenchpro/Easy Problems": 0.38028169014084506, + "reward-bench/Score": 0.7767, + "reward-bench/Factuality": 0.674, + "reward-bench/Precise IF": 0.575, + "reward-bench/Math": 0.852, + "reward-bench/Safety": 0.909, + "reward-bench/Focus": 0.841, + "reward-bench/Ties": 0.809, + "terminal-bench-2.0/terminal-bench-2.0": 15.4 + } + }, + { + "id": "google/gemini-2.5-flash-lite", + "name": "Gemini 2.5 Flash-Lite", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.591, + "helm_capabilities/MMLU-Pro": 0.537, + "helm_capabilities/GPQA": 0.309, + "helm_capabilities/IFEval": 0.81, + "helm_capabilities/WildBench": 0.818, + "helm_capabilities/Omni-MATH": 0.48 + } + }, + { + "id": "google/gemini-2.5-flash-preview-04-17", + "name": "Gemini 2.5 Flash 04-17 preview", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.626, + "helm_capabilities/MMLU-Pro": 0.639, + "helm_capabilities/GPQA": 0.39, + "helm_capabilities/IFEval": 0.898, + "helm_capabilities/WildBench": 0.817, + "helm_capabilities/Omni-MATH": 0.384, + "reward-bench/Score": 0.7721, + "reward-bench/Factuality": 0.6574, + "reward-bench/Precise IF": 0.5531, + "reward-bench/Math": 0.8115, + "reward-bench/Safety": 0.9094, + "reward-bench/Focus": 0.8672, + "reward-bench/Ties": 0.8341 + } + }, + { + "id": "google/gemini-2.5-flash-preview-05-20", + "name": "gemini-2.5-flash-preview-05-20", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.9092, + "global-mmlu-lite/Culturally Sensitive": 0.8925, + "global-mmlu-lite/Culturally Agnostic": 0.9259, + "global-mmlu-lite/Arabic": 0.905, + "global-mmlu-lite/English": 0.9225, + "global-mmlu-lite/Bengali": 0.91, + "global-mmlu-lite/German": 0.905, + "global-mmlu-lite/French": 0.925, + "global-mmlu-lite/Hindi": 0.9125, + "global-mmlu-lite/Indonesian": 0.9075, + "global-mmlu-lite/Italian": 0.89, + "global-mmlu-lite/Japanese": 0.9125, + "global-mmlu-lite/Korean": 0.9075, + "global-mmlu-lite/Portuguese": 0.915, + "global-mmlu-lite/Spanish": 0.915, + "global-mmlu-lite/Swahili": 0.905, + "global-mmlu-lite/Yoruba": 0.8825, + "global-mmlu-lite/Chinese": 0.93, + "global-mmlu-lite/Burmese": 0.9025 + } + }, + { + "id": "google/gemini-2.5-pro", + "name": "gemini-2.5-pro", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.9323, + "global-mmlu-lite/Culturally Sensitive": 0.9241, + "global-mmlu-lite/Culturally Agnostic": 0.9406, + "global-mmlu-lite/Arabic": 0.9475, + "global-mmlu-lite/English": 0.9275, + "global-mmlu-lite/Bengali": 0.9275, + "global-mmlu-lite/German": 0.93, + "global-mmlu-lite/French": 0.9425, + "global-mmlu-lite/Hindi": 0.9275, + "global-mmlu-lite/Indonesian": 0.925, + "global-mmlu-lite/Italian": 0.935, + "global-mmlu-lite/Japanese": 0.9375, + "global-mmlu-lite/Korean": 0.9275, + "global-mmlu-lite/Portuguese": 0.93, + "global-mmlu-lite/Spanish": 0.94, + "global-mmlu-lite/Swahili": 0.9375, + "global-mmlu-lite/Yoruba": 0.925, + "global-mmlu-lite/Chinese": 0.9275, + "global-mmlu-lite/Burmese": 0.93, + "livecodebenchpro/Hard Problems": 0.014084507042253521, + "livecodebenchpro/Medium Problems": 0.2112676056338028, + "livecodebenchpro/Easy Problems": 0.7183098591549296, + "reward-bench/Score": 0.7948, + "reward-bench/Factuality": 0.755, + "reward-bench/Precise IF": 0.619, + "reward-bench/Math": 0.898, + "reward-bench/Safety": 0.881, + "reward-bench/Focus": 0.805, + "reward-bench/Ties": 0.811, + "terminal-bench-2.0/terminal-bench-2.0": 16.4 + } + }, + { + "id": "google/gemini-2.5-pro-preview-03-25", + "name": "Gemini 2.5 Pro 03-25 preview", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.745, + "helm_capabilities/MMLU-Pro": 0.863, + "helm_capabilities/GPQA": 0.749, + "helm_capabilities/IFEval": 0.84, + "helm_capabilities/WildBench": 0.857, + "helm_capabilities/Omni-MATH": 0.416 + } + }, + { + "id": "google/gemini-2.5-pro-preview-05-06", + "name": "google/gemini-2.5-pro-preview-05-06", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6775, + "reward-bench/Factuality": 0.6532, + "reward-bench/Precise IF": 0.4688, + "reward-bench/Math": 0.5342, + "reward-bench/Safety": 0.8806, + "reward-bench/Focus": 0.8308, + "reward-bench/Ties": 0.6973 + } + }, + { + "id": "google/gemma-1.1-2b-it", + "name": "gemma-1.1-2b-it", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3067, + "hfopenllm_v2/BBH": 0.3185, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3394, + "hfopenllm_v2/MMLU-PRO": 0.1484 + } + }, + { + "id": "google/gemma-1.1-7b-it", + "name": "gemma-1.1-7b-it", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5039, + "hfopenllm_v2/BBH": 0.3935, + "hfopenllm_v2/MATH Level 5": 0.0491, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.423, + "hfopenllm_v2/MMLU-PRO": 0.2584 + } + }, + { + "id": "google/gemma-2-27b", + "name": "Gemma 2 27B", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_mmlu/MMLU All Subjects": 0.757, + "helm_mmlu/Abstract Algebra": 0.4, + "helm_mmlu/Anatomy": 0.77, + "helm_mmlu/College Physics": 0.5, + "helm_mmlu/Computer Security": 0.84, + "helm_mmlu/Econometrics": 0.667, + "helm_mmlu/Global Facts": 0.43, + "helm_mmlu/Jurisprudence": 0.861, + "helm_mmlu/Philosophy": 0.849, + "helm_mmlu/Professional Psychology": 0.84, + "helm_mmlu/Us Foreign Policy": 0.95, + "helm_mmlu/Astronomy": 0.829, + "helm_mmlu/Business Ethics": 0.78, + "helm_mmlu/Clinical Knowledge": 0.808, + "helm_mmlu/Conceptual Physics": 0.834, + "helm_mmlu/Electrical Engineering": 0.738, + "helm_mmlu/Elementary Mathematics": 0.558, + "helm_mmlu/Formal Logic": 0.516, + "helm_mmlu/High School World History": 0.89, + "helm_mmlu/Human Sexuality": 0.84, + "helm_mmlu/International Law": 0.843, + "helm_mmlu/Logical Fallacies": 0.865, + "helm_mmlu/Machine Learning": 0.625, + "helm_mmlu/Management": 0.864, + "helm_mmlu/Marketing": 0.94, + "helm_mmlu/Medical Genetics": 0.87, + "helm_mmlu/Miscellaneous": 0.885, + "helm_mmlu/Moral Scenarios": 0.394, + "helm_mmlu/Nutrition": 0.824, + "helm_mmlu/Prehistory": 0.877, + "helm_mmlu/Public Relations": 0.745, + "helm_mmlu/Security Studies": 0.808, + "helm_mmlu/Sociology": 0.9, + "helm_mmlu/Virology": 0.56, + "helm_mmlu/World Religions": 0.924, + "helm_mmlu/Mean win rate": 0.05, + "hfopenllm_v2/IFEval": 0.2475, + "hfopenllm_v2/BBH": 0.5643, + "hfopenllm_v2/MATH Level 5": 0.1662, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4396, + "hfopenllm_v2/MMLU-PRO": 0.4371 + } + }, + { + "id": "google/gemma-2-27b-it", + "name": "Gemma 2 Instruct 27B", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.675, + "helm_lite/NarrativeQA": 0.79, + "helm_lite/NaturalQuestions (closed-book)": 0.353, + "helm_lite/OpenbookQA": 0.918, + "helm_lite/MMLU": 0.664, + "helm_lite/MATH": 0.746, + "helm_lite/GSM8K": 0.812, + "helm_lite/LegalBench": 0.7, + "helm_lite/MedQA": 0.684, + "helm_lite/WMT 2014": 0.214, + "hfopenllm_v2/IFEval": 0.7978, + "hfopenllm_v2/BBH": 0.6451, + "hfopenllm_v2/MATH Level 5": 0.2387, + "hfopenllm_v2/GPQA": 0.375, + "hfopenllm_v2/MUSR": 0.4033, + "hfopenllm_v2/MMLU-PRO": 0.4451, + "reward-bench/Score": 0.809, + "reward-bench/Chat": 0.9483, + "reward-bench/Chat Hard": 0.591, + "reward-bench/Safety": 0.8635, + "reward-bench/Reasoning": 0.833 + } + }, + { + "id": "google/gemma-2-2b", + "name": "gemma-2-2b", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1993, + "hfopenllm_v2/BBH": 0.3656, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.4232, + "hfopenllm_v2/MMLU-PRO": 0.218 + } + }, + { + "id": "google/gemma-2-2b-it", + "name": "gemma-2-2b-it", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5668, + "hfopenllm_v2/BBH": 0.4199, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3929, + "hfopenllm_v2/MMLU-PRO": 0.255 + } + }, + { + "id": "google/gemma-2-2b-jpn-it", + "name": "gemma-2-2b-jpn-it", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5288, + "hfopenllm_v2/BBH": 0.4178, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3728, + "hfopenllm_v2/MMLU-PRO": 0.2467 + } + }, + { + "id": "google/gemma-2-9b", + "name": "Gemma 2 9B", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_mmlu/MMLU All Subjects": 0.721, + "helm_mmlu/Abstract Algebra": 0.4, + "helm_mmlu/Anatomy": 0.704, + "helm_mmlu/College Physics": 0.5, + "helm_mmlu/Computer Security": 0.81, + "helm_mmlu/Econometrics": 0.579, + "helm_mmlu/Global Facts": 0.53, + "helm_mmlu/Jurisprudence": 0.833, + "helm_mmlu/Philosophy": 0.772, + "helm_mmlu/Professional Psychology": 0.788, + "helm_mmlu/Us Foreign Policy": 0.9, + "helm_mmlu/Astronomy": 0.789, + "helm_mmlu/Business Ethics": 0.77, + "helm_mmlu/Clinical Knowledge": 0.777, + "helm_mmlu/Conceptual Physics": 0.732, + "helm_mmlu/Electrical Engineering": 0.724, + "helm_mmlu/Elementary Mathematics": 0.577, + "helm_mmlu/Formal Logic": 0.492, + "helm_mmlu/High School World History": 0.865, + "helm_mmlu/Human Sexuality": 0.809, + "helm_mmlu/International Law": 0.835, + "helm_mmlu/Logical Fallacies": 0.816, + "helm_mmlu/Machine Learning": 0.509, + "helm_mmlu/Management": 0.874, + "helm_mmlu/Marketing": 0.919, + "helm_mmlu/Medical Genetics": 0.84, + "helm_mmlu/Miscellaneous": 0.844, + "helm_mmlu/Moral Scenarios": 0.295, + "helm_mmlu/Nutrition": 0.775, + "helm_mmlu/Prehistory": 0.812, + "helm_mmlu/Public Relations": 0.736, + "helm_mmlu/Security Studies": 0.78, + "helm_mmlu/Sociology": 0.9, + "helm_mmlu/Virology": 0.53, + "helm_mmlu/World Religions": 0.86, + "helm_mmlu/Mean win rate": 0.265, + "hfopenllm_v2/IFEval": 0.204, + "hfopenllm_v2/BBH": 0.5377, + "hfopenllm_v2/MATH Level 5": 0.1344, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4461, + "hfopenllm_v2/MMLU-PRO": 0.4103 + } + }, + { + "id": "google/gemma-2-9b-it", + "name": "Gemma 2 Instruct 9B", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.562, + "helm_lite/NarrativeQA": 0.768, + "helm_lite/NaturalQuestions (closed-book)": 0.328, + "helm_lite/OpenbookQA": 0.91, + "helm_lite/MMLU": 0.645, + "helm_lite/MATH": 0.724, + "helm_lite/GSM8K": 0.762, + "helm_lite/LegalBench": 0.639, + "helm_lite/MedQA": 0.63, + "helm_lite/WMT 2014": 0.201, + "hfopenllm_v2/IFEval": 0.7436, + "hfopenllm_v2/BBH": 0.599, + "hfopenllm_v2/MATH Level 5": 0.1949, + "hfopenllm_v2/GPQA": 0.3607, + "hfopenllm_v2/MUSR": 0.4073, + "hfopenllm_v2/MMLU-PRO": 0.3875 + } + }, + { + "id": "google/gemma-2b", + "name": "gemma-2b", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2038, + "hfopenllm_v2/BBH": 0.3366, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.255, + "hfopenllm_v2/MUSR": 0.3978, + "hfopenllm_v2/MMLU-PRO": 0.1366 + } + }, + { + "id": "google/gemma-2b-it", + "name": "gemma-2b-it", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.269, + "hfopenllm_v2/BBH": 0.3151, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.3341, + "hfopenllm_v2/MMLU-PRO": 0.1353 + } + }, + { + "id": "google/gemma-3-27b-it", + "name": "gemma-3-27b-it", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.763, + "global-mmlu-lite/Culturally Sensitive": 0.7528, + "global-mmlu-lite/Culturally Agnostic": 0.7733, + "global-mmlu-lite/Arabic": 0.78, + "global-mmlu-lite/English": 0.7337, + "global-mmlu-lite/Bengali": 0.75, + "global-mmlu-lite/German": 0.775, + "global-mmlu-lite/French": 0.7481, + "global-mmlu-lite/Hindi": 0.7335, + "global-mmlu-lite/Indonesian": 0.7563, + "global-mmlu-lite/Italian": 0.75, + "global-mmlu-lite/Japanese": 0.7925, + "global-mmlu-lite/Korean": 0.798, + "global-mmlu-lite/Portuguese": 0.7481, + "global-mmlu-lite/Spanish": 0.7494, + "global-mmlu-lite/Swahili": 0.785, + "global-mmlu-lite/Yoruba": 0.7444, + "global-mmlu-lite/Chinese": 0.7925, + "global-mmlu-lite/Burmese": 0.7719 + } + }, + { + "id": "google/gemma-3-4b-it", + "name": "gemma-3-4b-it", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.6511, + "global-mmlu-lite/Culturally Sensitive": 0.6116, + "global-mmlu-lite/Culturally Agnostic": 0.6906, + "global-mmlu-lite/Arabic": 0.6525, + "global-mmlu-lite/English": 0.67, + "global-mmlu-lite/Bengali": 0.68, + "global-mmlu-lite/German": 0.6525, + "global-mmlu-lite/French": 0.6575, + "global-mmlu-lite/Hindi": 0.6475, + "global-mmlu-lite/Indonesian": 0.6775, + "global-mmlu-lite/Italian": 0.6675, + "global-mmlu-lite/Japanese": 0.6325, + "global-mmlu-lite/Korean": 0.66, + "global-mmlu-lite/Portuguese": 0.68, + "global-mmlu-lite/Spanish": 0.6725, + "global-mmlu-lite/Swahili": 0.6075, + "global-mmlu-lite/Yoruba": 0.5825, + "global-mmlu-lite/Chinese": 0.6475, + "global-mmlu-lite/Burmese": 0.63 + } + }, + { + "id": "google/gemma-7b", + "name": "Gemma 7B", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.336, + "helm_lite/NarrativeQA": 0.752, + "helm_lite/NaturalQuestions (closed-book)": 0.336, + "helm_lite/OpenbookQA": 0.808, + "helm_lite/MMLU": 0.571, + "helm_lite/MATH": 0.5, + "helm_lite/GSM8K": 0.559, + "helm_lite/LegalBench": 0.581, + "helm_lite/MedQA": 0.513, + "helm_lite/WMT 2014": 0.187, + "helm_mmlu/MMLU All Subjects": 0.661, + "helm_mmlu/Abstract Algebra": 0.28, + "helm_mmlu/Anatomy": 0.563, + "helm_mmlu/College Physics": 0.412, + "helm_mmlu/Computer Security": 0.75, + "helm_mmlu/Econometrics": 0.474, + "helm_mmlu/Global Facts": 0.42, + "helm_mmlu/Jurisprudence": 0.769, + "helm_mmlu/Philosophy": 0.727, + "helm_mmlu/Professional Psychology": 0.712, + "helm_mmlu/Us Foreign Policy": 0.87, + "helm_mmlu/Astronomy": 0.717, + "helm_mmlu/Business Ethics": 0.65, + "helm_mmlu/Clinical Knowledge": 0.698, + "helm_mmlu/Conceptual Physics": 0.621, + "helm_mmlu/Electrical Engineering": 0.628, + "helm_mmlu/Elementary Mathematics": 0.516, + "helm_mmlu/Formal Logic": 0.508, + "helm_mmlu/High School World History": 0.857, + "helm_mmlu/Human Sexuality": 0.733, + "helm_mmlu/International Law": 0.835, + "helm_mmlu/Logical Fallacies": 0.742, + "helm_mmlu/Machine Learning": 0.554, + "helm_mmlu/Management": 0.864, + "helm_mmlu/Marketing": 0.885, + "helm_mmlu/Medical Genetics": 0.7, + "helm_mmlu/Miscellaneous": 0.838, + "helm_mmlu/Moral Scenarios": 0.377, + "helm_mmlu/Nutrition": 0.778, + "helm_mmlu/Prehistory": 0.756, + "helm_mmlu/Public Relations": 0.682, + "helm_mmlu/Security Studies": 0.735, + "helm_mmlu/Sociology": 0.841, + "helm_mmlu/Virology": 0.548, + "helm_mmlu/World Religions": 0.842, + "helm_mmlu/Mean win rate": 0.824, + "hfopenllm_v2/IFEval": 0.2659, + "hfopenllm_v2/BBH": 0.4362, + "hfopenllm_v2/MATH Level 5": 0.074, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4062, + "hfopenllm_v2/MMLU-PRO": 0.2948 + } + }, + { + "id": "google/gemma-7b-it", + "name": "gemma-7b-it", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3868, + "hfopenllm_v2/BBH": 0.3646, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4274, + "hfopenllm_v2/MMLU-PRO": 0.1695 + } + }, + { + "id": "google/mt5-base", + "name": "mt5-base", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1645, + "hfopenllm_v2/BBH": 0.2883, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2391, + "hfopenllm_v2/MUSR": 0.3672, + "hfopenllm_v2/MMLU-PRO": 0.107 + } + }, + { + "id": "google/mt5-small", + "name": "mt5-small", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1718, + "hfopenllm_v2/BBH": 0.2766, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2424, + "hfopenllm_v2/MUSR": 0.3857, + "hfopenllm_v2/MMLU-PRO": 0.1123 + } + }, + { + "id": "google/mt5-xl", + "name": "mt5-xl", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.196, + "hfopenllm_v2/BBH": 0.3047, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3795, + "hfopenllm_v2/MMLU-PRO": 0.112 + } + }, + { + "id": "google/mt5-xxl", + "name": "mt5-xxl", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2358, + "hfopenllm_v2/BBH": 0.2959, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2416, + "hfopenllm_v2/MUSR": 0.3689, + "hfopenllm_v2/MMLU-PRO": 0.1089 + } + }, + { + "id": "google/recurrentgemma-2b", + "name": "recurrentgemma-2b", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3017, + "hfopenllm_v2/BBH": 0.3197, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.3446, + "hfopenllm_v2/MMLU-PRO": 0.1176 + } + }, + { + "id": "google/recurrentgemma-2b-it", + "name": "recurrentgemma-2b-it", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2949, + "hfopenllm_v2/BBH": 0.333, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3341, + "hfopenllm_v2/MMLU-PRO": 0.1402 + } + }, + { + "id": "google/recurrentgemma-9b", + "name": "recurrentgemma-9b", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3116, + "hfopenllm_v2/BBH": 0.3956, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.3803, + "hfopenllm_v2/MMLU-PRO": 0.2605 + } + }, + { + "id": "google/recurrentgemma-9b-it", + "name": "recurrentgemma-9b-it", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.501, + "hfopenllm_v2/BBH": 0.4367, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.4379, + "hfopenllm_v2/MMLU-PRO": 0.2843 + } + }, + { + "id": "google/switch-base-8", + "name": "switch-base-8", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1585, + "hfopenllm_v2/BBH": 0.2876, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3517, + "hfopenllm_v2/MMLU-PRO": 0.1098 + } + }, + { + "id": "google/text-bison@001", + "name": "PaLM-2 Bison", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.526, + "helm_lite/NarrativeQA": 0.718, + "helm_lite/NaturalQuestions (closed-book)": 0.39, + "helm_lite/OpenbookQA": 0.878, + "helm_lite/MMLU": 0.608, + "helm_lite/MATH": 0.421, + "helm_lite/GSM8K": 0.61, + "helm_lite/LegalBench": 0.645, + "helm_lite/MedQA": 0.547, + "helm_lite/WMT 2014": 0.241, + "helm_mmlu/MMLU All Subjects": 0.692, + "helm_mmlu/Abstract Algebra": 0.39, + "helm_mmlu/Anatomy": 0.644, + "helm_mmlu/College Physics": 0.51, + "helm_mmlu/Computer Security": 0.74, + "helm_mmlu/Econometrics": 0.518, + "helm_mmlu/Global Facts": 0.38, + "helm_mmlu/Jurisprudence": 0.769, + "helm_mmlu/Philosophy": 0.736, + "helm_mmlu/Professional Psychology": 0.761, + "helm_mmlu/Us Foreign Policy": 0.87, + "helm_mmlu/Astronomy": 0.803, + "helm_mmlu/Business Ethics": 0.76, + "helm_mmlu/Clinical Knowledge": 0.725, + "helm_mmlu/Conceptual Physics": 0.694, + "helm_mmlu/Electrical Engineering": 0.69, + "helm_mmlu/Elementary Mathematics": 0.487, + "helm_mmlu/Formal Logic": 0.5, + "helm_mmlu/High School World History": 0.869, + "helm_mmlu/Human Sexuality": 0.84, + "helm_mmlu/International Law": 0.835, + "helm_mmlu/Logical Fallacies": 0.853, + "helm_mmlu/Machine Learning": 0.562, + "helm_mmlu/Management": 0.893, + "helm_mmlu/Marketing": 0.893, + "helm_mmlu/Medical Genetics": 0.75, + "helm_mmlu/Miscellaneous": 0.866, + "helm_mmlu/Moral Scenarios": 0.369, + "helm_mmlu/Nutrition": 0.709, + "helm_mmlu/Prehistory": 0.812, + "helm_mmlu/Public Relations": 0.691, + "helm_mmlu/Security Studies": 0.812, + "helm_mmlu/Sociology": 0.92, + "helm_mmlu/Virology": 0.494, + "helm_mmlu/World Religions": 0.883, + "helm_mmlu/Mean win rate": 0.192 + } + }, + { + "id": "google/text-unicorn@001", + "name": "PaLM-2 Unicorn", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.644, + "helm_lite/NarrativeQA": 0.583, + "helm_lite/NaturalQuestions (closed-book)": 0.435, + "helm_lite/OpenbookQA": 0.938, + "helm_lite/MMLU": 0.702, + "helm_lite/MATH": 0.674, + "helm_lite/GSM8K": 0.831, + "helm_lite/LegalBench": 0.677, + "helm_lite/MedQA": 0.684, + "helm_lite/WMT 2014": 0.26, + "helm_mmlu/MMLU All Subjects": 0.786, + "helm_mmlu/Abstract Algebra": 0.51, + "helm_mmlu/Anatomy": 0.733, + "helm_mmlu/College Physics": 0.549, + "helm_mmlu/Computer Security": 0.77, + "helm_mmlu/Econometrics": 0.649, + "helm_mmlu/Global Facts": 0.53, + "helm_mmlu/Jurisprudence": 0.88, + "helm_mmlu/Philosophy": 0.836, + "helm_mmlu/Professional Psychology": 0.858, + "helm_mmlu/Us Foreign Policy": 0.96, + "helm_mmlu/Astronomy": 0.862, + "helm_mmlu/Business Ethics": 0.83, + "helm_mmlu/Clinical Knowledge": 0.804, + "helm_mmlu/Conceptual Physics": 0.809, + "helm_mmlu/Electrical Engineering": 0.772, + "helm_mmlu/Elementary Mathematics": 0.661, + "helm_mmlu/Formal Logic": 0.659, + "helm_mmlu/High School World History": 0.911, + "helm_mmlu/Human Sexuality": 0.924, + "helm_mmlu/International Law": 0.909, + "helm_mmlu/Logical Fallacies": 0.877, + "helm_mmlu/Machine Learning": 0.625, + "helm_mmlu/Management": 0.903, + "helm_mmlu/Marketing": 0.94, + "helm_mmlu/Medical Genetics": 0.83, + "helm_mmlu/Miscellaneous": 0.894, + "helm_mmlu/Moral Scenarios": 0.562, + "helm_mmlu/Nutrition": 0.856, + "helm_mmlu/Prehistory": 0.87, + "helm_mmlu/Public Relations": 0.773, + "helm_mmlu/Security Studies": 0.829, + "helm_mmlu/Sociology": 0.91, + "helm_mmlu/Virology": 0.572, + "helm_mmlu/World Religions": 0.877, + "helm_mmlu/Mean win rate": 0.142 + } + }, + { + "id": "google/umt5-base", + "name": "umt5-base", + "developer": "google", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1746, + "hfopenllm_v2/BBH": 0.2788, + "hfopenllm_v2/MATH Level 5": 0.0045, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3382, + "hfopenllm_v2/MMLU-PRO": 0.1078 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/goulue5.json b/data/developers/goulue5.json new file mode 100644 index 0000000000000000000000000000000000000000..1784209cd1a91a98c08a6580371e4647e236069d --- /dev/null +++ b/data/developers/goulue5.json @@ -0,0 +1,19 @@ +{ + "developer": "goulue5", + "models": [ + { + "id": "goulue5/merging_LLM", + "name": "merging_LLM", + "developer": "goulue5", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3233, + "hfopenllm_v2/BBH": 0.4216, + "hfopenllm_v2/MATH Level 5": 0.0967, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4333, + "hfopenllm_v2/MMLU-PRO": 0.2958 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/gradientai.json b/data/developers/gradientai.json new file mode 100644 index 0000000000000000000000000000000000000000..168231fa8cad153268e010d70aa3ff34d14b6188 --- /dev/null +++ b/data/developers/gradientai.json @@ -0,0 +1,19 @@ +{ + "developer": "gradientai", + "models": [ + { + "id": "gradientai/Llama-3-8B-Instruct-Gradient-1048k", + "name": "Llama-3-8B-Instruct-Gradient-1048k", + "developer": "gradientai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4456, + "hfopenllm_v2/BBH": 0.4346, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4298, + "hfopenllm_v2/MMLU-PRO": 0.294 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/grimjim.json b/data/developers/grimjim.json new file mode 100644 index 0000000000000000000000000000000000000000..a372c72a5a90cd48978980b15e5022c86f709626 --- /dev/null +++ b/data/developers/grimjim.json @@ -0,0 +1,355 @@ +{ + "developer": "grimjim", + "models": [ + { + "id": "grimjim/DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B", + "name": "DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4797, + "hfopenllm_v2/BBH": 0.5269, + "hfopenllm_v2/MATH Level 5": 0.2221, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4408, + "hfopenllm_v2/MMLU-PRO": 0.3957 + } + }, + { + "id": "grimjim/Gigantes-v1-gemma2-9b-it", + "name": "Gigantes-v1-gemma2-9b-it", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6925, + "hfopenllm_v2/BBH": 0.5978, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.3532, + "hfopenllm_v2/MUSR": 0.4555, + "hfopenllm_v2/MMLU-PRO": 0.4225 + } + }, + { + "id": "grimjim/Gigantes-v2-gemma2-9b-it", + "name": "Gigantes-v2-gemma2-9b-it", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7351, + "hfopenllm_v2/BBH": 0.5987, + "hfopenllm_v2/MATH Level 5": 0.2017, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4595, + "hfopenllm_v2/MMLU-PRO": 0.4259 + } + }, + { + "id": "grimjim/Gigantes-v3-gemma2-9b-it", + "name": "Gigantes-v3-gemma2-9b-it", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6976, + "hfopenllm_v2/BBH": 0.5984, + "hfopenllm_v2/MATH Level 5": 0.21, + "hfopenllm_v2/GPQA": 0.3565, + "hfopenllm_v2/MUSR": 0.4608, + "hfopenllm_v2/MMLU-PRO": 0.4226 + } + }, + { + "id": "grimjim/HuatuoSkywork-o1-Llama-3.1-8B", + "name": "HuatuoSkywork-o1-Llama-3.1-8B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3961, + "hfopenllm_v2/BBH": 0.4886, + "hfopenllm_v2/MATH Level 5": 0.3882, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3839, + "hfopenllm_v2/MMLU-PRO": 0.3095 + } + }, + { + "id": "grimjim/Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge", + "name": "Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4271, + "hfopenllm_v2/BBH": 0.4962, + "hfopenllm_v2/MATH Level 5": 0.0997, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4043, + "hfopenllm_v2/MMLU-PRO": 0.3625 + } + }, + { + "id": "grimjim/Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge", + "name": "Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6806, + "hfopenllm_v2/BBH": 0.5022, + "hfopenllm_v2/MATH Level 5": 0.0891, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3885, + "hfopenllm_v2/MMLU-PRO": 0.3684 + } + }, + { + "id": "grimjim/Llama-3.1-8B-Instruct-abliterated_via_adapter", + "name": "Llama-3.1-8B-Instruct-abliterated_via_adapter", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.487, + "hfopenllm_v2/BBH": 0.5105, + "hfopenllm_v2/MATH Level 5": 0.1397, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.401, + "hfopenllm_v2/MMLU-PRO": 0.3651 + } + }, + { + "id": "grimjim/Llama-3.1-Bonsaikraft-8B-Instruct", + "name": "Llama-3.1-Bonsaikraft-8B-Instruct", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.425, + "hfopenllm_v2/BBH": 0.5287, + "hfopenllm_v2/MATH Level 5": 0.1314, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4235, + "hfopenllm_v2/MMLU-PRO": 0.3764 + } + }, + { + "id": "grimjim/Llama-Nephilim-Metamorphosis-v2-8B", + "name": "Llama-Nephilim-Metamorphosis-v2-8B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4545, + "hfopenllm_v2/BBH": 0.5013, + "hfopenllm_v2/MATH Level 5": 0.1397, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4091, + "hfopenllm_v2/MMLU-PRO": 0.3809 + } + }, + { + "id": "grimjim/Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B", + "name": "Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4366, + "hfopenllm_v2/BBH": 0.5287, + "hfopenllm_v2/MATH Level 5": 0.3006, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.3999, + "hfopenllm_v2/MMLU-PRO": 0.3684 + } + }, + { + "id": "grimjim/Magnolia-v1-Gemma2-8k-9B", + "name": "Magnolia-v1-Gemma2-8k-9B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3531, + "hfopenllm_v2/BBH": 0.5589, + "hfopenllm_v2/MATH Level 5": 0.1684, + "hfopenllm_v2/GPQA": 0.3364, + "hfopenllm_v2/MUSR": 0.4645, + "hfopenllm_v2/MMLU-PRO": 0.4242 + } + }, + { + "id": "grimjim/Magnolia-v2-12B", + "name": "Magnolia-v2-12B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3506, + "hfopenllm_v2/BBH": 0.529, + "hfopenllm_v2/MATH Level 5": 0.1292, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4171, + "hfopenllm_v2/MMLU-PRO": 0.3601 + } + }, + { + "id": "grimjim/Magnolia-v2-Gemma2-8k-9B", + "name": "Magnolia-v2-Gemma2-8k-9B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7384, + "hfopenllm_v2/BBH": 0.6016, + "hfopenllm_v2/MATH Level 5": 0.2281, + "hfopenllm_v2/GPQA": 0.3574, + "hfopenllm_v2/MUSR": 0.4488, + "hfopenllm_v2/MMLU-PRO": 0.4332 + } + }, + { + "id": "grimjim/Magnolia-v3-12B", + "name": "Magnolia-v3-12B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3965, + "hfopenllm_v2/BBH": 0.5327, + "hfopenllm_v2/MATH Level 5": 0.1352, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.4184, + "hfopenllm_v2/MMLU-PRO": 0.3615 + } + }, + { + "id": "grimjim/Magnolia-v3-Gemma2-8k-9B", + "name": "Magnolia-v3-Gemma2-8k-9B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7378, + "hfopenllm_v2/BBH": 0.6015, + "hfopenllm_v2/MATH Level 5": 0.2319, + "hfopenllm_v2/GPQA": 0.3565, + "hfopenllm_v2/MUSR": 0.4488, + "hfopenllm_v2/MMLU-PRO": 0.4337 + } + }, + { + "id": "grimjim/Magnolia-v4-12B", + "name": "Magnolia-v4-12B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3418, + "hfopenllm_v2/BBH": 0.5431, + "hfopenllm_v2/MATH Level 5": 0.1314, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4211, + "hfopenllm_v2/MMLU-PRO": 0.3672 + } + }, + { + "id": "grimjim/Magnolia-v5a-12B", + "name": "Magnolia-v5a-12B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4114, + "hfopenllm_v2/BBH": 0.5312, + "hfopenllm_v2/MATH Level 5": 0.1375, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4145, + "hfopenllm_v2/MMLU-PRO": 0.3601 + } + }, + { + "id": "grimjim/Magot-v1-Gemma2-8k-9B", + "name": "Magot-v1-Gemma2-8k-9B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2997, + "hfopenllm_v2/BBH": 0.6019, + "hfopenllm_v2/MATH Level 5": 0.0989, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.4488, + "hfopenllm_v2/MMLU-PRO": 0.4337 + } + }, + { + "id": "grimjim/Magot-v2-Gemma2-8k-9B", + "name": "Magot-v2-Gemma2-8k-9B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7347, + "hfopenllm_v2/BBH": 0.5897, + "hfopenllm_v2/MATH Level 5": 0.2017, + "hfopenllm_v2/GPQA": 0.354, + "hfopenllm_v2/MUSR": 0.4344, + "hfopenllm_v2/MMLU-PRO": 0.4223 + } + }, + { + "id": "grimjim/SauerHuatuoSkywork-o1-Llama-3.1-8B", + "name": "SauerHuatuoSkywork-o1-Llama-3.1-8B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5219, + "hfopenllm_v2/BBH": 0.5222, + "hfopenllm_v2/MATH Level 5": 0.173, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4527, + "hfopenllm_v2/MMLU-PRO": 0.3991 + } + }, + { + "id": "grimjim/llama-3-Nephilim-v1-8B", + "name": "llama-3-Nephilim-v1-8B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4277, + "hfopenllm_v2/BBH": 0.5132, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4136, + "hfopenllm_v2/MMLU-PRO": 0.3796 + } + }, + { + "id": "grimjim/llama-3-Nephilim-v2-8B", + "name": "llama-3-Nephilim-v2-8B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3922, + "hfopenllm_v2/BBH": 0.5048, + "hfopenllm_v2/MATH Level 5": 0.1065, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.3895, + "hfopenllm_v2/MMLU-PRO": 0.3641 + } + }, + { + "id": "grimjim/llama-3-Nephilim-v2.1-8B", + "name": "llama-3-Nephilim-v2.1-8B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3895, + "hfopenllm_v2/BBH": 0.5095, + "hfopenllm_v2/MATH Level 5": 0.0997, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.3935, + "hfopenllm_v2/MMLU-PRO": 0.3644 + } + }, + { + "id": "grimjim/llama-3-Nephilim-v3-8B", + "name": "llama-3-Nephilim-v3-8B", + "developer": "grimjim", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4174, + "hfopenllm_v2/BBH": 0.5013, + "hfopenllm_v2/MATH Level 5": 0.0952, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.3989, + "hfopenllm_v2/MMLU-PRO": 0.3612 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/gupta-tanish.json b/data/developers/gupta-tanish.json new file mode 100644 index 0000000000000000000000000000000000000000..3a454b1d3780807337f362fcd4281d424b869cb8 --- /dev/null +++ b/data/developers/gupta-tanish.json @@ -0,0 +1,19 @@ +{ + "developer": "gupta-tanish", + "models": [ + { + "id": "gupta-tanish/llama-7b-dpo-baseline", + "name": "llama-7b-dpo-baseline", + "developer": "gupta-tanish", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2693, + "hfopenllm_v2/BBH": 0.3897, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.4456, + "hfopenllm_v2/MMLU-PRO": 0.2028 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/gz987.json b/data/developers/gz987.json new file mode 100644 index 0000000000000000000000000000000000000000..020082db9e4b469b79d1d3703cd5073eefcab4cd --- /dev/null +++ b/data/developers/gz987.json @@ -0,0 +1,61 @@ +{ + "developer": "gz987", + "models": [ + { + "id": "gz987/qwen2.5-7b-cabs-v0.1", + "name": "qwen2.5-7b-cabs-v0.1", + "developer": "gz987", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7506, + "hfopenllm_v2/BBH": 0.5482, + "hfopenllm_v2/MATH Level 5": 0.4796, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4376, + "hfopenllm_v2/MMLU-PRO": 0.4406 + } + }, + { + "id": "gz987/qwen2.5-7b-cabs-v0.2", + "name": "qwen2.5-7b-cabs-v0.2", + "developer": "gz987", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7418, + "hfopenllm_v2/BBH": 0.5516, + "hfopenllm_v2/MATH Level 5": 0.4902, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4429, + "hfopenllm_v2/MMLU-PRO": 0.4397 + } + }, + { + "id": "gz987/qwen2.5-7b-cabs-v0.3", + "name": "qwen2.5-7b-cabs-v0.3", + "developer": "gz987", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.757, + "hfopenllm_v2/BBH": 0.5494, + "hfopenllm_v2/MATH Level 5": 0.4932, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.443, + "hfopenllm_v2/MMLU-PRO": 0.4402 + } + }, + { + "id": "gz987/qwen2.5-7b-cabs-v0.4", + "name": "qwen2.5-7b-cabs-v0.4", + "developer": "gz987", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7583, + "hfopenllm_v2/BBH": 0.5524, + "hfopenllm_v2/MATH Level 5": 0.4849, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.443, + "hfopenllm_v2/MMLU-PRO": 0.4396 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/h2oai.json b/data/developers/h2oai.json new file mode 100644 index 0000000000000000000000000000000000000000..f4aa111f345aa1acf9f1cc6e0cca2ee77f66e54b --- /dev/null +++ b/data/developers/h2oai.json @@ -0,0 +1,75 @@ +{ + "developer": "h2oai", + "models": [ + { + "id": "h2oai/h2o-danube-1.8b-chat", + "name": "h2o-danube-1.8b-chat", + "developer": "h2oai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2199, + "hfopenllm_v2/BBH": 0.322, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3989, + "hfopenllm_v2/MMLU-PRO": 0.1314 + } + }, + { + "id": "h2oai/h2o-danube3-4b-base", + "name": "h2o-danube3-4b-base", + "developer": "h2oai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2338, + "hfopenllm_v2/BBH": 0.3599, + "hfopenllm_v2/MATH Level 5": 0.0227, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.3778, + "hfopenllm_v2/MMLU-PRO": 0.2109 + } + }, + { + "id": "h2oai/h2o-danube3-4b-chat", + "name": "h2o-danube3-4b-chat", + "developer": "h2oai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3629, + "hfopenllm_v2/BBH": 0.3466, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3781, + "hfopenllm_v2/MMLU-PRO": 0.2228 + } + }, + { + "id": "h2oai/h2o-danube3-500m-chat", + "name": "h2o-danube3-500m-chat", + "developer": "h2oai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2208, + "hfopenllm_v2/BBH": 0.3035, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2307, + "hfopenllm_v2/MUSR": 0.3434, + "hfopenllm_v2/MMLU-PRO": 0.1144 + } + }, + { + "id": "h2oai/h2o-danube3.1-4b-chat", + "name": "h2o-danube3.1-4b-chat", + "developer": "h2oai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5021, + "hfopenllm_v2/BBH": 0.3608, + "hfopenllm_v2/MATH Level 5": 0.0332, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4102, + "hfopenllm_v2/MMLU-PRO": 0.2719 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/haoranxu.json b/data/developers/haoranxu.json new file mode 100644 index 0000000000000000000000000000000000000000..843256c0fe85c4f4e423c1c76f4a86daf1e80a57 --- /dev/null +++ b/data/developers/haoranxu.json @@ -0,0 +1,47 @@ +{ + "developer": "haoranxu", + "models": [ + { + "id": "haoranxu/ALMA-13B-R", + "name": "ALMA-13B-R", + "developer": "haoranxu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0039, + "hfopenllm_v2/BBH": 0.3457, + "hfopenllm_v2/MATH Level 5": 0.0174, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3528, + "hfopenllm_v2/MMLU-PRO": 0.1817 + } + }, + { + "id": "haoranxu/Llama-3-Instruct-8B-CPO-SimPO", + "name": "Llama-3-Instruct-8B-CPO-SimPO", + "developer": "haoranxu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7046, + "hfopenllm_v2/BBH": 0.5048, + "hfopenllm_v2/MATH Level 5": 0.1027, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3567, + "hfopenllm_v2/MMLU-PRO": 0.3686 + } + }, + { + "id": "haoranxu/Llama-3-Instruct-8B-SimPO", + "name": "Llama-3-Instruct-8B-SimPO", + "developer": "haoranxu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7347, + "hfopenllm_v2/BBH": 0.4979, + "hfopenllm_v2/MATH Level 5": 0.0876, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3566, + "hfopenllm_v2/MMLU-PRO": 0.3733 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/hatemmahmoud.json b/data/developers/hatemmahmoud.json new file mode 100644 index 0000000000000000000000000000000000000000..ccc014c278b3b9da1f79c243abd7ecaacd707c1a --- /dev/null +++ b/data/developers/hatemmahmoud.json @@ -0,0 +1,19 @@ +{ + "developer": "hatemmahmoud", + "models": [ + { + "id": "hatemmahmoud/qwen2.5-1.5b-sft-raft-grpo-hra-doc", + "name": "qwen2.5-1.5b-sft-raft-grpo-hra-doc", + "developer": "hatemmahmoud", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4196, + "hfopenllm_v2/BBH": 0.427, + "hfopenllm_v2/MATH Level 5": 0.2175, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.361, + "hfopenllm_v2/MMLU-PRO": 0.2776 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/hendrydong.json b/data/developers/hendrydong.json new file mode 100644 index 0000000000000000000000000000000000000000..56e05071d3b06a1289763aa84d052b49b755b3aa --- /dev/null +++ b/data/developers/hendrydong.json @@ -0,0 +1,24 @@ +{ + "developer": "hendrydong", + "models": [ + { + "id": "hendrydong/Mistral-RM-for-RAFT-GSHF-v0", + "name": "hendrydong/Mistral-RM-for-RAFT-GSHF-v0", + "developer": "hendrydong", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7847, + "reward-bench/Factuality": 0.5779, + "reward-bench/Precise IF": 0.3625, + "reward-bench/Math": 0.6011, + "reward-bench/Safety": 0.85, + "reward-bench/Focus": 0.6747, + "reward-bench/Ties": 0.5988, + "reward-bench/Chat": 0.9832, + "reward-bench/Chat Hard": 0.5789, + "reward-bench/Reasoning": 0.7434, + "reward-bench/Prior Sets (0.5 weight)": 0.7508 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/hon9kon9ize.json b/data/developers/hon9kon9ize.json new file mode 100644 index 0000000000000000000000000000000000000000..f340e6f94fab3cb0f8a79f260c68bc5ec6b80d3d --- /dev/null +++ b/data/developers/hon9kon9ize.json @@ -0,0 +1,33 @@ +{ + "developer": "hon9kon9ize", + "models": [ + { + "id": "hon9kon9ize/CantoneseLLMChat-v0.5", + "name": "CantoneseLLMChat-v0.5", + "developer": "hon9kon9ize", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3231, + "hfopenllm_v2/BBH": 0.4345, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4706, + "hfopenllm_v2/MMLU-PRO": 0.2504 + } + }, + { + "id": "hon9kon9ize/CantoneseLLMChat-v1.0-7B", + "name": "CantoneseLLMChat-v1.0-7B", + "developer": "hon9kon9ize", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4455, + "hfopenllm_v2/BBH": 0.4866, + "hfopenllm_v2/MATH Level 5": 0.2107, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.3883, + "hfopenllm_v2/MMLU-PRO": 0.3785 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/hongbai12.json b/data/developers/hongbai12.json new file mode 100644 index 0000000000000000000000000000000000000000..8ff34dbce459ce15b852da0ccf849e37ce258554 --- /dev/null +++ b/data/developers/hongbai12.json @@ -0,0 +1,19 @@ +{ + "developer": "hongbai12", + "models": [ + { + "id": "hongbai12/li-0.4-pre", + "name": "li-0.4-pre", + "developer": "hongbai12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.52, + "hfopenllm_v2/BBH": 0.6298, + "hfopenllm_v2/MATH Level 5": 0.4924, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4513, + "hfopenllm_v2/MMLU-PRO": 0.5015 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/hotmailuser.json b/data/developers/hotmailuser.json new file mode 100644 index 0000000000000000000000000000000000000000..46c74a6ef9f98c2edc4d225547df3eaa491f90b5 --- /dev/null +++ b/data/developers/hotmailuser.json @@ -0,0 +1,481 @@ +{ + "developer": "hotmailuser", + "models": [ + { + "id": "hotmailuser/Deepseek-qwen-modelstock-2B", + "name": "Deepseek-qwen-modelstock-2B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2149, + "hfopenllm_v2/BBH": 0.3549, + "hfopenllm_v2/MATH Level 5": 0.3399, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3475, + "hfopenllm_v2/MMLU-PRO": 0.1911 + } + }, + { + "id": "hotmailuser/Falcon3Slerp1-10B", + "name": "Falcon3Slerp1-10B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5694, + "hfopenllm_v2/BBH": 0.617, + "hfopenllm_v2/MATH Level 5": 0.2598, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.4318, + "hfopenllm_v2/MMLU-PRO": 0.4402 + } + }, + { + "id": "hotmailuser/Falcon3Slerp2-10B", + "name": "Falcon3Slerp2-10B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6118, + "hfopenllm_v2/BBH": 0.6164, + "hfopenllm_v2/MATH Level 5": 0.2319, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4096, + "hfopenllm_v2/MMLU-PRO": 0.4369 + } + }, + { + "id": "hotmailuser/Falcon3Slerp4-10B", + "name": "Falcon3Slerp4-10B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6072, + "hfopenllm_v2/BBH": 0.6114, + "hfopenllm_v2/MATH Level 5": 0.2289, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4017, + "hfopenllm_v2/MMLU-PRO": 0.4387 + } + }, + { + "id": "hotmailuser/FalconSlerp-3B", + "name": "FalconSlerp-3B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5695, + "hfopenllm_v2/BBH": 0.4624, + "hfopenllm_v2/MATH Level 5": 0.176, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3989, + "hfopenllm_v2/MMLU-PRO": 0.2968 + } + }, + { + "id": "hotmailuser/FalconSlerp1-7B", + "name": "FalconSlerp1-7B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5395, + "hfopenllm_v2/BBH": 0.5355, + "hfopenllm_v2/MATH Level 5": 0.2379, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4452, + "hfopenllm_v2/MMLU-PRO": 0.4129 + } + }, + { + "id": "hotmailuser/FalconSlerp2-7B", + "name": "FalconSlerp2-7B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.616, + "hfopenllm_v2/BBH": 0.5538, + "hfopenllm_v2/MATH Level 5": 0.2983, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4479, + "hfopenllm_v2/MMLU-PRO": 0.4141 + } + }, + { + "id": "hotmailuser/FalconSlerp3-10B", + "name": "FalconSlerp3-10B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6002, + "hfopenllm_v2/BBH": 0.606, + "hfopenllm_v2/MATH Level 5": 0.2273, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.4031, + "hfopenllm_v2/MMLU-PRO": 0.4323 + } + }, + { + "id": "hotmailuser/FalconSlerp3-7B", + "name": "FalconSlerp3-7B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6096, + "hfopenllm_v2/BBH": 0.5533, + "hfopenllm_v2/MATH Level 5": 0.3157, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4507, + "hfopenllm_v2/MMLU-PRO": 0.4127 + } + }, + { + "id": "hotmailuser/FalconSlerp4-7B", + "name": "FalconSlerp4-7B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6285, + "hfopenllm_v2/BBH": 0.5524, + "hfopenllm_v2/MATH Level 5": 0.2213, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.4585, + "hfopenllm_v2/MMLU-PRO": 0.4032 + } + }, + { + "id": "hotmailuser/FalconSlerp6-7B", + "name": "FalconSlerp6-7B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6027, + "hfopenllm_v2/BBH": 0.5384, + "hfopenllm_v2/MATH Level 5": 0.2047, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.4492, + "hfopenllm_v2/MMLU-PRO": 0.3995 + } + }, + { + "id": "hotmailuser/Gemma2Crono-27B", + "name": "Gemma2Crono-27B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7086, + "hfopenllm_v2/BBH": 0.6505, + "hfopenllm_v2/MATH Level 5": 0.2424, + "hfopenllm_v2/GPQA": 0.3708, + "hfopenllm_v2/MUSR": 0.4567, + "hfopenllm_v2/MMLU-PRO": 0.4633 + } + }, + { + "id": "hotmailuser/Gemma2SimPO-27B", + "name": "Gemma2SimPO-27B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7222, + "hfopenllm_v2/BBH": 0.6413, + "hfopenllm_v2/MATH Level 5": 0.2817, + "hfopenllm_v2/GPQA": 0.3582, + "hfopenllm_v2/MUSR": 0.4447, + "hfopenllm_v2/MMLU-PRO": 0.4642 + } + }, + { + "id": "hotmailuser/Gemma2atlas-27B", + "name": "Gemma2atlas-27B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7214, + "hfopenllm_v2/BBH": 0.6545, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.3557, + "hfopenllm_v2/MUSR": 0.4445, + "hfopenllm_v2/MMLU-PRO": 0.475 + } + }, + { + "id": "hotmailuser/Gemma2magnum-27b", + "name": "Gemma2magnum-27b", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5051, + "hfopenllm_v2/BBH": 0.62, + "hfopenllm_v2/MATH Level 5": 0.2205, + "hfopenllm_v2/GPQA": 0.3851, + "hfopenllm_v2/MUSR": 0.4723, + "hfopenllm_v2/MMLU-PRO": 0.4596 + } + }, + { + "id": "hotmailuser/Llama-Hermes-slerp-8B", + "name": "Llama-Hermes-slerp-8B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.339, + "hfopenllm_v2/BBH": 0.531, + "hfopenllm_v2/MATH Level 5": 0.0801, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4078, + "hfopenllm_v2/MMLU-PRO": 0.3331 + } + }, + { + "id": "hotmailuser/Llama-Hermes-slerp2-8B", + "name": "Llama-Hermes-slerp2-8B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3728, + "hfopenllm_v2/BBH": 0.5265, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4248, + "hfopenllm_v2/MMLU-PRO": 0.3379 + } + }, + { + "id": "hotmailuser/LlamaStock-8B", + "name": "LlamaStock-8B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.425, + "hfopenllm_v2/BBH": 0.5329, + "hfopenllm_v2/MATH Level 5": 0.1699, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.4129, + "hfopenllm_v2/MMLU-PRO": 0.3807 + } + }, + { + "id": "hotmailuser/Mistral-modelstock-24B", + "name": "Mistral-modelstock-24B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3424, + "hfopenllm_v2/BBH": 0.6452, + "hfopenllm_v2/MATH Level 5": 0.1307, + "hfopenllm_v2/GPQA": 0.4102, + "hfopenllm_v2/MUSR": 0.459, + "hfopenllm_v2/MMLU-PRO": 0.507 + } + }, + { + "id": "hotmailuser/Mistral-modelstock2-24B", + "name": "Mistral-modelstock2-24B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4318, + "hfopenllm_v2/BBH": 0.6689, + "hfopenllm_v2/MATH Level 5": 0.2402, + "hfopenllm_v2/GPQA": 0.3926, + "hfopenllm_v2/MUSR": 0.4616, + "hfopenllm_v2/MMLU-PRO": 0.5318 + } + }, + { + "id": "hotmailuser/Phi4-Slerp4-14B", + "name": "Phi4-Slerp4-14B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0629, + "hfopenllm_v2/BBH": 0.6731, + "hfopenllm_v2/MATH Level 5": 0.3474, + "hfopenllm_v2/GPQA": 0.3968, + "hfopenllm_v2/MUSR": 0.5097, + "hfopenllm_v2/MMLU-PRO": 0.5278 + } + }, + { + "id": "hotmailuser/Qwen2.5-HomerSlerp-7B", + "name": "Qwen2.5-HomerSlerp-7B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4488, + "hfopenllm_v2/BBH": 0.5633, + "hfopenllm_v2/MATH Level 5": 0.3316, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4383, + "hfopenllm_v2/MMLU-PRO": 0.4549 + } + }, + { + "id": "hotmailuser/QwenModelStock-1.8B", + "name": "QwenModelStock-1.8B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3263, + "hfopenllm_v2/BBH": 0.4188, + "hfopenllm_v2/MATH Level 5": 0.0989, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4359, + "hfopenllm_v2/MMLU-PRO": 0.2959 + } + }, + { + "id": "hotmailuser/QwenSlerp-14B", + "name": "QwenSlerp-14B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7025, + "hfopenllm_v2/BBH": 0.6491, + "hfopenllm_v2/MATH Level 5": 0.3837, + "hfopenllm_v2/GPQA": 0.3876, + "hfopenllm_v2/MUSR": 0.4634, + "hfopenllm_v2/MMLU-PRO": 0.54 + } + }, + { + "id": "hotmailuser/QwenSlerp-3B", + "name": "QwenSlerp-3B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4334, + "hfopenllm_v2/BBH": 0.4892, + "hfopenllm_v2/MATH Level 5": 0.2749, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4317, + "hfopenllm_v2/MMLU-PRO": 0.3693 + } + }, + { + "id": "hotmailuser/QwenSlerp-7B", + "name": "QwenSlerp-7B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4673, + "hfopenllm_v2/BBH": 0.5636, + "hfopenllm_v2/MATH Level 5": 0.3444, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.4409, + "hfopenllm_v2/MMLU-PRO": 0.4509 + } + }, + { + "id": "hotmailuser/QwenSlerp2-14B", + "name": "QwenSlerp2-14B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7037, + "hfopenllm_v2/BBH": 0.6493, + "hfopenllm_v2/MATH Level 5": 0.3965, + "hfopenllm_v2/GPQA": 0.3809, + "hfopenllm_v2/MUSR": 0.4807, + "hfopenllm_v2/MMLU-PRO": 0.5379 + } + }, + { + "id": "hotmailuser/QwenSlerp2-3B", + "name": "QwenSlerp2-3B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.428, + "hfopenllm_v2/BBH": 0.4802, + "hfopenllm_v2/MATH Level 5": 0.2606, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4252, + "hfopenllm_v2/MMLU-PRO": 0.3742 + } + }, + { + "id": "hotmailuser/QwenSlerp3-14B", + "name": "QwenSlerp3-14B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6632, + "hfopenllm_v2/BBH": 0.6267, + "hfopenllm_v2/MATH Level 5": 0.4305, + "hfopenllm_v2/GPQA": 0.3666, + "hfopenllm_v2/MUSR": 0.4808, + "hfopenllm_v2/MMLU-PRO": 0.5263 + } + }, + { + "id": "hotmailuser/QwenSparse-7B", + "name": "QwenSparse-7B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1086, + "hfopenllm_v2/BBH": 0.2896, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3562, + "hfopenllm_v2/MMLU-PRO": 0.1122 + } + }, + { + "id": "hotmailuser/QwenStock-0.5B", + "name": "QwenStock-0.5B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2049, + "hfopenllm_v2/BBH": 0.2912, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3575, + "hfopenllm_v2/MMLU-PRO": 0.1167 + } + }, + { + "id": "hotmailuser/QwenStock-1.7B", + "name": "QwenStock-1.7B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3214, + "hfopenllm_v2/BBH": 0.4188, + "hfopenllm_v2/MATH Level 5": 0.0997, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.4412, + "hfopenllm_v2/MMLU-PRO": 0.2955 + } + }, + { + "id": "hotmailuser/QwenStock1-14B", + "name": "QwenStock1-14B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6693, + "hfopenllm_v2/BBH": 0.6502, + "hfopenllm_v2/MATH Level 5": 0.3701, + "hfopenllm_v2/GPQA": 0.3859, + "hfopenllm_v2/MUSR": 0.4781, + "hfopenllm_v2/MMLU-PRO": 0.5416 + } + }, + { + "id": "hotmailuser/RombosBeagle-v2beta-MGS-32B", + "name": "RombosBeagle-v2beta-MGS-32B", + "developer": "hotmailuser", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5157, + "hfopenllm_v2/BBH": 0.7037, + "hfopenllm_v2/MATH Level 5": 0.4992, + "hfopenllm_v2/GPQA": 0.38, + "hfopenllm_v2/MUSR": 0.5021, + "hfopenllm_v2/MMLU-PRO": 0.5908 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/huggyllama.json b/data/developers/huggyllama.json new file mode 100644 index 0000000000000000000000000000000000000000..385671a5d7ddc07d1e6a7c21e6d5bed23eeb69da --- /dev/null +++ b/data/developers/huggyllama.json @@ -0,0 +1,47 @@ +{ + "developer": "huggyllama", + "models": [ + { + "id": "huggyllama/llama-13b", + "name": "llama-13b", + "developer": "huggyllama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2411, + "hfopenllm_v2/BBH": 0.3988, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.255, + "hfopenllm_v2/MUSR": 0.3462, + "hfopenllm_v2/MMLU-PRO": 0.1952 + } + }, + { + "id": "huggyllama/llama-65b", + "name": "llama-65b", + "developer": "huggyllama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2526, + "hfopenllm_v2/BBH": 0.4703, + "hfopenllm_v2/MATH Level 5": 0.031, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3595, + "hfopenllm_v2/MMLU-PRO": 0.3078 + } + }, + { + "id": "huggyllama/llama-7b", + "name": "llama-7b", + "developer": "huggyllama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2501, + "hfopenllm_v2/BBH": 0.3277, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3354, + "hfopenllm_v2/MMLU-PRO": 0.1313 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/huihui-ai.json b/data/developers/huihui-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..71829df2eb5bb4e60283834879a090832eb49947 --- /dev/null +++ b/data/developers/huihui-ai.json @@ -0,0 +1,117 @@ +{ + "developer": "huihui-ai", + "models": [ + { + "id": "huihui-ai/DeepSeek-R1-Distill-Qwen-14B-abliterated-v2", + "name": "DeepSeek-R1-Distill-Qwen-14B-abliterated-v2", + "developer": "huihui-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4211, + "hfopenllm_v2/BBH": 0.3487, + "hfopenllm_v2/MATH Level 5": 0.2205, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.4701, + "hfopenllm_v2/MMLU-PRO": 0.1915 + } + }, + { + "id": "huihui-ai/QwQ-32B-Coder-Fusion-7030", + "name": "QwQ-32B-Coder-Fusion-7030", + "developer": "huihui-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3865, + "hfopenllm_v2/BBH": 0.6178, + "hfopenllm_v2/MATH Level 5": 0.2795, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.3922, + "hfopenllm_v2/MMLU-PRO": 0.4368 + } + }, + { + "id": "huihui-ai/QwQ-32B-Coder-Fusion-8020", + "name": "QwQ-32B-Coder-Fusion-8020", + "developer": "huihui-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6021, + "hfopenllm_v2/BBH": 0.6665, + "hfopenllm_v2/MATH Level 5": 0.4592, + "hfopenllm_v2/GPQA": 0.3549, + "hfopenllm_v2/MUSR": 0.4293, + "hfopenllm_v2/MMLU-PRO": 0.5367 + } + }, + { + "id": "huihui-ai/QwQ-32B-Coder-Fusion-9010", + "name": "QwQ-32B-Coder-Fusion-9010", + "developer": "huihui-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5778, + "hfopenllm_v2/BBH": 0.6727, + "hfopenllm_v2/MATH Level 5": 0.5317, + "hfopenllm_v2/GPQA": 0.3616, + "hfopenllm_v2/MUSR": 0.4682, + "hfopenllm_v2/MMLU-PRO": 0.56 + } + }, + { + "id": "huihui-ai/Qwen2.5-14B-Instruct-abliterated-v2", + "name": "Qwen2.5-14B-Instruct-abliterated-v2", + "developer": "huihui-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8328, + "hfopenllm_v2/BBH": 0.6324, + "hfopenllm_v2/MATH Level 5": 0.5302, + "hfopenllm_v2/GPQA": 0.3339, + "hfopenllm_v2/MUSR": 0.422, + "hfopenllm_v2/MMLU-PRO": 0.4962 + } + }, + { + "id": "huihui-ai/Qwen2.5-72B-Instruct-abliterated", + "name": "Qwen2.5-72B-Instruct-abliterated", + "developer": "huihui-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8593, + "hfopenllm_v2/BBH": 0.719, + "hfopenllm_v2/MATH Level 5": 0.6012, + "hfopenllm_v2/GPQA": 0.3951, + "hfopenllm_v2/MUSR": 0.4233, + "hfopenllm_v2/MMLU-PRO": 0.5537 + } + }, + { + "id": "huihui-ai/Qwen2.5-7B-Instruct-abliterated", + "name": "Qwen2.5-7B-Instruct-abliterated", + "developer": "huihui-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7546, + "hfopenllm_v2/BBH": 0.5262, + "hfopenllm_v2/MATH Level 5": 0.4577, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.3967, + "hfopenllm_v2/MMLU-PRO": 0.418 + } + }, + { + "id": "huihui-ai/Qwen2.5-7B-Instruct-abliterated-v2", + "name": "Qwen2.5-7B-Instruct-abliterated-v2", + "developer": "huihui-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7606, + "hfopenllm_v2/BBH": 0.5377, + "hfopenllm_v2/MATH Level 5": 0.4637, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.3981, + "hfopenllm_v2/MMLU-PRO": 0.4208 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/huu-ontocord.json b/data/developers/huu-ontocord.json new file mode 100644 index 0000000000000000000000000000000000000000..55cf01e55b67d7b8aca6f1c18a51c194dc111f8a --- /dev/null +++ b/data/developers/huu-ontocord.json @@ -0,0 +1,19 @@ +{ + "developer": "huu-ontocord", + "models": [ + { + "id": "huu-ontocord/wide_3b_orpo_stage1.1-ss1-orpo3", + "name": "wide_3b_orpo_stage1.1-ss1-orpo3", + "developer": "huu-ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1505, + "hfopenllm_v2/BBH": 0.2937, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3618, + "hfopenllm_v2/MMLU-PRO": 0.1164 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/iFaz.json b/data/developers/iFaz.json new file mode 100644 index 0000000000000000000000000000000000000000..9664b16cfdd6fd46ee11e5e776b40f03fd77b55f --- /dev/null +++ b/data/developers/iFaz.json @@ -0,0 +1,117 @@ +{ + "developer": "iFaz", + "models": [ + { + "id": "iFaz/llama31_8B_en_emo_v4", + "name": "llama31_8B_en_emo_v4", + "developer": "iFaz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3043, + "hfopenllm_v2/BBH": 0.4916, + "hfopenllm_v2/MATH Level 5": 0.0884, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.3643, + "hfopenllm_v2/MMLU-PRO": 0.3049 + } + }, + { + "id": "iFaz/llama32_1B_en_emo_v1", + "name": "llama32_1B_en_emo_v1", + "developer": "iFaz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4408, + "hfopenllm_v2/BBH": 0.338, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3489, + "hfopenllm_v2/MMLU-PRO": 0.1761 + } + }, + { + "id": "iFaz/llama32_3B_en_emo_1000_stp", + "name": "llama32_3B_en_emo_1000_stp", + "developer": "iFaz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7295, + "hfopenllm_v2/BBH": 0.4522, + "hfopenllm_v2/MATH Level 5": 0.1465, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3621, + "hfopenllm_v2/MMLU-PRO": 0.3123 + } + }, + { + "id": "iFaz/llama32_3B_en_emo_2000_stp", + "name": "llama32_3B_en_emo_2000_stp", + "developer": "iFaz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7369, + "hfopenllm_v2/BBH": 0.4535, + "hfopenllm_v2/MATH Level 5": 0.1533, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3527, + "hfopenllm_v2/MMLU-PRO": 0.3098 + } + }, + { + "id": "iFaz/llama32_3B_en_emo_300_stp", + "name": "llama32_3B_en_emo_300_stp", + "developer": "iFaz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7256, + "hfopenllm_v2/BBH": 0.4505, + "hfopenllm_v2/MATH Level 5": 0.1601, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3621, + "hfopenllm_v2/MMLU-PRO": 0.3148 + } + }, + { + "id": "iFaz/llama32_3B_en_emo_5000_stp", + "name": "llama32_3B_en_emo_5000_stp", + "developer": "iFaz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.71, + "hfopenllm_v2/BBH": 0.4568, + "hfopenllm_v2/MATH Level 5": 0.1292, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3446, + "hfopenllm_v2/MMLU-PRO": 0.3067 + } + }, + { + "id": "iFaz/llama32_3B_en_emo_v2", + "name": "llama32_3B_en_emo_v2", + "developer": "iFaz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5454, + "hfopenllm_v2/BBH": 0.4284, + "hfopenllm_v2/MATH Level 5": 0.1088, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3482, + "hfopenllm_v2/MMLU-PRO": 0.3004 + } + }, + { + "id": "iFaz/llama32_3B_en_emo_v3", + "name": "llama32_3B_en_emo_v3", + "developer": "iFaz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5759, + "hfopenllm_v2/BBH": 0.4301, + "hfopenllm_v2/MATH Level 5": 0.068, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3553, + "hfopenllm_v2/MMLU-PRO": 0.271 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/iRyanBell.json b/data/developers/iRyanBell.json new file mode 100644 index 0000000000000000000000000000000000000000..2c50d54f05efbf1155feaa57a759bc15e4315d5a --- /dev/null +++ b/data/developers/iRyanBell.json @@ -0,0 +1,33 @@ +{ + "developer": "iRyanBell", + "models": [ + { + "id": "iRyanBell/ARC1", + "name": "ARC1", + "developer": "iRyanBell", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4411, + "hfopenllm_v2/BBH": 0.4903, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3991, + "hfopenllm_v2/MMLU-PRO": 0.3371 + } + }, + { + "id": "iRyanBell/ARC1-II", + "name": "ARC1-II", + "developer": "iRyanBell", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1708, + "hfopenllm_v2/BBH": 0.3382, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.4913, + "hfopenllm_v2/MMLU-PRO": 0.1686 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ibivibiv.json b/data/developers/ibivibiv.json new file mode 100644 index 0000000000000000000000000000000000000000..8269b8ed59cb84b54b31344a04348454354da164 --- /dev/null +++ b/data/developers/ibivibiv.json @@ -0,0 +1,33 @@ +{ + "developer": "ibivibiv", + "models": [ + { + "id": "ibivibiv/colossus_120b", + "name": "colossus_120b", + "developer": "ibivibiv", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4276, + "hfopenllm_v2/BBH": 0.6061, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4733, + "hfopenllm_v2/MMLU-PRO": 0.3961 + } + }, + { + "id": "ibivibiv/multimaster-7b-v6", + "name": "multimaster-7b-v6", + "developer": "ibivibiv", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4473, + "hfopenllm_v2/BBH": 0.5194, + "hfopenllm_v2/MATH Level 5": 0.0559, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4396, + "hfopenllm_v2/MMLU-PRO": 0.3095 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ibm-granite.json b/data/developers/ibm-granite.json new file mode 100644 index 0000000000000000000000000000000000000000..cbdb43757d126217fef481bf71c9969a6e78a083 --- /dev/null +++ b/data/developers/ibm-granite.json @@ -0,0 +1,285 @@ +{ + "developer": "ibm-granite", + "models": [ + { + "id": "ibm-granite/granite-3.0-1b-a400m-base", + "name": "granite-3.0-1b-a400m-base", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2404, + "hfopenllm_v2/BBH": 0.3221, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2475, + "hfopenllm_v2/MUSR": 0.3367, + "hfopenllm_v2/MMLU-PRO": 0.1152 + } + }, + { + "id": "ibm-granite/granite-3.0-1b-a400m-instruct", + "name": "granite-3.0-1b-a400m-instruct", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3332, + "hfopenllm_v2/BBH": 0.3224, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3623, + "hfopenllm_v2/MMLU-PRO": 0.1244 + } + }, + { + "id": "ibm-granite/granite-3.0-2b-base", + "name": "granite-3.0-2b-base", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3874, + "hfopenllm_v2/BBH": 0.4047, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3434, + "hfopenllm_v2/MMLU-PRO": 0.2381 + } + }, + { + "id": "ibm-granite/granite-3.0-2b-instruct", + "name": "granite-3.0-2b-instruct", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.514, + "hfopenllm_v2/BBH": 0.4412, + "hfopenllm_v2/MATH Level 5": 0.0921, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.3515, + "hfopenllm_v2/MMLU-PRO": 0.2814 + } + }, + { + "id": "ibm-granite/granite-3.0-3b-a800m-base", + "name": "granite-3.0-3b-a800m-base", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2732, + "hfopenllm_v2/BBH": 0.3667, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.342, + "hfopenllm_v2/MMLU-PRO": 0.1891 + } + }, + { + "id": "ibm-granite/granite-3.0-3b-a800m-instruct", + "name": "granite-3.0-3b-a800m-instruct", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4298, + "hfopenllm_v2/BBH": 0.3753, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3487, + "hfopenllm_v2/MMLU-PRO": 0.2152 + } + }, + { + "id": "ibm-granite/granite-3.0-8b-base", + "name": "granite-3.0-8b-base", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4583, + "hfopenllm_v2/BBH": 0.4944, + "hfopenllm_v2/MATH Level 5": 0.1012, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.4081, + "hfopenllm_v2/MMLU-PRO": 0.3313 + } + }, + { + "id": "ibm-granite/granite-3.0-8b-instruct", + "name": "granite-3.0-8b-instruct", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.531, + "hfopenllm_v2/BBH": 0.5192, + "hfopenllm_v2/MATH Level 5": 0.142, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.3901, + "hfopenllm_v2/MMLU-PRO": 0.3457 + } + }, + { + "id": "ibm-granite/granite-3.1-1b-a400m-base", + "name": "granite-3.1-1b-a400m-base", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2519, + "hfopenllm_v2/BBH": 0.3299, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3501, + "hfopenllm_v2/MMLU-PRO": 0.1139 + } + }, + { + "id": "ibm-granite/granite-3.1-1b-a400m-instruct", + "name": "granite-3.1-1b-a400m-instruct", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4686, + "hfopenllm_v2/BBH": 0.328, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2399, + "hfopenllm_v2/MUSR": 0.3302, + "hfopenllm_v2/MMLU-PRO": 0.1217 + } + }, + { + "id": "ibm-granite/granite-3.1-2b-base", + "name": "granite-3.1-2b-base", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3522, + "hfopenllm_v2/BBH": 0.4047, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3486, + "hfopenllm_v2/MMLU-PRO": 0.2251 + } + }, + { + "id": "ibm-granite/granite-3.1-2b-instruct", + "name": "granite-3.1-2b-instruct", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6286, + "hfopenllm_v2/BBH": 0.4409, + "hfopenllm_v2/MATH Level 5": 0.1526, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3605, + "hfopenllm_v2/MMLU-PRO": 0.2819 + } + }, + { + "id": "ibm-granite/granite-3.1-3b-a800m-base", + "name": "granite-3.1-3b-a800m-base", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2996, + "hfopenllm_v2/BBH": 0.3628, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3275, + "hfopenllm_v2/MMLU-PRO": 0.1793 + } + }, + { + "id": "ibm-granite/granite-3.1-3b-a800m-instruct", + "name": "granite-3.1-3b-a800m-instruct", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5516, + "hfopenllm_v2/BBH": 0.4009, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3486, + "hfopenllm_v2/MMLU-PRO": 0.2148 + } + }, + { + "id": "ibm-granite/granite-3.1-8b-base", + "name": "granite-3.1-8b-base", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4221, + "hfopenllm_v2/BBH": 0.4777, + "hfopenllm_v2/MATH Level 5": 0.0944, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.3922, + "hfopenllm_v2/MMLU-PRO": 0.3232 + } + }, + { + "id": "ibm-granite/granite-3.1-8b-instruct", + "name": "granite-3.1-8b-instruct", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7208, + "hfopenllm_v2/BBH": 0.5364, + "hfopenllm_v2/MATH Level 5": 0.2198, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4707, + "hfopenllm_v2/MMLU-PRO": 0.3537 + } + }, + { + "id": "ibm-granite/granite-3.2-2b-instruct", + "name": "granite-3.2-2b-instruct", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6152, + "hfopenllm_v2/BBH": 0.4387, + "hfopenllm_v2/MATH Level 5": 0.1443, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3646, + "hfopenllm_v2/MMLU-PRO": 0.2783 + } + }, + { + "id": "ibm-granite/granite-3.2-8b-instruct", + "name": "granite-3.2-8b-instruct", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7275, + "hfopenllm_v2/BBH": 0.5402, + "hfopenllm_v2/MATH Level 5": 0.2379, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.4562, + "hfopenllm_v2/MMLU-PRO": 0.3512 + } + }, + { + "id": "ibm-granite/granite-7b-base", + "name": "granite-7b-base", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2414, + "hfopenllm_v2/BBH": 0.348, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.3555, + "hfopenllm_v2/MMLU-PRO": 0.1834 + } + }, + { + "id": "ibm-granite/granite-7b-instruct", + "name": "granite-7b-instruct", + "developer": "ibm-granite", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2972, + "hfopenllm_v2/BBH": 0.3723, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.402, + "hfopenllm_v2/MMLU-PRO": 0.2286 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ibm.json b/data/developers/ibm.json new file mode 100644 index 0000000000000000000000000000000000000000..f14b58c513caa7f4dc439ffd8a168edd0740f276 --- /dev/null +++ b/data/developers/ibm.json @@ -0,0 +1,74 @@ +{ + "developer": "ibm", + "models": [ + { + "id": "ibm/PowerLM-3b", + "name": "PowerLM-3b", + "developer": "ibm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3321, + "hfopenllm_v2/BBH": 0.3679, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3563, + "hfopenllm_v2/MMLU-PRO": 0.2016 + } + }, + { + "id": "ibm/granite-3.3-8b-instruct", + "name": "IBM Granite 3.3 8B Instruct", + "developer": "ibm", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.463, + "helm_capabilities/MMLU-Pro": 0.343, + "helm_capabilities/GPQA": 0.325, + "helm_capabilities/IFEval": 0.729, + "helm_capabilities/WildBench": 0.741, + "helm_capabilities/Omni-MATH": 0.176 + } + }, + { + "id": "ibm/granite-4.0-h-small", + "name": "granite-4.0-h-small", + "developer": "ibm", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.7503, + "global-mmlu-lite/Culturally Sensitive": 0.7182, + "global-mmlu-lite/Culturally Agnostic": 0.7826, + "global-mmlu-lite/Arabic": 0.7613, + "global-mmlu-lite/English": 0.77, + "global-mmlu-lite/Bengali": 0.7613, + "global-mmlu-lite/German": 0.755, + "global-mmlu-lite/French": 0.7594, + "global-mmlu-lite/Hindi": 0.7575, + "global-mmlu-lite/Indonesian": 0.7614, + "global-mmlu-lite/Italian": 0.7525, + "global-mmlu-lite/Japanese": 0.7406, + "global-mmlu-lite/Korean": 0.7525, + "global-mmlu-lite/Portuguese": 0.757, + "global-mmlu-lite/Spanish": 0.7638, + "global-mmlu-lite/Swahili": 0.7318, + "global-mmlu-lite/Yoruba": 0.6921, + "global-mmlu-lite/Chinese": 0.7475, + "global-mmlu-lite/Burmese": 0.7419 + } + }, + { + "id": "ibm/merlinite-7b", + "name": "merlinite-7b", + "developer": "ibm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2499, + "hfopenllm_v2/BBH": 0.5007, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4412, + "hfopenllm_v2/MMLU-PRO": 0.3068 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/icefog72.json b/data/developers/icefog72.json new file mode 100644 index 0000000000000000000000000000000000000000..0ea5ae912613ba6349dc7eed95da5f8d3afd5dbe --- /dev/null +++ b/data/developers/icefog72.json @@ -0,0 +1,873 @@ +{ + "developer": "icefog72", + "models": [ + { + "id": "icefog72/Ice0.15-02.10-RP", + "name": "Ice0.15-02.10-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5343, + "hfopenllm_v2/BBH": 0.4976, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.432, + "hfopenllm_v2/MMLU-PRO": 0.3066 + } + }, + { + "id": "icefog72/Ice0.16-02.10-RP", + "name": "Ice0.16-02.10-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5069, + "hfopenllm_v2/BBH": 0.4946, + "hfopenllm_v2/MATH Level 5": 0.0589, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.4334, + "hfopenllm_v2/MMLU-PRO": 0.3068 + } + }, + { + "id": "icefog72/Ice0.17-03.10-RP", + "name": "Ice0.17-03.10-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5124, + "hfopenllm_v2/BBH": 0.5007, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4334, + "hfopenllm_v2/MMLU-PRO": 0.3085 + } + }, + { + "id": "icefog72/Ice0.27-06.11-RP", + "name": "Ice0.27-06.11-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4918, + "hfopenllm_v2/BBH": 0.5112, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4328, + "hfopenllm_v2/MMLU-PRO": 0.3154 + } + }, + { + "id": "icefog72/Ice0.29-06.11-RP", + "name": "Ice0.29-06.11-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4861, + "hfopenllm_v2/BBH": 0.5088, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4459, + "hfopenllm_v2/MMLU-PRO": 0.3093 + } + }, + { + "id": "icefog72/Ice0.31-08.11-RP", + "name": "Ice0.31-08.11-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5146, + "hfopenllm_v2/BBH": 0.5032, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4277, + "hfopenllm_v2/MMLU-PRO": 0.3131 + } + }, + { + "id": "icefog72/Ice0.32-10.11-RP", + "name": "Ice0.32-10.11-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4915, + "hfopenllm_v2/BBH": 0.5048, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4382, + "hfopenllm_v2/MMLU-PRO": 0.31 + } + }, + { + "id": "icefog72/Ice0.34b-14.11-RP", + "name": "Ice0.34b-14.11-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4762, + "hfopenllm_v2/BBH": 0.5067, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.442, + "hfopenllm_v2/MMLU-PRO": 0.3125 + } + }, + { + "id": "icefog72/Ice0.34n-14.11-RP", + "name": "Ice0.34n-14.11-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4787, + "hfopenllm_v2/BBH": 0.5091, + "hfopenllm_v2/MATH Level 5": 0.0725, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.438, + "hfopenllm_v2/MMLU-PRO": 0.3124 + } + }, + { + "id": "icefog72/Ice0.37-18.11-RP", + "name": "Ice0.37-18.11-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4972, + "hfopenllm_v2/BBH": 0.5084, + "hfopenllm_v2/MATH Level 5": 0.0642, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4339, + "hfopenllm_v2/MMLU-PRO": 0.3143 + } + }, + { + "id": "icefog72/Ice0.38-19.11-RP", + "name": "Ice0.38-19.11-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4403, + "hfopenllm_v2/BBH": 0.5101, + "hfopenllm_v2/MATH Level 5": 0.0551, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4367, + "hfopenllm_v2/MMLU-PRO": 0.314 + } + }, + { + "id": "icefog72/Ice0.39-19.11-RP", + "name": "Ice0.39-19.11-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4757, + "hfopenllm_v2/BBH": 0.5093, + "hfopenllm_v2/MATH Level 5": 0.0498, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4341, + "hfopenllm_v2/MMLU-PRO": 0.3127 + } + }, + { + "id": "icefog72/Ice0.40-20.11-RP", + "name": "Ice0.40-20.11-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4763, + "hfopenllm_v2/BBH": 0.5093, + "hfopenllm_v2/MATH Level 5": 0.0642, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4446, + "hfopenllm_v2/MMLU-PRO": 0.3099 + } + }, + { + "id": "icefog72/Ice0.41-22.11-RP", + "name": "Ice0.41-22.11-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.462, + "hfopenllm_v2/BBH": 0.4723, + "hfopenllm_v2/MATH Level 5": 0.031, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.456, + "hfopenllm_v2/MMLU-PRO": 0.2618 + } + }, + { + "id": "icefog72/Ice0.50-16.01-RP", + "name": "Ice0.50-16.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4385, + "hfopenllm_v2/BBH": 0.498, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4381, + "hfopenllm_v2/MMLU-PRO": 0.3069 + } + }, + { + "id": "icefog72/Ice0.50.1-16.01-RP", + "name": "Ice0.50.1-16.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4829, + "hfopenllm_v2/BBH": 0.5107, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4327, + "hfopenllm_v2/MMLU-PRO": 0.3132 + } + }, + { + "id": "icefog72/Ice0.51-16.01-RP", + "name": "Ice0.51-16.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4431, + "hfopenllm_v2/BBH": 0.5044, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4437, + "hfopenllm_v2/MMLU-PRO": 0.306 + } + }, + { + "id": "icefog72/Ice0.51.1-16.01-RP", + "name": "Ice0.51.1-16.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4573, + "hfopenllm_v2/BBH": 0.5121, + "hfopenllm_v2/MATH Level 5": 0.0642, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4394, + "hfopenllm_v2/MMLU-PRO": 0.3104 + } + }, + { + "id": "icefog72/Ice0.52-16.01-RP", + "name": "Ice0.52-16.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4503, + "hfopenllm_v2/BBH": 0.5047, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4396, + "hfopenllm_v2/MMLU-PRO": 0.308 + } + }, + { + "id": "icefog72/Ice0.52.1-16.01-RP", + "name": "Ice0.52.1-16.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4549, + "hfopenllm_v2/BBH": 0.5106, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4394, + "hfopenllm_v2/MMLU-PRO": 0.3105 + } + }, + { + "id": "icefog72/Ice0.53-16.01-RP", + "name": "Ice0.53-16.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4741, + "hfopenllm_v2/BBH": 0.5102, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4327, + "hfopenllm_v2/MMLU-PRO": 0.313 + } + }, + { + "id": "icefog72/Ice0.54-17.01-RP", + "name": "Ice0.54-17.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4379, + "hfopenllm_v2/BBH": 0.4853, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4874, + "hfopenllm_v2/MMLU-PRO": 0.2326 + } + }, + { + "id": "icefog72/Ice0.55-17.01-RP", + "name": "Ice0.55-17.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4961, + "hfopenllm_v2/BBH": 0.5077, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4725, + "hfopenllm_v2/MMLU-PRO": 0.2658 + } + }, + { + "id": "icefog72/Ice0.57-17.01-RP", + "name": "Ice0.57-17.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5152, + "hfopenllm_v2/BBH": 0.5064, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4686, + "hfopenllm_v2/MMLU-PRO": 0.2651 + } + }, + { + "id": "icefog72/Ice0.60-18.01-RP", + "name": "Ice0.60-18.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5374, + "hfopenllm_v2/BBH": 0.5094, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.467, + "hfopenllm_v2/MMLU-PRO": 0.2837 + } + }, + { + "id": "icefog72/Ice0.60.1-18.01-RP", + "name": "Ice0.60.1-18.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5188, + "hfopenllm_v2/BBH": 0.512, + "hfopenllm_v2/MATH Level 5": 0.0461, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4498, + "hfopenllm_v2/MMLU-PRO": 0.2914 + } + }, + { + "id": "icefog72/Ice0.61-18.01-RP", + "name": "Ice0.61-18.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5441, + "hfopenllm_v2/BBH": 0.5105, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4697, + "hfopenllm_v2/MMLU-PRO": 0.2709 + } + }, + { + "id": "icefog72/Ice0.62-18.01-RP", + "name": "Ice0.62-18.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5367, + "hfopenllm_v2/BBH": 0.5103, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4538, + "hfopenllm_v2/MMLU-PRO": 0.2877 + } + }, + { + "id": "icefog72/Ice0.62.1-24.01-RP", + "name": "Ice0.62.1-24.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5182, + "hfopenllm_v2/BBH": 0.5109, + "hfopenllm_v2/MATH Level 5": 0.0559, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4551, + "hfopenllm_v2/MMLU-PRO": 0.2871 + } + }, + { + "id": "icefog72/Ice0.64-24.01-RP", + "name": "Ice0.64-24.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5441, + "hfopenllm_v2/BBH": 0.506, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.462, + "hfopenllm_v2/MMLU-PRO": 0.2933 + } + }, + { + "id": "icefog72/Ice0.64.1-24.01-RP", + "name": "Ice0.64.1-24.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5447, + "hfopenllm_v2/BBH": 0.506, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.462, + "hfopenllm_v2/MMLU-PRO": 0.2933 + } + }, + { + "id": "icefog72/Ice0.65-25.01-RP", + "name": "Ice0.65-25.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5029, + "hfopenllm_v2/BBH": 0.5096, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.434, + "hfopenllm_v2/MMLU-PRO": 0.2997 + } + }, + { + "id": "icefog72/Ice0.66-25.01-RP", + "name": "Ice0.66-25.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5325, + "hfopenllm_v2/BBH": 0.5129, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4434, + "hfopenllm_v2/MMLU-PRO": 0.3039 + } + }, + { + "id": "icefog72/Ice0.67-25.01-RP", + "name": "Ice0.67-25.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5361, + "hfopenllm_v2/BBH": 0.5113, + "hfopenllm_v2/MATH Level 5": 0.0748, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4279, + "hfopenllm_v2/MMLU-PRO": 0.3097 + } + }, + { + "id": "icefog72/Ice0.68-25.01-RP", + "name": "Ice0.68-25.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5514, + "hfopenllm_v2/BBH": 0.513, + "hfopenllm_v2/MATH Level 5": 0.0725, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4446, + "hfopenllm_v2/MMLU-PRO": 0.3012 + } + }, + { + "id": "icefog72/Ice0.69-25.01-RP", + "name": "Ice0.69-25.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5438, + "hfopenllm_v2/BBH": 0.5098, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4486, + "hfopenllm_v2/MMLU-PRO": 0.2965 + } + }, + { + "id": "icefog72/Ice0.7-29.09-RP", + "name": "Ice0.7-29.09-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5176, + "hfopenllm_v2/BBH": 0.5048, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.4238, + "hfopenllm_v2/MMLU-PRO": 0.3127 + } + }, + { + "id": "icefog72/Ice0.70-25.01-RP", + "name": "Ice0.70-25.01-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5498, + "hfopenllm_v2/BBH": 0.5136, + "hfopenllm_v2/MATH Level 5": 0.0597, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4512, + "hfopenllm_v2/MMLU-PRO": 0.2996 + } + }, + { + "id": "icefog72/Ice0.70.1-01.02-RP", + "name": "Ice0.70.1-01.02-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.507, + "hfopenllm_v2/BBH": 0.506, + "hfopenllm_v2/MATH Level 5": 0.034, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4599, + "hfopenllm_v2/MMLU-PRO": 0.2749 + } + }, + { + "id": "icefog72/Ice0.73-01.02-RP", + "name": "Ice0.73-01.02-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5292, + "hfopenllm_v2/BBH": 0.5103, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4664, + "hfopenllm_v2/MMLU-PRO": 0.2702 + } + }, + { + "id": "icefog72/Ice0.74-02.02-RP", + "name": "Ice0.74-02.02-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2935, + "hfopenllm_v2/BBH": 0.4646, + "hfopenllm_v2/MATH Level 5": 0.0015, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.428, + "hfopenllm_v2/MMLU-PRO": 0.2143 + } + }, + { + "id": "icefog72/Ice0.76-02.02-RP", + "name": "Ice0.76-02.02-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4529, + "hfopenllm_v2/BBH": 0.5086, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4362, + "hfopenllm_v2/MMLU-PRO": 0.2652 + } + }, + { + "id": "icefog72/Ice0.77-02.02-RP", + "name": "Ice0.77-02.02-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.531, + "hfopenllm_v2/BBH": 0.5109, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4765, + "hfopenllm_v2/MMLU-PRO": 0.2999 + } + }, + { + "id": "icefog72/Ice0.78-02.02-RP", + "name": "Ice0.78-02.02-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4053, + "hfopenllm_v2/BBH": 0.5002, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4686, + "hfopenllm_v2/MMLU-PRO": 0.2955 + } + }, + { + "id": "icefog72/Ice0.80-03.02-RP", + "name": "Ice0.80-03.02-RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5516, + "hfopenllm_v2/BBH": 0.5098, + "hfopenllm_v2/MATH Level 5": 0.0559, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.4923, + "hfopenllm_v2/MMLU-PRO": 0.2912 + } + }, + { + "id": "icefog72/IceCocoaRP-7b", + "name": "IceCocoaRP-7b", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4962, + "hfopenllm_v2/BBH": 0.4938, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4198, + "hfopenllm_v2/MMLU-PRO": 0.3098 + } + }, + { + "id": "icefog72/IceCoffeeRP-7b", + "name": "IceCoffeeRP-7b", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4959, + "hfopenllm_v2/BBH": 0.4889, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.416, + "hfopenllm_v2/MMLU-PRO": 0.2975 + } + }, + { + "id": "icefog72/IceDrinkByFrankensteinV3RP", + "name": "IceDrinkByFrankensteinV3RP", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4975, + "hfopenllm_v2/BBH": 0.4833, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.4253, + "hfopenllm_v2/MMLU-PRO": 0.2927 + } + }, + { + "id": "icefog72/IceDrinkNameGoesHereRP-7b-Model_Stock", + "name": "IceDrinkNameGoesHereRP-7b-Model_Stock", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4968, + "hfopenllm_v2/BBH": 0.4658, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.4067, + "hfopenllm_v2/MMLU-PRO": 0.2817 + } + }, + { + "id": "icefog72/IceDrinkNameNotFoundRP-7b-Model_Stock", + "name": "IceDrinkNameNotFoundRP-7b-Model_Stock", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.513, + "hfopenllm_v2/BBH": 0.5026, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4372, + "hfopenllm_v2/MMLU-PRO": 0.3064 + } + }, + { + "id": "icefog72/IceDrunkCherryRP-7b", + "name": "IceDrunkCherryRP-7b", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4898, + "hfopenllm_v2/BBH": 0.4847, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.4292, + "hfopenllm_v2/MMLU-PRO": 0.3009 + } + }, + { + "id": "icefog72/IceDrunkenCherryRP-7b", + "name": "IceDrunkenCherryRP-7b", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4763, + "hfopenllm_v2/BBH": 0.5093, + "hfopenllm_v2/MATH Level 5": 0.0642, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4446, + "hfopenllm_v2/MMLU-PRO": 0.3099 + } + }, + { + "id": "icefog72/IceEspressoRPv2-7b", + "name": "IceEspressoRPv2-7b", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4977, + "hfopenllm_v2/BBH": 0.5055, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4331, + "hfopenllm_v2/MMLU-PRO": 0.3061 + } + }, + { + "id": "icefog72/IceLemonTeaRP-32k-7b", + "name": "IceLemonTeaRP-32k-7b", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5212, + "hfopenllm_v2/BBH": 0.4997, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.429, + "hfopenllm_v2/MMLU-PRO": 0.3068 + } + }, + { + "id": "icefog72/IceMartiniRP-7b", + "name": "IceMartiniRP-7b", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5045, + "hfopenllm_v2/BBH": 0.4972, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.4345, + "hfopenllm_v2/MMLU-PRO": 0.3073 + } + }, + { + "id": "icefog72/IceNalyvkaRP-7b", + "name": "IceNalyvkaRP-7b", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5498, + "hfopenllm_v2/BBH": 0.5136, + "hfopenllm_v2/MATH Level 5": 0.0597, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4512, + "hfopenllm_v2/MMLU-PRO": 0.2996 + } + }, + { + "id": "icefog72/IceSakeRP-7b", + "name": "IceSakeRP-7b", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5228, + "hfopenllm_v2/BBH": 0.5119, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.413, + "hfopenllm_v2/MMLU-PRO": 0.3177 + } + }, + { + "id": "icefog72/IceSakeV4RP-7b", + "name": "IceSakeV4RP-7b", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4634, + "hfopenllm_v2/BBH": 0.493, + "hfopenllm_v2/MATH Level 5": 0.0559, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4082, + "hfopenllm_v2/MMLU-PRO": 0.3103 + } + }, + { + "id": "icefog72/IceSakeV6RP-7b", + "name": "IceSakeV6RP-7b", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5033, + "hfopenllm_v2/BBH": 0.4976, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.42, + "hfopenllm_v2/MMLU-PRO": 0.3093 + } + }, + { + "id": "icefog72/IceSakeV8RP-7b", + "name": "IceSakeV8RP-7b", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6086, + "hfopenllm_v2/BBH": 0.4885, + "hfopenllm_v2/MATH Level 5": 0.0597, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3993, + "hfopenllm_v2/MMLU-PRO": 0.301 + } + }, + { + "id": "icefog72/IceTea21EnergyDrinkRPV13-DPOv3", + "name": "IceTea21EnergyDrinkRPV13-DPOv3", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5263, + "hfopenllm_v2/BBH": 0.502, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4372, + "hfopenllm_v2/MMLU-PRO": 0.3056 + } + }, + { + "id": "icefog72/IceTea21EnergyDrinkRPV13-DPOv3.5", + "name": "IceTea21EnergyDrinkRPV13-DPOv3.5", + "developer": "icefog72", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4871, + "hfopenllm_v2/BBH": 0.44, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.3964, + "hfopenllm_v2/MMLU-PRO": 0.2498 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ifable.json b/data/developers/ifable.json new file mode 100644 index 0000000000000000000000000000000000000000..601fc025187d79b202e9e36eb8bfbcdb29f13555 --- /dev/null +++ b/data/developers/ifable.json @@ -0,0 +1,19 @@ +{ + "developer": "ifable", + "models": [ + { + "id": "ifable/gemma-2-Ifable-9B", + "name": "gemma-2-Ifable-9B", + "developer": "ifable", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2984, + "hfopenllm_v2/BBH": 0.5866, + "hfopenllm_v2/MATH Level 5": 0.1397, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.4053, + "hfopenllm_v2/MMLU-PRO": 0.4226 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ilsp.json b/data/developers/ilsp.json new file mode 100644 index 0000000000000000000000000000000000000000..7f7f270df6cb334f203b0a9cc17b7bbc4d4faef9 --- /dev/null +++ b/data/developers/ilsp.json @@ -0,0 +1,19 @@ +{ + "developer": "ilsp", + "models": [ + { + "id": "ilsp/Llama-Krikri-8B-Instruct", + "name": "Llama-Krikri-8B-Instruct", + "developer": "ilsp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6079, + "hfopenllm_v2/BBH": 0.5047, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.408, + "hfopenllm_v2/MMLU-PRO": 0.3313 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/inflatebot.json b/data/developers/inflatebot.json new file mode 100644 index 0000000000000000000000000000000000000000..eabe2eefaddd0757e69dc56209cfd6c076a5ca10 --- /dev/null +++ b/data/developers/inflatebot.json @@ -0,0 +1,19 @@ +{ + "developer": "inflatebot", + "models": [ + { + "id": "inflatebot/MN-12B-Mag-Mell-R1", + "name": "MN-12B-Mag-Mell-R1", + "developer": "inflatebot", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4613, + "hfopenllm_v2/BBH": 0.5304, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4002, + "hfopenllm_v2/MMLU-PRO": 0.3438 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/infly.json b/data/developers/infly.json new file mode 100644 index 0000000000000000000000000000000000000000..d497bf1e2632542284e99f21127cba81c8ed1b97 --- /dev/null +++ b/data/developers/infly.json @@ -0,0 +1,23 @@ +{ + "developer": "infly", + "models": [ + { + "id": "infly/INF-ORM-Llama3.1-70B", + "name": "infly/INF-ORM-Llama3.1-70B", + "developer": "infly", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9511, + "reward-bench/Factuality": 0.7411, + "reward-bench/Precise IF": 0.4188, + "reward-bench/Math": 0.6995, + "reward-bench/Safety": 0.9365, + "reward-bench/Focus": 0.903, + "reward-bench/Ties": 0.8622, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.9101, + "reward-bench/Reasoning": 0.9912 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/informatiker.json b/data/developers/informatiker.json new file mode 100644 index 0000000000000000000000000000000000000000..431024c422c6128cfc084f87f0fbeb88ae3dc94b --- /dev/null +++ b/data/developers/informatiker.json @@ -0,0 +1,19 @@ +{ + "developer": "informatiker", + "models": [ + { + "id": "informatiker/Qwen2-7B-Instruct-abliterated", + "name": "Qwen2-7B-Instruct-abliterated", + "developer": "informatiker", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5822, + "hfopenllm_v2/BBH": 0.5534, + "hfopenllm_v2/MATH Level 5": 0.2636, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.3888, + "hfopenllm_v2/MMLU-PRO": 0.3873 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/insightfactory.json b/data/developers/insightfactory.json new file mode 100644 index 0000000000000000000000000000000000000000..19234d170aa685994456d0cd6208d08d15380df9 --- /dev/null +++ b/data/developers/insightfactory.json @@ -0,0 +1,19 @@ +{ + "developer": "insightfactory", + "models": [ + { + "id": "insightfactory/Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model", + "name": "Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model", + "developer": "insightfactory", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4588, + "hfopenllm_v2/BBH": 0.4146, + "hfopenllm_v2/MATH Level 5": 0.105, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3499, + "hfopenllm_v2/MMLU-PRO": 0.296 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/instruction-pretrain.json b/data/developers/instruction-pretrain.json new file mode 100644 index 0000000000000000000000000000000000000000..9af8328ba70698314b25fb0bbf55b956185c2412 --- /dev/null +++ b/data/developers/instruction-pretrain.json @@ -0,0 +1,19 @@ +{ + "developer": "instruction-pretrain", + "models": [ + { + "id": "instruction-pretrain/InstructLM-500M", + "name": "InstructLM-500M", + "developer": "instruction-pretrain", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1028, + "hfopenllm_v2/BBH": 0.2941, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3528, + "hfopenllm_v2/MMLU-PRO": 0.1141 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/internlm.json b/data/developers/internlm.json new file mode 100644 index 0000000000000000000000000000000000000000..69708dbd584389b0c656d19a5340943c92e82210 --- /dev/null +++ b/data/developers/internlm.json @@ -0,0 +1,143 @@ +{ + "developer": "internlm", + "models": [ + { + "id": "internlm/internlm2-1_8b", + "name": "internlm2-1_8b", + "developer": "internlm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2198, + "hfopenllm_v2/BBH": 0.388, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.3813, + "hfopenllm_v2/MMLU-PRO": 0.1588 + } + }, + { + "id": "internlm/internlm2-1_8b-reward", + "name": "internlm/internlm2-1_8b-reward", + "developer": "internlm", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.3902, + "reward-bench/Chat": 0.9358, + "reward-bench/Chat Hard": 0.6623, + "reward-bench/Safety": 0.4711, + "reward-bench/Reasoning": 0.8724, + "reward-bench/Factuality": 0.2758, + "reward-bench/Precise IF": 0.3625, + "reward-bench/Math": 0.4426, + "reward-bench/Focus": 0.596, + "reward-bench/Ties": 0.1934 + } + }, + { + "id": "internlm/internlm2-20b-reward", + "name": "internlm/internlm2-20b-reward", + "developer": "internlm", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5628, + "reward-bench/Chat": 0.9888, + "reward-bench/Chat Hard": 0.7654, + "reward-bench/Safety": 0.6111, + "reward-bench/Reasoning": 0.9576, + "reward-bench/Factuality": 0.5558, + "reward-bench/Precise IF": 0.3625, + "reward-bench/Math": 0.5738, + "reward-bench/Focus": 0.7253, + "reward-bench/Ties": 0.5483 + } + }, + { + "id": "internlm/internlm2-7b", + "name": "internlm2-7b", + "developer": "internlm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.228, + "hfopenllm_v2/BBH": 0.5825, + "hfopenllm_v2/MATH Level 5": 0.0857, + "hfopenllm_v2/GPQA": 0.3367, + "hfopenllm_v2/MUSR": 0.44, + "hfopenllm_v2/MMLU-PRO": 0.19 + } + }, + { + "id": "internlm/internlm2-7b-reward", + "name": "internlm/internlm2-7b-reward", + "developer": "internlm", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8759, + "reward-bench/Factuality": 0.4211, + "reward-bench/Precise IF": 0.4, + "reward-bench/Math": 0.5628, + "reward-bench/Safety": 0.8716, + "reward-bench/Focus": 0.7051, + "reward-bench/Ties": 0.5164, + "reward-bench/Chat": 0.9916, + "reward-bench/Chat Hard": 0.6952, + "reward-bench/Reasoning": 0.9453 + } + }, + { + "id": "internlm/internlm2-chat-1_8b", + "name": "internlm2-chat-1_8b", + "developer": "internlm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2387, + "hfopenllm_v2/BBH": 0.4452, + "hfopenllm_v2/MATH Level 5": 0.0325, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3631, + "hfopenllm_v2/MMLU-PRO": 0.1839 + } + }, + { + "id": "internlm/internlm2_5-1_8b-chat", + "name": "internlm2_5-1_8b-chat", + "developer": "internlm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3849, + "hfopenllm_v2/BBH": 0.4489, + "hfopenllm_v2/MATH Level 5": 0.1586, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3594, + "hfopenllm_v2/MMLU-PRO": 0.1299 + } + }, + { + "id": "internlm/internlm2_5-20b-chat", + "name": "internlm2_5-20b-chat", + "developer": "internlm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.701, + "hfopenllm_v2/BBH": 0.7474, + "hfopenllm_v2/MATH Level 5": 0.4079, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4558, + "hfopenllm_v2/MMLU-PRO": 0.3998 + } + }, + { + "id": "internlm/internlm2_5-7b-chat", + "name": "internlm2_5-7b-chat", + "developer": "internlm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5539, + "hfopenllm_v2/BBH": 0.7073, + "hfopenllm_v2/MATH Level 5": 0.253, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4594, + "hfopenllm_v2/MMLU-PRO": 0.3777 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/intervitens.json b/data/developers/intervitens.json new file mode 100644 index 0000000000000000000000000000000000000000..293151195dddaf964129ce365ecdb39e498abcb7 --- /dev/null +++ b/data/developers/intervitens.json @@ -0,0 +1,19 @@ +{ + "developer": "intervitens", + "models": [ + { + "id": "intervitens/mini-magnum-12b-v1.1", + "name": "mini-magnum-12b-v1.1", + "developer": "intervitens", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5156, + "hfopenllm_v2/BBH": 0.5062, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4004, + "hfopenllm_v2/MMLU-PRO": 0.3291 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/inumulaisk.json b/data/developers/inumulaisk.json new file mode 100644 index 0000000000000000000000000000000000000000..fe3306870802d32ab7048171bf1b41497caa7318 --- /dev/null +++ b/data/developers/inumulaisk.json @@ -0,0 +1,19 @@ +{ + "developer": "inumulaisk", + "models": [ + { + "id": "inumulaisk/eval_model", + "name": "eval_model", + "developer": "inumulaisk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1931, + "hfopenllm_v2/BBH": 0.3512, + "hfopenllm_v2/MATH Level 5": 0.2976, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.358, + "hfopenllm_v2/MMLU-PRO": 0.1664 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/invalid-coder.json b/data/developers/invalid-coder.json new file mode 100644 index 0000000000000000000000000000000000000000..1e4e77376b570ecc50521a6e198803bfb1f87e75 --- /dev/null +++ b/data/developers/invalid-coder.json @@ -0,0 +1,19 @@ +{ + "developer": "invalid-coder", + "models": [ + { + "id": "invalid-coder/Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp", + "name": "Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp", + "developer": "invalid-coder", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4555, + "hfopenllm_v2/BBH": 0.5158, + "hfopenllm_v2/MATH Level 5": 0.0491, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.3992, + "hfopenllm_v2/MMLU-PRO": 0.3146 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/invisietch.json b/data/developers/invisietch.json new file mode 100644 index 0000000000000000000000000000000000000000..8583bb65940ddb12664addbe021e70440d09afd0 --- /dev/null +++ b/data/developers/invisietch.json @@ -0,0 +1,61 @@ +{ + "developer": "invisietch", + "models": [ + { + "id": "invisietch/EtherealRainbow-v0.2-8B", + "name": "EtherealRainbow-v0.2-8B", + "developer": "invisietch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3903, + "hfopenllm_v2/BBH": 0.5102, + "hfopenllm_v2/MATH Level 5": 0.0823, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.3827, + "hfopenllm_v2/MMLU-PRO": 0.3653 + } + }, + { + "id": "invisietch/EtherealRainbow-v0.3-8B", + "name": "EtherealRainbow-v0.3-8B", + "developer": "invisietch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3682, + "hfopenllm_v2/BBH": 0.5097, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.3904, + "hfopenllm_v2/MMLU-PRO": 0.3626 + } + }, + { + "id": "invisietch/MiS-Firefly-v0.2-22B", + "name": "MiS-Firefly-v0.2-22B", + "developer": "invisietch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5371, + "hfopenllm_v2/BBH": 0.5514, + "hfopenllm_v2/MATH Level 5": 0.1654, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4694, + "hfopenllm_v2/MMLU-PRO": 0.362 + } + }, + { + "id": "invisietch/Nimbus-Miqu-v0.1-70B", + "name": "Nimbus-Miqu-v0.1-70B", + "developer": "invisietch", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4647, + "hfopenllm_v2/BBH": 0.601, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.3389, + "hfopenllm_v2/MUSR": 0.4133, + "hfopenllm_v2/MMLU-PRO": 0.3853 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/irahulpandey.json b/data/developers/irahulpandey.json new file mode 100644 index 0000000000000000000000000000000000000000..d636eca8af7304b695038d41c56949934cef8424 --- /dev/null +++ b/data/developers/irahulpandey.json @@ -0,0 +1,19 @@ +{ + "developer": "irahulpandey", + "models": [ + { + "id": "irahulpandey/mistralai-7B-slerp-v0.1", + "name": "mistralai-7B-slerp-v0.1", + "developer": "irahulpandey", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4966, + "hfopenllm_v2/BBH": 0.5011, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.455, + "hfopenllm_v2/MMLU-PRO": 0.2951 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jaredjoss.json b/data/developers/jaredjoss.json new file mode 100644 index 0000000000000000000000000000000000000000..26f23fdcced9da0fbd543c310df0ea11996f4638 --- /dev/null +++ b/data/developers/jaredjoss.json @@ -0,0 +1,19 @@ +{ + "developer": "jaredjoss", + "models": [ + { + "id": "jaredjoss/pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model", + "name": "pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model", + "developer": "jaredjoss", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1572, + "hfopenllm_v2/BBH": 0.2863, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3607, + "hfopenllm_v2/MMLU-PRO": 0.1169 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jaspionjader.json b/data/developers/jaspionjader.json new file mode 100644 index 0000000000000000000000000000000000000000..a117f106f2a5d6899d3f145088878651c642288f --- /dev/null +++ b/data/developers/jaspionjader.json @@ -0,0 +1,2749 @@ +{ + "developer": "jaspionjader", + "models": [ + { + "id": "jaspionjader/Auro-Kosmos-EVAA-v2-8B", + "name": "Auro-Kosmos-EVAA-v2-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4778, + "hfopenllm_v2/BBH": 0.5447, + "hfopenllm_v2/MATH Level 5": 0.1412, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.425, + "hfopenllm_v2/MMLU-PRO": 0.3858 + } + }, + { + "id": "jaspionjader/Auro-Kosmos-EVAA-v2.1-8B", + "name": "Auro-Kosmos-EVAA-v2.1-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4666, + "hfopenllm_v2/BBH": 0.5444, + "hfopenllm_v2/MATH Level 5": 0.1458, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4317, + "hfopenllm_v2/MMLU-PRO": 0.3826 + } + }, + { + "id": "jaspionjader/Auro-Kosmos-EVAA-v2.2-8B", + "name": "Auro-Kosmos-EVAA-v2.2-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4268, + "hfopenllm_v2/BBH": 0.5431, + "hfopenllm_v2/MATH Level 5": 0.1412, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4251, + "hfopenllm_v2/MMLU-PRO": 0.3798 + } + }, + { + "id": "jaspionjader/Auro-Kosmos-EVAA-v2.3-8B", + "name": "Auro-Kosmos-EVAA-v2.3-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4271, + "hfopenllm_v2/BBH": 0.5441, + "hfopenllm_v2/MATH Level 5": 0.1344, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4278, + "hfopenllm_v2/MMLU-PRO": 0.3784 + } + }, + { + "id": "jaspionjader/Kosmos-Aurora_faustus-8B", + "name": "Kosmos-Aurora_faustus-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4432, + "hfopenllm_v2/BBH": 0.526, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4117, + "hfopenllm_v2/MMLU-PRO": 0.3813 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-8B", + "name": "Kosmos-EVAA-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4405, + "hfopenllm_v2/BBH": 0.5312, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4237, + "hfopenllm_v2/MMLU-PRO": 0.3818 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-Franken-Immersive-v39-8B", + "name": "Kosmos-EVAA-Franken-Immersive-v39-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4378, + "hfopenllm_v2/BBH": 0.519, + "hfopenllm_v2/MATH Level 5": 0.1292, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.4236, + "hfopenllm_v2/MMLU-PRO": 0.39 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-Franken-v38-8B", + "name": "Kosmos-EVAA-Franken-v38-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4356, + "hfopenllm_v2/BBH": 0.523, + "hfopenllm_v2/MATH Level 5": 0.1292, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4212, + "hfopenllm_v2/MMLU-PRO": 0.389 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-Fusion-8B", + "name": "Kosmos-EVAA-Fusion-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4345, + "hfopenllm_v2/BBH": 0.5419, + "hfopenllm_v2/MATH Level 5": 0.1292, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4277, + "hfopenllm_v2/MMLU-PRO": 0.3854 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-PRP-8B", + "name": "Kosmos-EVAA-PRP-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3405, + "hfopenllm_v2/BBH": 0.5196, + "hfopenllm_v2/MATH Level 5": 0.0884, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4301, + "hfopenllm_v2/MMLU-PRO": 0.3647 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-PRP-light-8B", + "name": "Kosmos-EVAA-PRP-light-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3824, + "hfopenllm_v2/BBH": 0.5271, + "hfopenllm_v2/MATH Level 5": 0.1103, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4249, + "hfopenllm_v2/MMLU-PRO": 0.3782 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-PRP-v23-8B", + "name": "Kosmos-EVAA-PRP-v23-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4041, + "hfopenllm_v2/BBH": 0.529, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4368, + "hfopenllm_v2/MMLU-PRO": 0.3706 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-PRP-v24-8B", + "name": "Kosmos-EVAA-PRP-v24-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4259, + "hfopenllm_v2/BBH": 0.5276, + "hfopenllm_v2/MATH Level 5": 0.1103, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.429, + "hfopenllm_v2/MMLU-PRO": 0.3779 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-PRP-v25-8B", + "name": "Kosmos-EVAA-PRP-v25-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4421, + "hfopenllm_v2/BBH": 0.5291, + "hfopenllm_v2/MATH Level 5": 0.1186, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.4303, + "hfopenllm_v2/MMLU-PRO": 0.3716 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-PRP-v26-8B", + "name": "Kosmos-EVAA-PRP-v26-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4414, + "hfopenllm_v2/BBH": 0.5271, + "hfopenllm_v2/MATH Level 5": 0.1133, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4264, + "hfopenllm_v2/MMLU-PRO": 0.3793 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-PRP-v27-8B", + "name": "Kosmos-EVAA-PRP-v27-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4378, + "hfopenllm_v2/BBH": 0.529, + "hfopenllm_v2/MATH Level 5": 0.1193, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4343, + "hfopenllm_v2/MMLU-PRO": 0.3755 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-PRP-v28-8B", + "name": "Kosmos-EVAA-PRP-v28-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4366, + "hfopenllm_v2/BBH": 0.5295, + "hfopenllm_v2/MATH Level 5": 0.1171, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.433, + "hfopenllm_v2/MMLU-PRO": 0.375 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-PRP-v29-8B", + "name": "Kosmos-EVAA-PRP-v29-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4487, + "hfopenllm_v2/BBH": 0.5275, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4237, + "hfopenllm_v2/MMLU-PRO": 0.3765 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-PRP-v30-8B", + "name": "Kosmos-EVAA-PRP-v30-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4295, + "hfopenllm_v2/BBH": 0.5328, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4263, + "hfopenllm_v2/MMLU-PRO": 0.3938 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-PRP-v31-8B", + "name": "Kosmos-EVAA-PRP-v31-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4399, + "hfopenllm_v2/BBH": 0.5315, + "hfopenllm_v2/MATH Level 5": 0.1133, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4251, + "hfopenllm_v2/MMLU-PRO": 0.3935 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-PRP-v32-8B", + "name": "Kosmos-EVAA-PRP-v32-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4487, + "hfopenllm_v2/BBH": 0.5293, + "hfopenllm_v2/MATH Level 5": 0.1148, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4211, + "hfopenllm_v2/MMLU-PRO": 0.3777 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-PRP-v33-8B", + "name": "Kosmos-EVAA-PRP-v33-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4302, + "hfopenllm_v2/BBH": 0.5321, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4184, + "hfopenllm_v2/MMLU-PRO": 0.3909 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-PRP-v34-8B", + "name": "Kosmos-EVAA-PRP-v34-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4563, + "hfopenllm_v2/BBH": 0.5333, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4237, + "hfopenllm_v2/MMLU-PRO": 0.3927 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-TSN-8B", + "name": "Kosmos-EVAA-TSN-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4721, + "hfopenllm_v2/BBH": 0.5177, + "hfopenllm_v2/MATH Level 5": 0.1344, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4329, + "hfopenllm_v2/MMLU-PRO": 0.3816 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-TSN-light-8B", + "name": "Kosmos-EVAA-TSN-light-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4685, + "hfopenllm_v2/BBH": 0.5235, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4289, + "hfopenllm_v2/MMLU-PRO": 0.3806 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-TSN-v19-8B", + "name": "Kosmos-EVAA-TSN-v19-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4564, + "hfopenllm_v2/BBH": 0.5316, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4277, + "hfopenllm_v2/MMLU-PRO": 0.379 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-TSN-v20-8B", + "name": "Kosmos-EVAA-TSN-v20-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4423, + "hfopenllm_v2/BBH": 0.525, + "hfopenllm_v2/MATH Level 5": 0.1246, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.421, + "hfopenllm_v2/MMLU-PRO": 0.3936 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-TSN-v21-8B", + "name": "Kosmos-EVAA-TSN-v21-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.467, + "hfopenllm_v2/BBH": 0.5248, + "hfopenllm_v2/MATH Level 5": 0.1193, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4343, + "hfopenllm_v2/MMLU-PRO": 0.3816 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-TSN-v22-8B", + "name": "Kosmos-EVAA-TSN-v22-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4673, + "hfopenllm_v2/BBH": 0.5246, + "hfopenllm_v2/MATH Level 5": 0.1133, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4303, + "hfopenllm_v2/MMLU-PRO": 0.3812 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-gamma-8B", + "name": "Kosmos-EVAA-gamma-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4572, + "hfopenllm_v2/BBH": 0.5322, + "hfopenllm_v2/MATH Level 5": 0.105, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4306, + "hfopenllm_v2/MMLU-PRO": 0.3901 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-gamma-alt-8B", + "name": "Kosmos-EVAA-gamma-alt-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4542, + "hfopenllm_v2/BBH": 0.5298, + "hfopenllm_v2/MATH Level 5": 0.1095, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4292, + "hfopenllm_v2/MMLU-PRO": 0.3896 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-gamma-light-8B", + "name": "Kosmos-EVAA-gamma-light-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4581, + "hfopenllm_v2/BBH": 0.5376, + "hfopenllm_v2/MATH Level 5": 0.1103, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4291, + "hfopenllm_v2/MMLU-PRO": 0.3943 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-gamma-light-alt-8B", + "name": "Kosmos-EVAA-gamma-light-alt-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4454, + "hfopenllm_v2/BBH": 0.5327, + "hfopenllm_v2/MATH Level 5": 0.1133, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4305, + "hfopenllm_v2/MMLU-PRO": 0.3923 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-gamma-ultra-light-8B", + "name": "Kosmos-EVAA-gamma-ultra-light-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4563, + "hfopenllm_v2/BBH": 0.5316, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4197, + "hfopenllm_v2/MMLU-PRO": 0.3915 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-gamma-v13-8B", + "name": "Kosmos-EVAA-gamma-v13-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4429, + "hfopenllm_v2/BBH": 0.5359, + "hfopenllm_v2/MATH Level 5": 0.1118, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4278, + "hfopenllm_v2/MMLU-PRO": 0.393 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-gamma-v14-8B", + "name": "Kosmos-EVAA-gamma-v14-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.438, + "hfopenllm_v2/BBH": 0.5363, + "hfopenllm_v2/MATH Level 5": 0.1103, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4277, + "hfopenllm_v2/MMLU-PRO": 0.3931 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-gamma-v15-8B", + "name": "Kosmos-EVAA-gamma-v15-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4654, + "hfopenllm_v2/BBH": 0.5343, + "hfopenllm_v2/MATH Level 5": 0.111, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4277, + "hfopenllm_v2/MMLU-PRO": 0.3941 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-gamma-v16-8B", + "name": "Kosmos-EVAA-gamma-v16-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4557, + "hfopenllm_v2/BBH": 0.5344, + "hfopenllm_v2/MATH Level 5": 0.1171, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.4264, + "hfopenllm_v2/MMLU-PRO": 0.3917 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-gamma-v17-8B", + "name": "Kosmos-EVAA-gamma-v17-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4462, + "hfopenllm_v2/BBH": 0.5347, + "hfopenllm_v2/MATH Level 5": 0.111, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4291, + "hfopenllm_v2/MMLU-PRO": 0.3923 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-gamma-v18-8B", + "name": "Kosmos-EVAA-gamma-v18-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4341, + "hfopenllm_v2/BBH": 0.5339, + "hfopenllm_v2/MATH Level 5": 0.111, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4317, + "hfopenllm_v2/MMLU-PRO": 0.3905 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-immersive-sof-v44-8B", + "name": "Kosmos-EVAA-immersive-sof-v44-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4408, + "hfopenllm_v2/BBH": 0.5215, + "hfopenllm_v2/MATH Level 5": 0.1186, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4144, + "hfopenllm_v2/MMLU-PRO": 0.3888 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-v10-8B", + "name": "Kosmos-EVAA-v10-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4262, + "hfopenllm_v2/BBH": 0.5376, + "hfopenllm_v2/MATH Level 5": 0.1246, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4224, + "hfopenllm_v2/MMLU-PRO": 0.3831 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-v11-8B", + "name": "Kosmos-EVAA-v11-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4426, + "hfopenllm_v2/BBH": 0.5359, + "hfopenllm_v2/MATH Level 5": 0.1322, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.4184, + "hfopenllm_v2/MMLU-PRO": 0.3836 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-v12-8B", + "name": "Kosmos-EVAA-v12-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4378, + "hfopenllm_v2/BBH": 0.5349, + "hfopenllm_v2/MATH Level 5": 0.1367, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4211, + "hfopenllm_v2/MMLU-PRO": 0.3836 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-v2-8B", + "name": "Kosmos-EVAA-v2-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4396, + "hfopenllm_v2/BBH": 0.5341, + "hfopenllm_v2/MATH Level 5": 0.1322, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4211, + "hfopenllm_v2/MMLU-PRO": 0.3826 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-v3-8B", + "name": "Kosmos-EVAA-v3-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4411, + "hfopenllm_v2/BBH": 0.5331, + "hfopenllm_v2/MATH Level 5": 0.1329, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4224, + "hfopenllm_v2/MMLU-PRO": 0.3821 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-v4-8B", + "name": "Kosmos-EVAA-v4-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4289, + "hfopenllm_v2/BBH": 0.5337, + "hfopenllm_v2/MATH Level 5": 0.1254, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4197, + "hfopenllm_v2/MMLU-PRO": 0.3817 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-v5-8B", + "name": "Kosmos-EVAA-v5-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.446, + "hfopenllm_v2/BBH": 0.5345, + "hfopenllm_v2/MATH Level 5": 0.1261, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4224, + "hfopenllm_v2/MMLU-PRO": 0.3821 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-v6-8B", + "name": "Kosmos-EVAA-v6-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4396, + "hfopenllm_v2/BBH": 0.538, + "hfopenllm_v2/MATH Level 5": 0.1292, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4184, + "hfopenllm_v2/MMLU-PRO": 0.3821 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-v7-8B", + "name": "Kosmos-EVAA-v7-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4277, + "hfopenllm_v2/BBH": 0.5335, + "hfopenllm_v2/MATH Level 5": 0.1337, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4171, + "hfopenllm_v2/MMLU-PRO": 0.3836 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-v8-8B", + "name": "Kosmos-EVAA-v8-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4383, + "hfopenllm_v2/BBH": 0.5359, + "hfopenllm_v2/MATH Level 5": 0.1307, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.421, + "hfopenllm_v2/MMLU-PRO": 0.3827 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-v9-8B", + "name": "Kosmos-EVAA-v9-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4369, + "hfopenllm_v2/BBH": 0.5361, + "hfopenllm_v2/MATH Level 5": 0.1276, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4184, + "hfopenllm_v2/MMLU-PRO": 0.382 + } + }, + { + "id": "jaspionjader/Kosmos-EVAA-v9-TitanFusion-Mix-8B", + "name": "Kosmos-EVAA-v9-TitanFusion-Mix-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4284, + "hfopenllm_v2/BBH": 0.554, + "hfopenllm_v2/MATH Level 5": 0.1148, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.4354, + "hfopenllm_v2/MMLU-PRO": 0.3836 + } + }, + { + "id": "jaspionjader/Kosmos-Elusive-8b", + "name": "Kosmos-Elusive-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4169, + "hfopenllm_v2/BBH": 0.5339, + "hfopenllm_v2/MATH Level 5": 0.1261, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4078, + "hfopenllm_v2/MMLU-PRO": 0.376 + } + }, + { + "id": "jaspionjader/Kosmos-Elusive-VENN-8B", + "name": "Kosmos-Elusive-VENN-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4233, + "hfopenllm_v2/BBH": 0.5356, + "hfopenllm_v2/MATH Level 5": 0.1246, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4157, + "hfopenllm_v2/MMLU-PRO": 0.3797 + } + }, + { + "id": "jaspionjader/Kosmos-Elusive-VENN-Asymmetric-8B", + "name": "Kosmos-Elusive-VENN-Asymmetric-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4542, + "hfopenllm_v2/BBH": 0.5313, + "hfopenllm_v2/MATH Level 5": 0.1344, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4251, + "hfopenllm_v2/MMLU-PRO": 0.3842 + } + }, + { + "id": "jaspionjader/Kosmos-Elusive-VENN-Aurora_faustus-8B", + "name": "Kosmos-Elusive-VENN-Aurora_faustus-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4335, + "hfopenllm_v2/BBH": 0.5304, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.417, + "hfopenllm_v2/MMLU-PRO": 0.3795 + } + }, + { + "id": "jaspionjader/Kosmos-VENN-8B", + "name": "Kosmos-VENN-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4332, + "hfopenllm_v2/BBH": 0.5318, + "hfopenllm_v2/MATH Level 5": 0.1412, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4211, + "hfopenllm_v2/MMLU-PRO": 0.3801 + } + }, + { + "id": "jaspionjader/PRP-Kosmos-EVAA-8B", + "name": "PRP-Kosmos-EVAA-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3633, + "hfopenllm_v2/BBH": 0.5237, + "hfopenllm_v2/MATH Level 5": 0.0959, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.425, + "hfopenllm_v2/MMLU-PRO": 0.3766 + } + }, + { + "id": "jaspionjader/PRP-Kosmos-EVAA-light-8B", + "name": "PRP-Kosmos-EVAA-light-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4321, + "hfopenllm_v2/BBH": 0.5275, + "hfopenllm_v2/MATH Level 5": 0.1103, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4235, + "hfopenllm_v2/MMLU-PRO": 0.3631 + } + }, + { + "id": "jaspionjader/TSN-Kosmos-EVAA-8B", + "name": "TSN-Kosmos-EVAA-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4903, + "hfopenllm_v2/BBH": 0.5347, + "hfopenllm_v2/MATH Level 5": 0.145, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4173, + "hfopenllm_v2/MMLU-PRO": 0.3831 + } + }, + { + "id": "jaspionjader/TSN-Kosmos-EVAA-v2-8B", + "name": "TSN-Kosmos-EVAA-v2-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4667, + "hfopenllm_v2/BBH": 0.5343, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.3762 + } + }, + { + "id": "jaspionjader/bbb-1", + "name": "bbb-1", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4864, + "hfopenllm_v2/BBH": 0.5376, + "hfopenllm_v2/MATH Level 5": 0.1367, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4171, + "hfopenllm_v2/MMLU-PRO": 0.3897 + } + }, + { + "id": "jaspionjader/bbb-2", + "name": "bbb-2", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4077, + "hfopenllm_v2/BBH": 0.5067, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4145, + "hfopenllm_v2/MMLU-PRO": 0.3635 + } + }, + { + "id": "jaspionjader/bbb-3", + "name": "bbb-3", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4168, + "hfopenllm_v2/BBH": 0.5158, + "hfopenllm_v2/MATH Level 5": 0.1405, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4265, + "hfopenllm_v2/MMLU-PRO": 0.3856 + } + }, + { + "id": "jaspionjader/bbb-4", + "name": "bbb-4", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4768, + "hfopenllm_v2/BBH": 0.5212, + "hfopenllm_v2/MATH Level 5": 0.1276, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4092, + "hfopenllm_v2/MMLU-PRO": 0.3773 + } + }, + { + "id": "jaspionjader/bbb-5", + "name": "bbb-5", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4703, + "hfopenllm_v2/BBH": 0.5207, + "hfopenllm_v2/MATH Level 5": 0.1397, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.3998, + "hfopenllm_v2/MMLU-PRO": 0.3834 + } + }, + { + "id": "jaspionjader/bbb-6", + "name": "bbb-6", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.488, + "hfopenllm_v2/BBH": 0.5211, + "hfopenllm_v2/MATH Level 5": 0.139, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4052, + "hfopenllm_v2/MMLU-PRO": 0.3871 + } + }, + { + "id": "jaspionjader/bbb-7", + "name": "bbb-7", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4828, + "hfopenllm_v2/BBH": 0.5211, + "hfopenllm_v2/MATH Level 5": 0.1367, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4038, + "hfopenllm_v2/MMLU-PRO": 0.386 + } + }, + { + "id": "jaspionjader/bh-1", + "name": "bh-1", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4284, + "hfopenllm_v2/BBH": 0.589, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4441, + "hfopenllm_v2/MMLU-PRO": 0.3449 + } + }, + { + "id": "jaspionjader/bh-10", + "name": "bh-10", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4618, + "hfopenllm_v2/BBH": 0.5856, + "hfopenllm_v2/MATH Level 5": 0.1103, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4199, + "hfopenllm_v2/MMLU-PRO": 0.3708 + } + }, + { + "id": "jaspionjader/bh-11", + "name": "bh-11", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4575, + "hfopenllm_v2/BBH": 0.5851, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4146, + "hfopenllm_v2/MMLU-PRO": 0.3738 + } + }, + { + "id": "jaspionjader/bh-12", + "name": "bh-12", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4734, + "hfopenllm_v2/BBH": 0.5802, + "hfopenllm_v2/MATH Level 5": 0.1186, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4145, + "hfopenllm_v2/MMLU-PRO": 0.3737 + } + }, + { + "id": "jaspionjader/bh-13", + "name": "bh-13", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4698, + "hfopenllm_v2/BBH": 0.5778, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4159, + "hfopenllm_v2/MMLU-PRO": 0.373 + } + }, + { + "id": "jaspionjader/bh-15", + "name": "bh-15", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4745, + "hfopenllm_v2/BBH": 0.5819, + "hfopenllm_v2/MATH Level 5": 0.1246, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4105, + "hfopenllm_v2/MMLU-PRO": 0.3767 + } + }, + { + "id": "jaspionjader/bh-16", + "name": "bh-16", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4731, + "hfopenllm_v2/BBH": 0.5783, + "hfopenllm_v2/MATH Level 5": 0.1193, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4159, + "hfopenllm_v2/MMLU-PRO": 0.3776 + } + }, + { + "id": "jaspionjader/bh-17", + "name": "bh-17", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4722, + "hfopenllm_v2/BBH": 0.5776, + "hfopenllm_v2/MATH Level 5": 0.1133, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4158, + "hfopenllm_v2/MMLU-PRO": 0.3757 + } + }, + { + "id": "jaspionjader/bh-18", + "name": "bh-18", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4725, + "hfopenllm_v2/BBH": 0.5824, + "hfopenllm_v2/MATH Level 5": 0.1186, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4185, + "hfopenllm_v2/MMLU-PRO": 0.3757 + } + }, + { + "id": "jaspionjader/bh-19", + "name": "bh-19", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4584, + "hfopenllm_v2/BBH": 0.5766, + "hfopenllm_v2/MATH Level 5": 0.1193, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4171, + "hfopenllm_v2/MMLU-PRO": 0.3775 + } + }, + { + "id": "jaspionjader/bh-2", + "name": "bh-2", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4579, + "hfopenllm_v2/BBH": 0.5937, + "hfopenllm_v2/MATH Level 5": 0.1027, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.3695 + } + }, + { + "id": "jaspionjader/bh-20", + "name": "bh-20", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4727, + "hfopenllm_v2/BBH": 0.575, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.4105, + "hfopenllm_v2/MMLU-PRO": 0.3768 + } + }, + { + "id": "jaspionjader/bh-21", + "name": "bh-21", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.47, + "hfopenllm_v2/BBH": 0.5738, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4158, + "hfopenllm_v2/MMLU-PRO": 0.3776 + } + }, + { + "id": "jaspionjader/bh-22", + "name": "bh-22", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.46, + "hfopenllm_v2/BBH": 0.5793, + "hfopenllm_v2/MATH Level 5": 0.1186, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4172, + "hfopenllm_v2/MMLU-PRO": 0.3764 + } + }, + { + "id": "jaspionjader/bh-23", + "name": "bh-23", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4658, + "hfopenllm_v2/BBH": 0.57, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4197, + "hfopenllm_v2/MMLU-PRO": 0.3796 + } + }, + { + "id": "jaspionjader/bh-24", + "name": "bh-24", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4715, + "hfopenllm_v2/BBH": 0.5717, + "hfopenllm_v2/MATH Level 5": 0.1269, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4158, + "hfopenllm_v2/MMLU-PRO": 0.3809 + } + }, + { + "id": "jaspionjader/bh-25", + "name": "bh-25", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4752, + "hfopenllm_v2/BBH": 0.5706, + "hfopenllm_v2/MATH Level 5": 0.1133, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4118, + "hfopenllm_v2/MMLU-PRO": 0.3782 + } + }, + { + "id": "jaspionjader/bh-26", + "name": "bh-26", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4691, + "hfopenllm_v2/BBH": 0.5735, + "hfopenllm_v2/MATH Level 5": 0.1163, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4277, + "hfopenllm_v2/MMLU-PRO": 0.3772 + } + }, + { + "id": "jaspionjader/bh-27", + "name": "bh-27", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4819, + "hfopenllm_v2/BBH": 0.5714, + "hfopenllm_v2/MATH Level 5": 0.1276, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4091, + "hfopenllm_v2/MMLU-PRO": 0.3799 + } + }, + { + "id": "jaspionjader/bh-28", + "name": "bh-28", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4785, + "hfopenllm_v2/BBH": 0.5703, + "hfopenllm_v2/MATH Level 5": 0.1231, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4131, + "hfopenllm_v2/MMLU-PRO": 0.3812 + } + }, + { + "id": "jaspionjader/bh-29", + "name": "bh-29", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4688, + "hfopenllm_v2/BBH": 0.567, + "hfopenllm_v2/MATH Level 5": 0.1208, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4237, + "hfopenllm_v2/MMLU-PRO": 0.3819 + } + }, + { + "id": "jaspionjader/bh-3", + "name": "bh-3", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4664, + "hfopenllm_v2/BBH": 0.5891, + "hfopenllm_v2/MATH Level 5": 0.1148, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4173, + "hfopenllm_v2/MMLU-PRO": 0.3702 + } + }, + { + "id": "jaspionjader/bh-30", + "name": "bh-30", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4666, + "hfopenllm_v2/BBH": 0.5706, + "hfopenllm_v2/MATH Level 5": 0.1231, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4144, + "hfopenllm_v2/MMLU-PRO": 0.3782 + } + }, + { + "id": "jaspionjader/bh-31", + "name": "bh-31", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4727, + "hfopenllm_v2/BBH": 0.5665, + "hfopenllm_v2/MATH Level 5": 0.1284, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4104, + "hfopenllm_v2/MMLU-PRO": 0.382 + } + }, + { + "id": "jaspionjader/bh-32", + "name": "bh-32", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4636, + "hfopenllm_v2/BBH": 0.5662, + "hfopenllm_v2/MATH Level 5": 0.1246, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4157, + "hfopenllm_v2/MMLU-PRO": 0.3812 + } + }, + { + "id": "jaspionjader/bh-33", + "name": "bh-33", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4685, + "hfopenllm_v2/BBH": 0.5653, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4157, + "hfopenllm_v2/MMLU-PRO": 0.3808 + } + }, + { + "id": "jaspionjader/bh-34", + "name": "bh-34", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4624, + "hfopenllm_v2/BBH": 0.5681, + "hfopenllm_v2/MATH Level 5": 0.1208, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4185, + "hfopenllm_v2/MMLU-PRO": 0.3804 + } + }, + { + "id": "jaspionjader/bh-35", + "name": "bh-35", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4721, + "hfopenllm_v2/BBH": 0.564, + "hfopenllm_v2/MATH Level 5": 0.1246, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4183, + "hfopenllm_v2/MMLU-PRO": 0.383 + } + }, + { + "id": "jaspionjader/bh-36", + "name": "bh-36", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4666, + "hfopenllm_v2/BBH": 0.5664, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4196, + "hfopenllm_v2/MMLU-PRO": 0.3831 + } + }, + { + "id": "jaspionjader/bh-37", + "name": "bh-37", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.488, + "hfopenllm_v2/BBH": 0.5625, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4156, + "hfopenllm_v2/MMLU-PRO": 0.3828 + } + }, + { + "id": "jaspionjader/bh-38", + "name": "bh-38", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4618, + "hfopenllm_v2/BBH": 0.5658, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4117, + "hfopenllm_v2/MMLU-PRO": 0.3811 + } + }, + { + "id": "jaspionjader/bh-39", + "name": "bh-39", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4576, + "hfopenllm_v2/BBH": 0.5633, + "hfopenllm_v2/MATH Level 5": 0.1254, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4262, + "hfopenllm_v2/MMLU-PRO": 0.3831 + } + }, + { + "id": "jaspionjader/bh-4", + "name": "bh-4", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4673, + "hfopenllm_v2/BBH": 0.5892, + "hfopenllm_v2/MATH Level 5": 0.1095, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4173, + "hfopenllm_v2/MMLU-PRO": 0.3705 + } + }, + { + "id": "jaspionjader/bh-40", + "name": "bh-40", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4536, + "hfopenllm_v2/BBH": 0.5634, + "hfopenllm_v2/MATH Level 5": 0.1246, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4236, + "hfopenllm_v2/MMLU-PRO": 0.3835 + } + }, + { + "id": "jaspionjader/bh-41", + "name": "bh-41", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.474, + "hfopenllm_v2/BBH": 0.5614, + "hfopenllm_v2/MATH Level 5": 0.1254, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4183, + "hfopenllm_v2/MMLU-PRO": 0.3825 + } + }, + { + "id": "jaspionjader/bh-42", + "name": "bh-42", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.466, + "hfopenllm_v2/BBH": 0.5646, + "hfopenllm_v2/MATH Level 5": 0.1269, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.421, + "hfopenllm_v2/MMLU-PRO": 0.3812 + } + }, + { + "id": "jaspionjader/bh-43", + "name": "bh-43", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.46, + "hfopenllm_v2/BBH": 0.5635, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4156, + "hfopenllm_v2/MMLU-PRO": 0.382 + } + }, + { + "id": "jaspionjader/bh-44", + "name": "bh-44", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4706, + "hfopenllm_v2/BBH": 0.5643, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4249, + "hfopenllm_v2/MMLU-PRO": 0.3834 + } + }, + { + "id": "jaspionjader/bh-46", + "name": "bh-46", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4727, + "hfopenllm_v2/BBH": 0.5632, + "hfopenllm_v2/MATH Level 5": 0.1276, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4262, + "hfopenllm_v2/MMLU-PRO": 0.3822 + } + }, + { + "id": "jaspionjader/bh-47", + "name": "bh-47", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4652, + "hfopenllm_v2/BBH": 0.5546, + "hfopenllm_v2/MATH Level 5": 0.1276, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4156, + "hfopenllm_v2/MMLU-PRO": 0.3855 + } + }, + { + "id": "jaspionjader/bh-48", + "name": "bh-48", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4688, + "hfopenllm_v2/BBH": 0.5541, + "hfopenllm_v2/MATH Level 5": 0.1254, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4209, + "hfopenllm_v2/MMLU-PRO": 0.386 + } + }, + { + "id": "jaspionjader/bh-49", + "name": "bh-49", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4725, + "hfopenllm_v2/BBH": 0.554, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4129, + "hfopenllm_v2/MMLU-PRO": 0.3808 + } + }, + { + "id": "jaspionjader/bh-5", + "name": "bh-5", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4652, + "hfopenllm_v2/BBH": 0.5882, + "hfopenllm_v2/MATH Level 5": 0.1057, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.3702 + } + }, + { + "id": "jaspionjader/bh-50", + "name": "bh-50", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4725, + "hfopenllm_v2/BBH": 0.5553, + "hfopenllm_v2/MATH Level 5": 0.1208, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4169, + "hfopenllm_v2/MMLU-PRO": 0.3842 + } + }, + { + "id": "jaspionjader/bh-51", + "name": "bh-51", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.463, + "hfopenllm_v2/BBH": 0.5557, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4168, + "hfopenllm_v2/MMLU-PRO": 0.3831 + } + }, + { + "id": "jaspionjader/bh-52", + "name": "bh-52", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4536, + "hfopenllm_v2/BBH": 0.5444, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4169, + "hfopenllm_v2/MMLU-PRO": 0.3843 + } + }, + { + "id": "jaspionjader/bh-53", + "name": "bh-53", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.478, + "hfopenllm_v2/BBH": 0.5494, + "hfopenllm_v2/MATH Level 5": 0.1269, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4196, + "hfopenllm_v2/MMLU-PRO": 0.3858 + } + }, + { + "id": "jaspionjader/bh-54", + "name": "bh-54", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4841, + "hfopenllm_v2/BBH": 0.5548, + "hfopenllm_v2/MATH Level 5": 0.1292, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4155, + "hfopenllm_v2/MMLU-PRO": 0.3825 + } + }, + { + "id": "jaspionjader/bh-55", + "name": "bh-55", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4709, + "hfopenllm_v2/BBH": 0.555, + "hfopenllm_v2/MATH Level 5": 0.1284, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4222, + "hfopenllm_v2/MMLU-PRO": 0.3846 + } + }, + { + "id": "jaspionjader/bh-56", + "name": "bh-56", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.46, + "hfopenllm_v2/BBH": 0.5447, + "hfopenllm_v2/MATH Level 5": 0.1231, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4116, + "hfopenllm_v2/MMLU-PRO": 0.3844 + } + }, + { + "id": "jaspionjader/bh-57", + "name": "bh-57", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4405, + "hfopenllm_v2/BBH": 0.5425, + "hfopenllm_v2/MATH Level 5": 0.1261, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.421, + "hfopenllm_v2/MMLU-PRO": 0.3896 + } + }, + { + "id": "jaspionjader/bh-58", + "name": "bh-58", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.463, + "hfopenllm_v2/BBH": 0.5446, + "hfopenllm_v2/MATH Level 5": 0.1322, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4183, + "hfopenllm_v2/MMLU-PRO": 0.3896 + } + }, + { + "id": "jaspionjader/bh-59", + "name": "bh-59", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4341, + "hfopenllm_v2/BBH": 0.5512, + "hfopenllm_v2/MATH Level 5": 0.1541, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.417, + "hfopenllm_v2/MMLU-PRO": 0.3838 + } + }, + { + "id": "jaspionjader/bh-6", + "name": "bh-6", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4621, + "hfopenllm_v2/BBH": 0.5891, + "hfopenllm_v2/MATH Level 5": 0.1088, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4199, + "hfopenllm_v2/MMLU-PRO": 0.3698 + } + }, + { + "id": "jaspionjader/bh-60", + "name": "bh-60", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4207, + "hfopenllm_v2/BBH": 0.5369, + "hfopenllm_v2/MATH Level 5": 0.1579, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.4289, + "hfopenllm_v2/MMLU-PRO": 0.3689 + } + }, + { + "id": "jaspionjader/bh-61", + "name": "bh-61", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4247, + "hfopenllm_v2/BBH": 0.5271, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4356, + "hfopenllm_v2/MMLU-PRO": 0.3679 + } + }, + { + "id": "jaspionjader/bh-62", + "name": "bh-62", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.415, + "hfopenllm_v2/BBH": 0.5379, + "hfopenllm_v2/MATH Level 5": 0.1624, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4289, + "hfopenllm_v2/MMLU-PRO": 0.3719 + } + }, + { + "id": "jaspionjader/bh-63", + "name": "bh-63", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4308, + "hfopenllm_v2/BBH": 0.4917, + "hfopenllm_v2/MATH Level 5": 0.111, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4313, + "hfopenllm_v2/MMLU-PRO": 0.3248 + } + }, + { + "id": "jaspionjader/bh-64", + "name": "bh-64", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.414, + "hfopenllm_v2/BBH": 0.536, + "hfopenllm_v2/MATH Level 5": 0.1548, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4355, + "hfopenllm_v2/MMLU-PRO": 0.3693 + } + }, + { + "id": "jaspionjader/bh-7", + "name": "bh-7", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4624, + "hfopenllm_v2/BBH": 0.5861, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4119, + "hfopenllm_v2/MMLU-PRO": 0.3715 + } + }, + { + "id": "jaspionjader/bh-8", + "name": "bh-8", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4597, + "hfopenllm_v2/BBH": 0.59, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4265, + "hfopenllm_v2/MMLU-PRO": 0.372 + } + }, + { + "id": "jaspionjader/bh-9", + "name": "bh-9", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4509, + "hfopenllm_v2/BBH": 0.585, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4146, + "hfopenllm_v2/MMLU-PRO": 0.3703 + } + }, + { + "id": "jaspionjader/dp-6-8b", + "name": "dp-6-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4806, + "hfopenllm_v2/BBH": 0.53, + "hfopenllm_v2/MATH Level 5": 0.1329, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4434, + "hfopenllm_v2/MMLU-PRO": 0.3897 + } + }, + { + "id": "jaspionjader/dp-7-8b", + "name": "dp-7-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4498, + "hfopenllm_v2/BBH": 0.5291, + "hfopenllm_v2/MATH Level 5": 0.1261, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4407, + "hfopenllm_v2/MMLU-PRO": 0.3934 + } + }, + { + "id": "jaspionjader/ek-6", + "name": "ek-6", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4642, + "hfopenllm_v2/BBH": 0.5219, + "hfopenllm_v2/MATH Level 5": 0.1322, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4144, + "hfopenllm_v2/MMLU-PRO": 0.3861 + } + }, + { + "id": "jaspionjader/ek-7", + "name": "ek-7", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4767, + "hfopenllm_v2/BBH": 0.5194, + "hfopenllm_v2/MATH Level 5": 0.1329, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4171, + "hfopenllm_v2/MMLU-PRO": 0.3887 + } + }, + { + "id": "jaspionjader/f-1-8b", + "name": "f-1-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4983, + "hfopenllm_v2/BBH": 0.5141, + "hfopenllm_v2/MATH Level 5": 0.1284, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4527, + "hfopenllm_v2/MMLU-PRO": 0.3907 + } + }, + { + "id": "jaspionjader/f-2-8b", + "name": "f-2-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4824, + "hfopenllm_v2/BBH": 0.5294, + "hfopenllm_v2/MATH Level 5": 0.1171, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4501, + "hfopenllm_v2/MMLU-PRO": 0.3962 + } + }, + { + "id": "jaspionjader/f-3-8b", + "name": "f-3-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4803, + "hfopenllm_v2/BBH": 0.5275, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4421, + "hfopenllm_v2/MMLU-PRO": 0.3954 + } + }, + { + "id": "jaspionjader/f-4-8b", + "name": "f-4-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4797, + "hfopenllm_v2/BBH": 0.5289, + "hfopenllm_v2/MATH Level 5": 0.1148, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4514, + "hfopenllm_v2/MMLU-PRO": 0.3956 + } + }, + { + "id": "jaspionjader/f-5-8b", + "name": "f-5-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5044, + "hfopenllm_v2/BBH": 0.5313, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4461, + "hfopenllm_v2/MMLU-PRO": 0.3949 + } + }, + { + "id": "jaspionjader/f-6-8b", + "name": "f-6-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4846, + "hfopenllm_v2/BBH": 0.5241, + "hfopenllm_v2/MATH Level 5": 0.1193, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4474, + "hfopenllm_v2/MMLU-PRO": 0.3939 + } + }, + { + "id": "jaspionjader/f-7-8b", + "name": "f-7-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4462, + "hfopenllm_v2/BBH": 0.5277, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4315, + "hfopenllm_v2/MMLU-PRO": 0.3936 + } + }, + { + "id": "jaspionjader/f-8-8b", + "name": "f-8-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4739, + "hfopenllm_v2/BBH": 0.5259, + "hfopenllm_v2/MATH Level 5": 0.1224, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4354, + "hfopenllm_v2/MMLU-PRO": 0.394 + } + }, + { + "id": "jaspionjader/f-9-8b", + "name": "f-9-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4602, + "hfopenllm_v2/BBH": 0.5292, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4461, + "hfopenllm_v2/MMLU-PRO": 0.3944 + } + }, + { + "id": "jaspionjader/fct-14-8b", + "name": "fct-14-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4129, + "hfopenllm_v2/BBH": 0.5206, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.3875 + } + }, + { + "id": "jaspionjader/fct-9-8b", + "name": "fct-9-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4354, + "hfopenllm_v2/BBH": 0.5205, + "hfopenllm_v2/MATH Level 5": 0.1193, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4291, + "hfopenllm_v2/MMLU-PRO": 0.3932 + } + }, + { + "id": "jaspionjader/fr-1-8b", + "name": "fr-1-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4211, + "hfopenllm_v2/BBH": 0.5142, + "hfopenllm_v2/MATH Level 5": 0.1118, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4277, + "hfopenllm_v2/MMLU-PRO": 0.361 + } + }, + { + "id": "jaspionjader/fr-10-8b", + "name": "fr-10-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4402, + "hfopenllm_v2/BBH": 0.5207, + "hfopenllm_v2/MATH Level 5": 0.1224, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4119, + "hfopenllm_v2/MMLU-PRO": 0.3863 + } + }, + { + "id": "jaspionjader/fr-3-8b", + "name": "fr-3-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4326, + "hfopenllm_v2/BBH": 0.5255, + "hfopenllm_v2/MATH Level 5": 0.1133, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4198, + "hfopenllm_v2/MMLU-PRO": 0.3863 + } + }, + { + "id": "jaspionjader/gamma-Kosmos-EVAA-8B", + "name": "gamma-Kosmos-EVAA-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.425, + "hfopenllm_v2/BBH": 0.5253, + "hfopenllm_v2/MATH Level 5": 0.0899, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4412, + "hfopenllm_v2/MMLU-PRO": 0.3776 + } + }, + { + "id": "jaspionjader/gamma-Kosmos-EVAA-v2-8B", + "name": "gamma-Kosmos-EVAA-v2-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4233, + "hfopenllm_v2/BBH": 0.5262, + "hfopenllm_v2/MATH Level 5": 0.1057, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4344, + "hfopenllm_v2/MMLU-PRO": 0.3756 + } + }, + { + "id": "jaspionjader/gamma-Kosmos-EVAA-v3-8B", + "name": "gamma-Kosmos-EVAA-v3-8B", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4333, + "hfopenllm_v2/BBH": 0.5278, + "hfopenllm_v2/MATH Level 5": 0.111, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4263, + "hfopenllm_v2/MMLU-PRO": 0.3898 + } + }, + { + "id": "jaspionjader/knf-2-8b", + "name": "knf-2-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.425, + "hfopenllm_v2/BBH": 0.5207, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4185, + "hfopenllm_v2/MMLU-PRO": 0.3875 + } + }, + { + "id": "jaspionjader/knfp-2-8b", + "name": "knfp-2-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5327, + "hfopenllm_v2/BBH": 0.5305, + "hfopenllm_v2/MATH Level 5": 0.1427, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4185, + "hfopenllm_v2/MMLU-PRO": 0.3726 + } + }, + { + "id": "jaspionjader/knfp-3-8b", + "name": "knfp-3-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4946, + "hfopenllm_v2/BBH": 0.52, + "hfopenllm_v2/MATH Level 5": 0.1224, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4171, + "hfopenllm_v2/MMLU-PRO": 0.3881 + } + }, + { + "id": "jaspionjader/kstc-1-8b", + "name": "kstc-1-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4643, + "hfopenllm_v2/BBH": 0.5209, + "hfopenllm_v2/MATH Level 5": 0.1171, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4158, + "hfopenllm_v2/MMLU-PRO": 0.3892 + } + }, + { + "id": "jaspionjader/kstc-11-8b", + "name": "kstc-11-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4757, + "hfopenllm_v2/BBH": 0.5189, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4118, + "hfopenllm_v2/MMLU-PRO": 0.3879 + } + }, + { + "id": "jaspionjader/kstc-4-8b", + "name": "kstc-4-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.477, + "hfopenllm_v2/BBH": 0.5216, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4118, + "hfopenllm_v2/MMLU-PRO": 0.3869 + } + }, + { + "id": "jaspionjader/kstc-5-8b", + "name": "kstc-5-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4721, + "hfopenllm_v2/BBH": 0.5211, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.4224, + "hfopenllm_v2/MMLU-PRO": 0.3892 + } + }, + { + "id": "jaspionjader/kstc-6-8b", + "name": "kstc-6-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4944, + "hfopenllm_v2/BBH": 0.5231, + "hfopenllm_v2/MATH Level 5": 0.1246, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4105, + "hfopenllm_v2/MMLU-PRO": 0.3857 + } + }, + { + "id": "jaspionjader/kstc-8-8b", + "name": "kstc-8-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.491, + "hfopenllm_v2/BBH": 0.5239, + "hfopenllm_v2/MATH Level 5": 0.1307, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4211, + "hfopenllm_v2/MMLU-PRO": 0.3889 + } + }, + { + "id": "jaspionjader/kstc-9-8b", + "name": "kstc-9-8b", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4861, + "hfopenllm_v2/BBH": 0.5238, + "hfopenllm_v2/MATH Level 5": 0.136, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4118, + "hfopenllm_v2/MMLU-PRO": 0.3872 + } + }, + { + "id": "jaspionjader/slu-10", + "name": "slu-10", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.436, + "hfopenllm_v2/BBH": 0.5096, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.392, + "hfopenllm_v2/MMLU-PRO": 0.3664 + } + }, + { + "id": "jaspionjader/slu-11", + "name": "slu-11", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3725, + "hfopenllm_v2/BBH": 0.489, + "hfopenllm_v2/MATH Level 5": 0.0559, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.3919, + "hfopenllm_v2/MMLU-PRO": 0.3382 + } + }, + { + "id": "jaspionjader/slu-13", + "name": "slu-13", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4378, + "hfopenllm_v2/BBH": 0.5097, + "hfopenllm_v2/MATH Level 5": 0.0808, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.3814, + "hfopenllm_v2/MMLU-PRO": 0.358 + } + }, + { + "id": "jaspionjader/slu-14", + "name": "slu-14", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4107, + "hfopenllm_v2/BBH": 0.5089, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.396, + "hfopenllm_v2/MMLU-PRO": 0.3627 + } + }, + { + "id": "jaspionjader/slu-17", + "name": "slu-17", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4217, + "hfopenllm_v2/BBH": 0.5071, + "hfopenllm_v2/MATH Level 5": 0.0853, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.3761, + "hfopenllm_v2/MMLU-PRO": 0.3619 + } + }, + { + "id": "jaspionjader/slu-2", + "name": "slu-2", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4016, + "hfopenllm_v2/BBH": 0.5008, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.3959, + "hfopenllm_v2/MMLU-PRO": 0.3506 + } + }, + { + "id": "jaspionjader/slu-20", + "name": "slu-20", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4393, + "hfopenllm_v2/BBH": 0.5061, + "hfopenllm_v2/MATH Level 5": 0.0869, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.3933, + "hfopenllm_v2/MMLU-PRO": 0.3665 + } + }, + { + "id": "jaspionjader/slu-22", + "name": "slu-22", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4321, + "hfopenllm_v2/BBH": 0.5082, + "hfopenllm_v2/MATH Level 5": 0.0793, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.3893, + "hfopenllm_v2/MMLU-PRO": 0.365 + } + }, + { + "id": "jaspionjader/slu-23", + "name": "slu-23", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4478, + "hfopenllm_v2/BBH": 0.5132, + "hfopenllm_v2/MATH Level 5": 0.0944, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4092, + "hfopenllm_v2/MMLU-PRO": 0.3725 + } + }, + { + "id": "jaspionjader/slu-25", + "name": "slu-25", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.45, + "hfopenllm_v2/BBH": 0.5095, + "hfopenllm_v2/MATH Level 5": 0.0838, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.3946, + "hfopenllm_v2/MMLU-PRO": 0.3684 + } + }, + { + "id": "jaspionjader/slu-29", + "name": "slu-29", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4431, + "hfopenllm_v2/BBH": 0.5096, + "hfopenllm_v2/MATH Level 5": 0.0869, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.3933, + "hfopenllm_v2/MMLU-PRO": 0.3669 + } + }, + { + "id": "jaspionjader/slu-32", + "name": "slu-32", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4516, + "hfopenllm_v2/BBH": 0.5167, + "hfopenllm_v2/MATH Level 5": 0.1073, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4039, + "hfopenllm_v2/MMLU-PRO": 0.3766 + } + }, + { + "id": "jaspionjader/slu-33", + "name": "slu-33", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4457, + "hfopenllm_v2/BBH": 0.5081, + "hfopenllm_v2/MATH Level 5": 0.0997, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.3867, + "hfopenllm_v2/MMLU-PRO": 0.3679 + } + }, + { + "id": "jaspionjader/slu-34", + "name": "slu-34", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4351, + "hfopenllm_v2/BBH": 0.5077, + "hfopenllm_v2/MATH Level 5": 0.0997, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.388, + "hfopenllm_v2/MMLU-PRO": 0.372 + } + }, + { + "id": "jaspionjader/slu-35", + "name": "slu-35", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4242, + "hfopenllm_v2/BBH": 0.5103, + "hfopenllm_v2/MATH Level 5": 0.1012, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.3946, + "hfopenllm_v2/MMLU-PRO": 0.3676 + } + }, + { + "id": "jaspionjader/slu-36", + "name": "slu-36", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4518, + "hfopenllm_v2/BBH": 0.5087, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.3933, + "hfopenllm_v2/MMLU-PRO": 0.3711 + } + }, + { + "id": "jaspionjader/slu-37", + "name": "slu-37", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4534, + "hfopenllm_v2/BBH": 0.51, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.3946, + "hfopenllm_v2/MMLU-PRO": 0.3695 + } + }, + { + "id": "jaspionjader/slu-6", + "name": "slu-6", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4117, + "hfopenllm_v2/BBH": 0.5099, + "hfopenllm_v2/MATH Level 5": 0.0944, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4066, + "hfopenllm_v2/MMLU-PRO": 0.3611 + } + }, + { + "id": "jaspionjader/slu-mix-1", + "name": "slu-mix-1", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4569, + "hfopenllm_v2/BBH": 0.524, + "hfopenllm_v2/MATH Level 5": 0.1118, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4277, + "hfopenllm_v2/MMLU-PRO": 0.393 + } + }, + { + "id": "jaspionjader/sof-1", + "name": "sof-1", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4314, + "hfopenllm_v2/BBH": 0.501, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4082, + "hfopenllm_v2/MMLU-PRO": 0.3674 + } + }, + { + "id": "jaspionjader/sof-10", + "name": "sof-10", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4648, + "hfopenllm_v2/BBH": 0.5197, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4091, + "hfopenllm_v2/MMLU-PRO": 0.3874 + } + }, + { + "id": "jaspionjader/sof-3", + "name": "sof-3", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4637, + "hfopenllm_v2/BBH": 0.5206, + "hfopenllm_v2/MATH Level 5": 0.1276, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4131, + "hfopenllm_v2/MMLU-PRO": 0.3812 + } + }, + { + "id": "jaspionjader/sof-6", + "name": "sof-6", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4354, + "hfopenllm_v2/BBH": 0.5209, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4171, + "hfopenllm_v2/MMLU-PRO": 0.3844 + } + }, + { + "id": "jaspionjader/test-10", + "name": "test-10", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4578, + "hfopenllm_v2/BBH": 0.5316, + "hfopenllm_v2/MATH Level 5": 0.114, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4251, + "hfopenllm_v2/MMLU-PRO": 0.3936 + } + }, + { + "id": "jaspionjader/test-11", + "name": "test-11", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4541, + "hfopenllm_v2/BBH": 0.535, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.429, + "hfopenllm_v2/MMLU-PRO": 0.3939 + } + }, + { + "id": "jaspionjader/test-12", + "name": "test-12", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4368, + "hfopenllm_v2/BBH": 0.5347, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.425, + "hfopenllm_v2/MMLU-PRO": 0.3935 + } + }, + { + "id": "jaspionjader/test-13", + "name": "test-13", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4581, + "hfopenllm_v2/BBH": 0.5318, + "hfopenllm_v2/MATH Level 5": 0.1057, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4264, + "hfopenllm_v2/MMLU-PRO": 0.3935 + } + }, + { + "id": "jaspionjader/test-14", + "name": "test-14", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4444, + "hfopenllm_v2/BBH": 0.5323, + "hfopenllm_v2/MATH Level 5": 0.1103, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4317, + "hfopenllm_v2/MMLU-PRO": 0.393 + } + }, + { + "id": "jaspionjader/test-15", + "name": "test-15", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4365, + "hfopenllm_v2/BBH": 0.5328, + "hfopenllm_v2/MATH Level 5": 0.1118, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4264, + "hfopenllm_v2/MMLU-PRO": 0.393 + } + }, + { + "id": "jaspionjader/test-16", + "name": "test-16", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4599, + "hfopenllm_v2/BBH": 0.533, + "hfopenllm_v2/MATH Level 5": 0.1095, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4225, + "hfopenllm_v2/MMLU-PRO": 0.393 + } + }, + { + "id": "jaspionjader/test-17", + "name": "test-17", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4267, + "hfopenllm_v2/BBH": 0.5329, + "hfopenllm_v2/MATH Level 5": 0.1103, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.429, + "hfopenllm_v2/MMLU-PRO": 0.3929 + } + }, + { + "id": "jaspionjader/test-18", + "name": "test-18", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4392, + "hfopenllm_v2/BBH": 0.5317, + "hfopenllm_v2/MATH Level 5": 0.1148, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4251, + "hfopenllm_v2/MMLU-PRO": 0.393 + } + }, + { + "id": "jaspionjader/test-19", + "name": "test-19", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4401, + "hfopenllm_v2/BBH": 0.5319, + "hfopenllm_v2/MATH Level 5": 0.1095, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4264, + "hfopenllm_v2/MMLU-PRO": 0.3929 + } + }, + { + "id": "jaspionjader/test-20", + "name": "test-20", + "developer": "jaspionjader", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4529, + "hfopenllm_v2/BBH": 0.5327, + "hfopenllm_v2/MATH Level 5": 0.1118, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4251, + "hfopenllm_v2/MMLU-PRO": 0.392 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jayasuryajsk.json b/data/developers/jayasuryajsk.json new file mode 100644 index 0000000000000000000000000000000000000000..bdf1e16539bcea7f4ba0e17c17b7cf1671762ab6 --- /dev/null +++ b/data/developers/jayasuryajsk.json @@ -0,0 +1,19 @@ +{ + "developer": "jayasuryajsk", + "models": [ + { + "id": "jayasuryajsk/Qwen2.5-3B-reasoner", + "name": "Qwen2.5-3B-reasoner", + "developer": "jayasuryajsk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.416, + "hfopenllm_v2/BBH": 0.4651, + "hfopenllm_v2/MATH Level 5": 0.2085, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4123, + "hfopenllm_v2/MMLU-PRO": 0.3482 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jeanmichela.json b/data/developers/jeanmichela.json new file mode 100644 index 0000000000000000000000000000000000000000..fadc4dfc28a3579f5dccbf04caf169e5d950a8b7 --- /dev/null +++ b/data/developers/jeanmichela.json @@ -0,0 +1,19 @@ +{ + "developer": "jeanmichela", + "models": [ + { + "id": "jeanmichela/o-distil-qwen", + "name": "o-distil-qwen", + "developer": "jeanmichela", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4482, + "hfopenllm_v2/BBH": 0.59, + "hfopenllm_v2/MATH Level 5": 0.565, + "hfopenllm_v2/GPQA": 0.3935, + "hfopenllm_v2/MUSR": 0.534, + "hfopenllm_v2/MMLU-PRO": 0.4658 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jebcarter.json b/data/developers/jebcarter.json new file mode 100644 index 0000000000000000000000000000000000000000..d53b334a7bf817d5bf537f8b18a0e0ee8092de33 --- /dev/null +++ b/data/developers/jebcarter.json @@ -0,0 +1,19 @@ +{ + "developer": "jebcarter", + "models": [ + { + "id": "jebcarter/psyonic-cetacean-20B", + "name": "psyonic-cetacean-20B", + "developer": "jebcarter", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2544, + "hfopenllm_v2/BBH": 0.4907, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.4661, + "hfopenllm_v2/MMLU-PRO": 0.2886 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jebish7.json b/data/developers/jebish7.json new file mode 100644 index 0000000000000000000000000000000000000000..7be3133dfe92554eec690af796d07cb44563a97c --- /dev/null +++ b/data/developers/jebish7.json @@ -0,0 +1,131 @@ +{ + "developer": "jebish7", + "models": [ + { + "id": "jebish7/Llama-3-Nanda-10B-Chat", + "name": "Llama-3-Nanda-10B-Chat", + "developer": "jebish7", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2953, + "hfopenllm_v2/BBH": 0.4959, + "hfopenllm_v2/MATH Level 5": 0.0559, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4356, + "hfopenllm_v2/MMLU-PRO": 0.3157 + } + }, + { + "id": "jebish7/Llama-3.1-8B-Instruct", + "name": "Llama-3.1-8B-Instruct", + "developer": "jebish7", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5058, + "hfopenllm_v2/BBH": 0.5088, + "hfopenllm_v2/MATH Level 5": 0.1548, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.3998, + "hfopenllm_v2/MMLU-PRO": 0.3777 + } + }, + { + "id": "jebish7/Nemotron-4-Mini-Hindi-4B-Base", + "name": "Nemotron-4-Mini-Hindi-4B-Base", + "developer": "jebish7", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2285, + "hfopenllm_v2/BBH": 0.3924, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4249, + "hfopenllm_v2/MMLU-PRO": 0.2503 + } + }, + { + "id": "jebish7/Nemotron-4-Mini-Hindi-4B-Instruct", + "name": "Nemotron-4-Mini-Hindi-4B-Instruct", + "developer": "jebish7", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3345, + "hfopenllm_v2/BBH": 0.4041, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4153, + "hfopenllm_v2/MMLU-PRO": 0.2595 + } + }, + { + "id": "jebish7/Nemotron-Mini-4B-Instruct", + "name": "Nemotron-Mini-4B-Instruct", + "developer": "jebish7", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3709, + "hfopenllm_v2/BBH": 0.4244, + "hfopenllm_v2/MATH Level 5": 0.0325, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.4727, + "hfopenllm_v2/MMLU-PRO": 0.2783 + } + }, + { + "id": "jebish7/aya-expanse-8b", + "name": "aya-expanse-8b", + "developer": "jebish7", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3791, + "hfopenllm_v2/BBH": 0.4969, + "hfopenllm_v2/MATH Level 5": 0.0816, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3869, + "hfopenllm_v2/MMLU-PRO": 0.3103 + } + }, + { + "id": "jebish7/gemma-2-2b-it", + "name": "gemma-2-2b-it", + "developer": "jebish7", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1272, + "hfopenllm_v2/BBH": 0.4395, + "hfopenllm_v2/MATH Level 5": 0.034, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4244, + "hfopenllm_v2/MMLU-PRO": 0.2715 + } + }, + { + "id": "jebish7/gemma-2-9b-it", + "name": "gemma-2-9b-it", + "developer": "jebish7", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1557, + "hfopenllm_v2/BBH": 0.5949, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4554, + "hfopenllm_v2/MMLU-PRO": 0.4143 + } + }, + { + "id": "jebish7/qwen2.5-0.5B-IHA-Hin", + "name": "qwen2.5-0.5B-IHA-Hin", + "developer": "jebish7", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1416, + "hfopenllm_v2/BBH": 0.2989, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3475, + "hfopenllm_v2/MMLU-PRO": 0.1094 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jeffmeloy.json b/data/developers/jeffmeloy.json new file mode 100644 index 0000000000000000000000000000000000000000..7750c73ca5aba450b20525028dfad86db374ee28 --- /dev/null +++ b/data/developers/jeffmeloy.json @@ -0,0 +1,257 @@ +{ + "developer": "jeffmeloy", + "models": [ + { + "id": "jeffmeloy/Qwen-7B-nerd-uncensored-v1.0", + "name": "Qwen-7B-nerd-uncensored-v1.0", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6136, + "hfopenllm_v2/BBH": 0.5421, + "hfopenllm_v2/MATH Level 5": 0.287, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4793, + "hfopenllm_v2/MMLU-PRO": 0.4363 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-minperplexity-2", + "name": "Qwen2.5-7B-minperplexity-2", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5097, + "hfopenllm_v2/BBH": 0.5524, + "hfopenllm_v2/MATH Level 5": 0.3014, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4625, + "hfopenllm_v2/MMLU-PRO": 0.4346 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v0.9", + "name": "Qwen2.5-7B-nerd-uncensored-v0.9", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6048, + "hfopenllm_v2/BBH": 0.547, + "hfopenllm_v2/MATH Level 5": 0.2946, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.482, + "hfopenllm_v2/MMLU-PRO": 0.4363 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.0", + "name": "Qwen2.5-7B-nerd-uncensored-v1.0", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7695, + "hfopenllm_v2/BBH": 0.5418, + "hfopenllm_v2/MATH Level 5": 0.4713, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4551, + "hfopenllm_v2/MMLU-PRO": 0.4254 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.1", + "name": "Qwen2.5-7B-nerd-uncensored-v1.1", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6626, + "hfopenllm_v2/BBH": 0.4864, + "hfopenllm_v2/MATH Level 5": 0.1329, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.3843, + "hfopenllm_v2/MMLU-PRO": 0.385 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.2", + "name": "Qwen2.5-7B-nerd-uncensored-v1.2", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4965, + "hfopenllm_v2/BBH": 0.4946, + "hfopenllm_v2/MATH Level 5": 0.1208, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4172, + "hfopenllm_v2/MMLU-PRO": 0.3969 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.3", + "name": "Qwen2.5-7B-nerd-uncensored-v1.3", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4995, + "hfopenllm_v2/BBH": 0.5026, + "hfopenllm_v2/MATH Level 5": 0.1231, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4187, + "hfopenllm_v2/MMLU-PRO": 0.4016 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.4", + "name": "Qwen2.5-7B-nerd-uncensored-v1.4", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6079, + "hfopenllm_v2/BBH": 0.5467, + "hfopenllm_v2/MATH Level 5": 0.281, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4714, + "hfopenllm_v2/MMLU-PRO": 0.4419 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.5", + "name": "Qwen2.5-7B-nerd-uncensored-v1.5", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.565, + "hfopenllm_v2/BBH": 0.5523, + "hfopenllm_v2/MATH Level 5": 0.2757, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.4982, + "hfopenllm_v2/MMLU-PRO": 0.4448 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.7", + "name": "Qwen2.5-7B-nerd-uncensored-v1.7", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4202, + "hfopenllm_v2/BBH": 0.5392, + "hfopenllm_v2/MATH Level 5": 0.2915, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4848, + "hfopenllm_v2/MMLU-PRO": 0.428 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.8", + "name": "Qwen2.5-7B-nerd-uncensored-v1.8", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6256, + "hfopenllm_v2/BBH": 0.5447, + "hfopenllm_v2/MATH Level 5": 0.2704, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4767, + "hfopenllm_v2/MMLU-PRO": 0.4343 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-olm-v1.0", + "name": "Qwen2.5-7B-olm-v1.0", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5331, + "hfopenllm_v2/BBH": 0.566, + "hfopenllm_v2/MATH Level 5": 0.2863, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4278, + "hfopenllm_v2/MMLU-PRO": 0.4566 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-olm-v1.1", + "name": "Qwen2.5-7B-olm-v1.1", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4329, + "hfopenllm_v2/BBH": 0.5478, + "hfopenllm_v2/MATH Level 5": 0.3829, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4808, + "hfopenllm_v2/MMLU-PRO": 0.4354 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-olm-v1.2", + "name": "Qwen2.5-7B-olm-v1.2", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4203, + "hfopenllm_v2/BBH": 0.5533, + "hfopenllm_v2/MATH Level 5": 0.2847, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4688, + "hfopenllm_v2/MMLU-PRO": 0.4387 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-olm-v1.3", + "name": "Qwen2.5-7B-olm-v1.3", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4219, + "hfopenllm_v2/BBH": 0.5532, + "hfopenllm_v2/MATH Level 5": 0.3104, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4701, + "hfopenllm_v2/MMLU-PRO": 0.447 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-olm-v1.4", + "name": "Qwen2.5-7B-olm-v1.4", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4545, + "hfopenllm_v2/BBH": 0.5582, + "hfopenllm_v2/MATH Level 5": 0.2923, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4622, + "hfopenllm_v2/MMLU-PRO": 0.4457 + } + }, + { + "id": "jeffmeloy/Qwen2.5-7B-olm-v1.5", + "name": "Qwen2.5-7B-olm-v1.5", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4547, + "hfopenllm_v2/BBH": 0.5544, + "hfopenllm_v2/MATH Level 5": 0.2817, + "hfopenllm_v2/GPQA": 0.3398, + "hfopenllm_v2/MUSR": 0.4539, + "hfopenllm_v2/MMLU-PRO": 0.4399 + } + }, + { + "id": "jeffmeloy/jeffmeloy_Qwen2.5-7B-minperplexity-1", + "name": "jeffmeloy_Qwen2.5-7B-minperplexity-1", + "developer": "jeffmeloy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3757, + "hfopenllm_v2/BBH": 0.5582, + "hfopenllm_v2/MATH Level 5": 0.2915, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.429, + "hfopenllm_v2/MMLU-PRO": 0.4368 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jeonsworld.json b/data/developers/jeonsworld.json new file mode 100644 index 0000000000000000000000000000000000000000..66a7da435776e15ee799e0eb55a917e7b5fc8d81 --- /dev/null +++ b/data/developers/jeonsworld.json @@ -0,0 +1,19 @@ +{ + "developer": "jeonsworld", + "models": [ + { + "id": "jeonsworld/CarbonVillain-en-10.7B-v4", + "name": "CarbonVillain-en-10.7B-v4", + "developer": "jeonsworld", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4579, + "hfopenllm_v2/BBH": 0.5168, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.3965, + "hfopenllm_v2/MMLU-PRO": 0.3142 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jiangxinyang-shanda.json b/data/developers/jiangxinyang-shanda.json new file mode 100644 index 0000000000000000000000000000000000000000..a117dd12e1248c966f9863b47e12b1d37fb191c2 --- /dev/null +++ b/data/developers/jiangxinyang-shanda.json @@ -0,0 +1,19 @@ +{ + "developer": "jiangxinyang-shanda", + "models": [ + { + "id": "jiangxinyang-shanda/Homer-LLama3-8B", + "name": "Homer-LLama3-8B", + "developer": "jiangxinyang-shanda", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3992, + "hfopenllm_v2/BBH": 0.5173, + "hfopenllm_v2/MATH Level 5": 0.0861, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4056, + "hfopenllm_v2/MMLU-PRO": 0.3139 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jieliu.json b/data/developers/jieliu.json new file mode 100644 index 0000000000000000000000000000000000000000..3180e8761112c19266b8020f255f076dedb5ea93 --- /dev/null +++ b/data/developers/jieliu.json @@ -0,0 +1,19 @@ +{ + "developer": "jieliu", + "models": [ + { + "id": "jieliu/Storm-7B", + "name": "Storm-7B", + "developer": "jieliu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3424, + "hfopenllm_v2/BBH": 0.5187, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4429, + "hfopenllm_v2/MMLU-PRO": 0.3119 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jiviai.json b/data/developers/jiviai.json new file mode 100644 index 0000000000000000000000000000000000000000..0d7dce80a40ec9dabe53ceda7d61a4e9c8397c4e --- /dev/null +++ b/data/developers/jiviai.json @@ -0,0 +1,19 @@ +{ + "developer": "jiviai", + "models": [ + { + "id": "jiviai/medX_v2", + "name": "medX_v2", + "developer": "jiviai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3743, + "hfopenllm_v2/BBH": 0.4509, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.3498, + "hfopenllm_v2/MMLU-PRO": 0.3428 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jlzhou.json b/data/developers/jlzhou.json new file mode 100644 index 0000000000000000000000000000000000000000..fbe69b6f4579a76ba77d51ba0528d76c911dd226 --- /dev/null +++ b/data/developers/jlzhou.json @@ -0,0 +1,19 @@ +{ + "developer": "jlzhou", + "models": [ + { + "id": "jlzhou/Qwen2.5-3B-Infinity-Instruct-0625", + "name": "Qwen2.5-3B-Infinity-Instruct-0625", + "developer": "jlzhou", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3558, + "hfopenllm_v2/BBH": 0.4774, + "hfopenllm_v2/MATH Level 5": 0.1367, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3981, + "hfopenllm_v2/MMLU-PRO": 0.3199 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/johnsutor.json b/data/developers/johnsutor.json new file mode 100644 index 0000000000000000000000000000000000000000..f779ab2f470853f18bd8b0de0fa0b59de6f6df4f --- /dev/null +++ b/data/developers/johnsutor.json @@ -0,0 +1,439 @@ +{ + "developer": "johnsutor", + "models": [ + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4271, + "hfopenllm_v2/BBH": 0.5036, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4638, + "hfopenllm_v2/MMLU-PRO": 0.3739 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4253, + "hfopenllm_v2/BBH": 0.5019, + "hfopenllm_v2/MATH Level 5": 0.0967, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.415, + "hfopenllm_v2/MMLU-PRO": 0.3724 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3377, + "hfopenllm_v2/BBH": 0.4917, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.5018, + "hfopenllm_v2/MMLU-PRO": 0.3533 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4274, + "hfopenllm_v2/BBH": 0.5126, + "hfopenllm_v2/MATH Level 5": 0.0808, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4226, + "hfopenllm_v2/MMLU-PRO": 0.3739 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3204, + "hfopenllm_v2/BBH": 0.4884, + "hfopenllm_v2/MATH Level 5": 0.0038, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.5098, + "hfopenllm_v2/MMLU-PRO": 0.3344 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4396, + "hfopenllm_v2/BBH": 0.514, + "hfopenllm_v2/MATH Level 5": 0.0801, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4398, + "hfopenllm_v2/MMLU-PRO": 0.3696 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2814, + "hfopenllm_v2/BBH": 0.4854, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.5163, + "hfopenllm_v2/MMLU-PRO": 0.3295 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4302, + "hfopenllm_v2/BBH": 0.5157, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4332, + "hfopenllm_v2/MMLU-PRO": 0.3663 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.279, + "hfopenllm_v2/BBH": 0.4861, + "hfopenllm_v2/MATH Level 5": 0.0015, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.515, + "hfopenllm_v2/MMLU-PRO": 0.3305 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4223, + "hfopenllm_v2/BBH": 0.5154, + "hfopenllm_v2/MATH Level 5": 0.074, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4384, + "hfopenllm_v2/MMLU-PRO": 0.365 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4359, + "hfopenllm_v2/BBH": 0.5041, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4532, + "hfopenllm_v2/MMLU-PRO": 0.3762 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4202, + "hfopenllm_v2/BBH": 0.5011, + "hfopenllm_v2/MATH Level 5": 0.0982, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.415, + "hfopenllm_v2/MMLU-PRO": 0.3699 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3518, + "hfopenllm_v2/BBH": 0.4999, + "hfopenllm_v2/MATH Level 5": 0.0234, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4871, + "hfopenllm_v2/MMLU-PRO": 0.3611 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4204, + "hfopenllm_v2/BBH": 0.5107, + "hfopenllm_v2/MATH Level 5": 0.0876, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4279, + "hfopenllm_v2/MMLU-PRO": 0.371 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3454, + "hfopenllm_v2/BBH": 0.4984, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4911, + "hfopenllm_v2/MMLU-PRO": 0.3531 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4092, + "hfopenllm_v2/BBH": 0.5137, + "hfopenllm_v2/MATH Level 5": 0.0808, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4357, + "hfopenllm_v2/MMLU-PRO": 0.3669 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2904, + "hfopenllm_v2/BBH": 0.4967, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4991, + "hfopenllm_v2/MMLU-PRO": 0.349 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4199, + "hfopenllm_v2/BBH": 0.5147, + "hfopenllm_v2/MATH Level 5": 0.0808, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4358, + "hfopenllm_v2/MMLU-PRO": 0.3615 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2913, + "hfopenllm_v2/BBH": 0.4918, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4977, + "hfopenllm_v2/MMLU-PRO": 0.3454 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1", + "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4162, + "hfopenllm_v2/BBH": 0.5139, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4317, + "hfopenllm_v2/MMLU-PRO": 0.3625 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_dare_linear", + "name": "Llama-3-8B-Instruct_dare_linear", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2145, + "hfopenllm_v2/BBH": 0.4283, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4979, + "hfopenllm_v2/MMLU-PRO": 0.2414 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.1", + "name": "Llama-3-8B-Instruct_dare_ties-density-0.1", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1891, + "hfopenllm_v2/BBH": 0.4119, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.4658, + "hfopenllm_v2/MMLU-PRO": 0.2265 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.3", + "name": "Llama-3-8B-Instruct_dare_ties-density-0.3", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2113, + "hfopenllm_v2/BBH": 0.4559, + "hfopenllm_v2/MATH Level 5": 0.0015, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.5069, + "hfopenllm_v2/MMLU-PRO": 0.304 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.7", + "name": "Llama-3-8B-Instruct_dare_ties-density-0.7", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2034, + "hfopenllm_v2/BBH": 0.4723, + "hfopenllm_v2/MATH Level 5": 0.003, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.511, + "hfopenllm_v2/MMLU-PRO": 0.3148 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.9", + "name": "Llama-3-8B-Instruct_dare_ties-density-0.9", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2161, + "hfopenllm_v2/BBH": 0.4664, + "hfopenllm_v2/MATH Level 5": 0.0015, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.523, + "hfopenllm_v2/MMLU-PRO": 0.3143 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_linear", + "name": "Llama-3-8B-Instruct_linear", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4308, + "hfopenllm_v2/BBH": 0.5031, + "hfopenllm_v2/MATH Level 5": 0.1005, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4097, + "hfopenllm_v2/MMLU-PRO": 0.3712 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.1", + "name": "Llama-3-8B-Instruct_ties-density-0.1", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4116, + "hfopenllm_v2/BBH": 0.5021, + "hfopenllm_v2/MATH Level 5": 0.0793, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4174, + "hfopenllm_v2/MMLU-PRO": 0.36 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.3", + "name": "Llama-3-8B-Instruct_ties-density-0.3", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3626, + "hfopenllm_v2/BBH": 0.4906, + "hfopenllm_v2/MATH Level 5": 0.0672, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4025, + "hfopenllm_v2/MMLU-PRO": 0.3321 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.5", + "name": "Llama-3-8B-Instruct_ties-density-0.5", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3797, + "hfopenllm_v2/BBH": 0.4793, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.388, + "hfopenllm_v2/MMLU-PRO": 0.3175 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.7", + "name": "Llama-3-8B-Instruct_ties-density-0.7", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3681, + "hfopenllm_v2/BBH": 0.4738, + "hfopenllm_v2/MATH Level 5": 0.0672, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.3881, + "hfopenllm_v2/MMLU-PRO": 0.3152 + } + }, + { + "id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.9", + "name": "Llama-3-8B-Instruct_ties-density-0.9", + "developer": "johnsutor", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3858, + "hfopenllm_v2/BBH": 0.4735, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.388, + "hfopenllm_v2/MMLU-PRO": 0.3182 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jondurbin.json b/data/developers/jondurbin.json new file mode 100644 index 0000000000000000000000000000000000000000..9184eff22aab70e07429839320522a10809d277e --- /dev/null +++ b/data/developers/jondurbin.json @@ -0,0 +1,19 @@ +{ + "developer": "jondurbin", + "models": [ + { + "id": "jondurbin/bagel-dpo-34b-v0.5", + "name": "jondurbin/bagel-dpo-34b-v0.5", + "developer": "jondurbin", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7215, + "reward-bench/Chat": 0.9385, + "reward-bench/Chat Hard": 0.5504, + "reward-bench/Safety": 0.6446, + "reward-bench/Reasoning": 0.8889, + "reward-bench/Prior Sets (0.5 weight)": 0.4487 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jpacifico.json b/data/developers/jpacifico.json new file mode 100644 index 0000000000000000000000000000000000000000..1969c4a11f52a585d9eecffb9498875e3a4668b6 --- /dev/null +++ b/data/developers/jpacifico.json @@ -0,0 +1,257 @@ +{ + "developer": "jpacifico", + "models": [ + { + "id": "jpacifico/Chocolatine-14B-Instruct-4k-DPO", + "name": "Chocolatine-14B-Instruct-4k-DPO", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4689, + "hfopenllm_v2/BBH": 0.63, + "hfopenllm_v2/MATH Level 5": 0.1782, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.4439, + "hfopenllm_v2/MMLU-PRO": 0.4764 + } + }, + { + "id": "jpacifico/Chocolatine-14B-Instruct-DPO-v1.2", + "name": "Chocolatine-14B-Instruct-DPO-v1.2", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6852, + "hfopenllm_v2/BBH": 0.6438, + "hfopenllm_v2/MATH Level 5": 0.2092, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.4268, + "hfopenllm_v2/MMLU-PRO": 0.4697 + } + }, + { + "id": "jpacifico/Chocolatine-14B-Instruct-DPO-v1.3", + "name": "Chocolatine-14B-Instruct-DPO-v1.3", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.704, + "hfopenllm_v2/BBH": 0.6846, + "hfopenllm_v2/MATH Level 5": 0.5619, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.4234, + "hfopenllm_v2/MMLU-PRO": 0.5374 + } + }, + { + "id": "jpacifico/Chocolatine-2-14B-Instruct-DPO-v2.0b1", + "name": "Chocolatine-2-14B-Instruct-DPO-v2.0b1", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1033, + "hfopenllm_v2/BBH": 0.6696, + "hfopenllm_v2/MATH Level 5": 0.2757, + "hfopenllm_v2/GPQA": 0.3758, + "hfopenllm_v2/MUSR": 0.4467, + "hfopenllm_v2/MMLU-PRO": 0.5124 + } + }, + { + "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0", + "name": "Chocolatine-2-14B-Instruct-v2.0", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0885, + "hfopenllm_v2/BBH": 0.677, + "hfopenllm_v2/MATH Level 5": 0.4804, + "hfopenllm_v2/GPQA": 0.3876, + "hfopenllm_v2/MUSR": 0.5021, + "hfopenllm_v2/MMLU-PRO": 0.5302 + } + }, + { + "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0.1", + "name": "Chocolatine-2-14B-Instruct-v2.0.1", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0742, + "hfopenllm_v2/BBH": 0.6736, + "hfopenllm_v2/MATH Level 5": 0.4796, + "hfopenllm_v2/GPQA": 0.3918, + "hfopenllm_v2/MUSR": 0.5008, + "hfopenllm_v2/MMLU-PRO": 0.5299 + } + }, + { + "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0.3", + "name": "Chocolatine-2-14B-Instruct-v2.0.3", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7037, + "hfopenllm_v2/BBH": 0.6548, + "hfopenllm_v2/MATH Level 5": 0.4207, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.4768, + "hfopenllm_v2/MMLU-PRO": 0.5374 + } + }, + { + "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0b2", + "name": "Chocolatine-2-14B-Instruct-v2.0b2", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7241, + "hfopenllm_v2/BBH": 0.6476, + "hfopenllm_v2/MATH Level 5": 0.395, + "hfopenllm_v2/GPQA": 0.3834, + "hfopenllm_v2/MUSR": 0.4808, + "hfopenllm_v2/MMLU-PRO": 0.5369 + } + }, + { + "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0b3", + "name": "Chocolatine-2-14B-Instruct-v2.0b3", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7323, + "hfopenllm_v2/BBH": 0.6469, + "hfopenllm_v2/MATH Level 5": 0.4109, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.4781, + "hfopenllm_v2/MMLU-PRO": 0.5337 + } + }, + { + "id": "jpacifico/Chocolatine-3B-Instruct-DPO-Revised", + "name": "Chocolatine-3B-Instruct-DPO-Revised", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5623, + "hfopenllm_v2/BBH": 0.554, + "hfopenllm_v2/MATH Level 5": 0.1805, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4453, + "hfopenllm_v2/MMLU-PRO": 0.3989 + } + }, + { + "id": "jpacifico/Chocolatine-3B-Instruct-DPO-v1.0", + "name": "Chocolatine-3B-Instruct-DPO-v1.0", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3737, + "hfopenllm_v2/BBH": 0.5471, + "hfopenllm_v2/MATH Level 5": 0.1782, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.4755, + "hfopenllm_v2/MMLU-PRO": 0.3937 + } + }, + { + "id": "jpacifico/Chocolatine-3B-Instruct-DPO-v1.2", + "name": "Chocolatine-3B-Instruct-DPO-v1.2", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5455, + "hfopenllm_v2/BBH": 0.5487, + "hfopenllm_v2/MATH Level 5": 0.2047, + "hfopenllm_v2/GPQA": 0.3389, + "hfopenllm_v2/MUSR": 0.4154, + "hfopenllm_v2/MMLU-PRO": 0.3877 + } + }, + { + "id": "jpacifico/Distilucie-7B-Math-Instruct-DPO-v0.1", + "name": "Distilucie-7B-Math-Instruct-DPO-v0.1", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3048, + "hfopenllm_v2/BBH": 0.3835, + "hfopenllm_v2/MATH Level 5": 0.0257, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.3644, + "hfopenllm_v2/MMLU-PRO": 0.1809 + } + }, + { + "id": "jpacifico/Lucie-7B-Instruct-DPO-v1.1", + "name": "Lucie-7B-Instruct-DPO-v1.1", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3121, + "hfopenllm_v2/BBH": 0.3781, + "hfopenllm_v2/MATH Level 5": 0.0234, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.4016, + "hfopenllm_v2/MMLU-PRO": 0.1838 + } + }, + { + "id": "jpacifico/Lucie-7B-Instruct-DPO-v1.1.3", + "name": "Lucie-7B-Instruct-DPO-v1.1.3", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3045, + "hfopenllm_v2/BBH": 0.3819, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.3818, + "hfopenllm_v2/MMLU-PRO": 0.1764 + } + }, + { + "id": "jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.0", + "name": "Lucie-7B-Instruct-Merged-Model_Stock-v1.0", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3234, + "hfopenllm_v2/BBH": 0.3802, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3844, + "hfopenllm_v2/MMLU-PRO": 0.1871 + } + }, + { + "id": "jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.1", + "name": "Lucie-7B-Instruct-Merged-Model_Stock-v1.1", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3014, + "hfopenllm_v2/BBH": 0.3808, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.375, + "hfopenllm_v2/MMLU-PRO": 0.1862 + } + }, + { + "id": "jpacifico/Lucie-Boosted-7B-Instruct", + "name": "Lucie-Boosted-7B-Instruct", + "developer": "jpacifico", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2566, + "hfopenllm_v2/BBH": 0.3465, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3699, + "hfopenllm_v2/MMLU-PRO": 0.163 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/jsfs11.json b/data/developers/jsfs11.json new file mode 100644 index 0000000000000000000000000000000000000000..d1bd91726362eb30d05bbdabbd0e3d2c21e9e432 --- /dev/null +++ b/data/developers/jsfs11.json @@ -0,0 +1,47 @@ +{ + "developer": "jsfs11", + "models": [ + { + "id": "jsfs11/L3-8B-Stheno-slerp", + "name": "L3-8B-Stheno-slerp", + "developer": "jsfs11", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6752, + "hfopenllm_v2/BBH": 0.5326, + "hfopenllm_v2/MATH Level 5": 0.0989, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.3725, + "hfopenllm_v2/MMLU-PRO": 0.3649 + } + }, + { + "id": "jsfs11/MixtureofMerges-MoE-4x7b-v4", + "name": "MixtureofMerges-MoE-4x7b-v4", + "developer": "jsfs11", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.403, + "hfopenllm_v2/BBH": 0.5169, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4386, + "hfopenllm_v2/MMLU-PRO": 0.3032 + } + }, + { + "id": "jsfs11/MixtureofMerges-MoE-4x7b-v5", + "name": "MixtureofMerges-MoE-4x7b-v5", + "developer": "jsfs11", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4199, + "hfopenllm_v2/BBH": 0.5198, + "hfopenllm_v2/MATH Level 5": 0.0755, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4305, + "hfopenllm_v2/MMLU-PRO": 0.3098 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/kaist-ai.json b/data/developers/kaist-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..cafb4760301ac66a70e5dd87aada79bbc39137de --- /dev/null +++ b/data/developers/kaist-ai.json @@ -0,0 +1,61 @@ +{ + "developer": "kaist-ai", + "models": [ + { + "id": "kaist-ai/janus-7b", + "name": "janus-7b", + "developer": "kaist-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3775, + "hfopenllm_v2/BBH": 0.4694, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.4401, + "hfopenllm_v2/MMLU-PRO": 0.2874 + } + }, + { + "id": "kaist-ai/janus-dpo-7b", + "name": "janus-dpo-7b", + "developer": "kaist-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4003, + "hfopenllm_v2/BBH": 0.4773, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4387, + "hfopenllm_v2/MMLU-PRO": 0.2976 + } + }, + { + "id": "kaist-ai/janus-rm-7b", + "name": "janus-rm-7b", + "developer": "kaist-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1778, + "hfopenllm_v2/BBH": 0.3056, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3883, + "hfopenllm_v2/MMLU-PRO": 0.1126 + } + }, + { + "id": "kaist-ai/mistral-orpo-capybara-7k", + "name": "mistral-orpo-capybara-7k", + "developer": "kaist-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5367, + "hfopenllm_v2/BBH": 0.4489, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.3964, + "hfopenllm_v2/MMLU-PRO": 0.2971 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/kavonalds.json b/data/developers/kavonalds.json new file mode 100644 index 0000000000000000000000000000000000000000..3c38b58473bebadc89fb67cf205ea04b79c73205 --- /dev/null +++ b/data/developers/kavonalds.json @@ -0,0 +1,47 @@ +{ + "developer": "kavonalds", + "models": [ + { + "id": "kavonalds/BunderMaxx-0710", + "name": "BunderMaxx-0710", + "developer": "kavonalds", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3283, + "hfopenllm_v2/BBH": 0.6651, + "hfopenllm_v2/MATH Level 5": 0.068, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3393, + "hfopenllm_v2/MMLU-PRO": 0.1314 + } + }, + { + "id": "kavonalds/BunderMaxx-1010", + "name": "BunderMaxx-1010", + "developer": "kavonalds", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2981, + "hfopenllm_v2/BBH": 0.702, + "hfopenllm_v2/MATH Level 5": 0.105, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3484, + "hfopenllm_v2/MMLU-PRO": 0.1224 + } + }, + { + "id": "kavonalds/Lancer-1-1b-Instruct", + "name": "Lancer-1-1b-Instruct", + "developer": "kavonalds", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5546, + "hfopenllm_v2/BBH": 0.3253, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3144, + "hfopenllm_v2/MMLU-PRO": 0.1568 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/kayfour.json b/data/developers/kayfour.json new file mode 100644 index 0000000000000000000000000000000000000000..102ad7bc9a7c1e3e6a4966af227d9c5ec836ba95 --- /dev/null +++ b/data/developers/kayfour.json @@ -0,0 +1,19 @@ +{ + "developer": "kayfour", + "models": [ + { + "id": "kayfour/T3Q-Qwen2.5-7B-it-KOR-Safe", + "name": "T3Q-Qwen2.5-7B-it-KOR-Safe", + "developer": "kayfour", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6081, + "hfopenllm_v2/BBH": 0.555, + "hfopenllm_v2/MATH Level 5": 0.3761, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4277, + "hfopenllm_v2/MMLU-PRO": 0.4464 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/keeeeenw.json b/data/developers/keeeeenw.json new file mode 100644 index 0000000000000000000000000000000000000000..c92573903f435f299efb6614a4dc6f1767e96059 --- /dev/null +++ b/data/developers/keeeeenw.json @@ -0,0 +1,19 @@ +{ + "developer": "keeeeenw", + "models": [ + { + "id": "keeeeenw/MicroLlama", + "name": "MicroLlama", + "developer": "keeeeenw", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1985, + "hfopenllm_v2/BBH": 0.3007, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3698, + "hfopenllm_v2/MMLU-PRO": 0.1138 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/kekmodel.json b/data/developers/kekmodel.json new file mode 100644 index 0000000000000000000000000000000000000000..6b36fc62a538fa5126cc628205db84e8a3eccaff --- /dev/null +++ b/data/developers/kekmodel.json @@ -0,0 +1,19 @@ +{ + "developer": "kekmodel", + "models": [ + { + "id": "kekmodel/StopCarbon-10.7B-v5", + "name": "StopCarbon-10.7B-v5", + "developer": "kekmodel", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4728, + "hfopenllm_v2/BBH": 0.5178, + "hfopenllm_v2/MATH Level 5": 0.0559, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4019, + "hfopenllm_v2/MMLU-PRO": 0.3157 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/kevin009.json b/data/developers/kevin009.json new file mode 100644 index 0000000000000000000000000000000000000000..ff82ca19e303fe735ca9364c6b5a69e9d239a4df --- /dev/null +++ b/data/developers/kevin009.json @@ -0,0 +1,19 @@ +{ + "developer": "kevin009", + "models": [ + { + "id": "kevin009/llamaRAGdrama", + "name": "llamaRAGdrama", + "developer": "kevin009", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2598, + "hfopenllm_v2/BBH": 0.4007, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.4316, + "hfopenllm_v2/MMLU-PRO": 0.2724 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/khoantap.json b/data/developers/khoantap.json new file mode 100644 index 0000000000000000000000000000000000000000..39542944051aedda7a7c34cbbd69b94b255956a2 --- /dev/null +++ b/data/developers/khoantap.json @@ -0,0 +1,131 @@ +{ + "developer": "khoantap", + "models": [ + { + "id": "khoantap/cheap-moe-merge", + "name": "cheap-moe-merge", + "developer": "khoantap", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4557, + "hfopenllm_v2/BBH": 0.5131, + "hfopenllm_v2/MATH Level 5": 0.0921, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4103, + "hfopenllm_v2/MMLU-PRO": 0.3339 + } + }, + { + "id": "khoantap/llama-3-8b-stock-merge", + "name": "llama-3-8b-stock-merge", + "developer": "khoantap", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4812, + "hfopenllm_v2/BBH": 0.5162, + "hfopenllm_v2/MATH Level 5": 0.1616, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.3946, + "hfopenllm_v2/MMLU-PRO": 0.38 + } + }, + { + "id": "khoantap/llama-breadcrumbs-ties-merge", + "name": "llama-breadcrumbs-ties-merge", + "developer": "khoantap", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2205, + "hfopenllm_v2/BBH": 0.5416, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.4434, + "hfopenllm_v2/MMLU-PRO": 0.3172 + } + }, + { + "id": "khoantap/llama-evolve-ties-best-merge", + "name": "llama-evolve-ties-best-merge", + "developer": "khoantap", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6744, + "hfopenllm_v2/BBH": 0.5414, + "hfopenllm_v2/MATH Level 5": 0.1563, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.3946, + "hfopenllm_v2/MMLU-PRO": 0.386 + } + }, + { + "id": "khoantap/llama-linear-0.5-0.5-1-merge", + "name": "llama-linear-0.5-0.5-1-merge", + "developer": "khoantap", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4812, + "hfopenllm_v2/BBH": 0.5643, + "hfopenllm_v2/MATH Level 5": 0.2054, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4143, + "hfopenllm_v2/MMLU-PRO": 0.3833 + } + }, + { + "id": "khoantap/llama-linear-0.5-1-0.5-merge", + "name": "llama-linear-0.5-1-0.5-merge", + "developer": "khoantap", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5032, + "hfopenllm_v2/BBH": 0.5951, + "hfopenllm_v2/MATH Level 5": 0.148, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4172, + "hfopenllm_v2/MMLU-PRO": 0.369 + } + }, + { + "id": "khoantap/llama-linear-1-0.5-0.5-merge", + "name": "llama-linear-1-0.5-0.5-merge", + "developer": "khoantap", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4515, + "hfopenllm_v2/BBH": 0.5526, + "hfopenllm_v2/MATH Level 5": 0.2477, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4118, + "hfopenllm_v2/MMLU-PRO": 0.3635 + } + }, + { + "id": "khoantap/llama-slerp-merge", + "name": "llama-slerp-merge", + "developer": "khoantap", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.498, + "hfopenllm_v2/BBH": 0.5783, + "hfopenllm_v2/MATH Level 5": 0.0831, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4053, + "hfopenllm_v2/MMLU-PRO": 0.3678 + } + }, + { + "id": "khoantap/moe-out-merge", + "name": "moe-out-merge", + "developer": "khoantap", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4505, + "hfopenllm_v2/BBH": 0.5151, + "hfopenllm_v2/MATH Level 5": 0.0929, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4063, + "hfopenllm_v2/MMLU-PRO": 0.3348 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/khulaifi95.json b/data/developers/khulaifi95.json new file mode 100644 index 0000000000000000000000000000000000000000..61225eeefe15af2d1ba99c65423ebd1c5482c99b --- /dev/null +++ b/data/developers/khulaifi95.json @@ -0,0 +1,19 @@ +{ + "developer": "khulaifi95", + "models": [ + { + "id": "khulaifi95/Llama-3.1-8B-Reason-Blend-888k", + "name": "Llama-3.1-8B-Reason-Blend-888k", + "developer": "khulaifi95", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5832, + "hfopenllm_v2/BBH": 0.479, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3379, + "hfopenllm_v2/MMLU-PRO": 0.31 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/kms7530.json b/data/developers/kms7530.json new file mode 100644 index 0000000000000000000000000000000000000000..3fbbe5339d95906fa350578a1f7278d91d76e541 --- /dev/null +++ b/data/developers/kms7530.json @@ -0,0 +1,61 @@ +{ + "developer": "kms7530", + "models": [ + { + "id": "kms7530/chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1", + "name": "chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1", + "developer": "kms7530", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5455, + "hfopenllm_v2/BBH": 0.4289, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3821, + "hfopenllm_v2/MMLU-PRO": 0.2798 + } + }, + { + "id": "kms7530/chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath", + "name": "chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath", + "developer": "kms7530", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4863, + "hfopenllm_v2/BBH": 0.4987, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.3983, + "hfopenllm_v2/MMLU-PRO": 0.3481 + } + }, + { + "id": "kms7530/chemeng_qwen-math-7b_24_1_100_1", + "name": "chemeng_qwen-math-7b_24_1_100_1", + "developer": "kms7530", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2111, + "hfopenllm_v2/BBH": 0.3578, + "hfopenllm_v2/MATH Level 5": 0.2243, + "hfopenllm_v2/GPQA": 0.2441, + "hfopenllm_v2/MUSR": 0.3687, + "hfopenllm_v2/MMLU-PRO": 0.2158 + } + }, + { + "id": "kms7530/chemeng_qwen-math-7b_24_1_100_1_nonmath", + "name": "chemeng_qwen-math-7b_24_1_100_1_nonmath", + "developer": "kms7530", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2584, + "hfopenllm_v2/BBH": 0.3893, + "hfopenllm_v2/MATH Level 5": 0.3097, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4087, + "hfopenllm_v2/MMLU-PRO": 0.2452 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/kno10.json b/data/developers/kno10.json new file mode 100644 index 0000000000000000000000000000000000000000..018c11f687911e11ce50496c2e0c018416cf3db1 --- /dev/null +++ b/data/developers/kno10.json @@ -0,0 +1,33 @@ +{ + "developer": "kno10", + "models": [ + { + "id": "kno10/ende-chat-0.0.5", + "name": "ende-chat-0.0.5", + "developer": "kno10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3404, + "hfopenllm_v2/BBH": 0.3604, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3938, + "hfopenllm_v2/MMLU-PRO": 0.179 + } + }, + { + "id": "kno10/ende-chat-0.0.7", + "name": "ende-chat-0.0.7", + "developer": "kno10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4401, + "hfopenllm_v2/BBH": 0.3792, + "hfopenllm_v2/MATH Level 5": 0.0174, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3861, + "hfopenllm_v2/MMLU-PRO": 0.1966 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/kyutai.json b/data/developers/kyutai.json new file mode 100644 index 0000000000000000000000000000000000000000..7422ab7fa4a4ab4478ee22b9d7cd718cd595f093 --- /dev/null +++ b/data/developers/kyutai.json @@ -0,0 +1,19 @@ +{ + "developer": "kyutai", + "models": [ + { + "id": "kyutai/helium-1-preview-2b", + "name": "helium-1-preview-2b", + "developer": "kyutai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2614, + "hfopenllm_v2/BBH": 0.3638, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.355, + "hfopenllm_v2/MMLU-PRO": 0.1873 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/kz919.json b/data/developers/kz919.json new file mode 100644 index 0000000000000000000000000000000000000000..f07a8d4ed120988a54dd758837acf314e8b90f22 --- /dev/null +++ b/data/developers/kz919.json @@ -0,0 +1,19 @@ +{ + "developer": "kz919", + "models": [ + { + "id": "kz919/QwQ-0.5B-Distilled-SFT", + "name": "QwQ-0.5B-Distilled-SFT", + "developer": "kz919", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3077, + "hfopenllm_v2/BBH": 0.3256, + "hfopenllm_v2/MATH Level 5": 0.074, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3409, + "hfopenllm_v2/MMLU-PRO": 0.1587 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ladydaina.json b/data/developers/ladydaina.json new file mode 100644 index 0000000000000000000000000000000000000000..ab258388baf01a4d69d83b70adc3bc612368f60b --- /dev/null +++ b/data/developers/ladydaina.json @@ -0,0 +1,19 @@ +{ + "developer": "ladydaina", + "models": [ + { + "id": "ladydaina/ECE-FDF", + "name": "ECE-FDF", + "developer": "ladydaina", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3728, + "hfopenllm_v2/BBH": 0.515, + "hfopenllm_v2/MATH Level 5": 0.0816, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4504, + "hfopenllm_v2/MMLU-PRO": 0.3007 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/laislemke.json b/data/developers/laislemke.json new file mode 100644 index 0000000000000000000000000000000000000000..4805ab06830ed4c83bd1b79bef90811b1eb79627 --- /dev/null +++ b/data/developers/laislemke.json @@ -0,0 +1,19 @@ +{ + "developer": "laislemke", + "models": [ + { + "id": "laislemke/LLaMA-2-vicuna-7b-slerp", + "name": "LLaMA-2-vicuna-7b-slerp", + "developer": "laislemke", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2932, + "hfopenllm_v2/BBH": 0.2986, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3833, + "hfopenllm_v2/MMLU-PRO": 0.1342 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/lalainy.json b/data/developers/lalainy.json new file mode 100644 index 0000000000000000000000000000000000000000..476a1593f27daec458637c1cc3ea80e442afd5f9 --- /dev/null +++ b/data/developers/lalainy.json @@ -0,0 +1,103 @@ +{ + "developer": "lalainy", + "models": [ + { + "id": "lalainy/ECE-PRYMMAL-0.5B-FT-V5-MUSR", + "name": "ECE-PRYMMAL-0.5B-FT-V5-MUSR", + "developer": "lalainy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2138, + "hfopenllm_v2/BBH": 0.3269, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3262, + "hfopenllm_v2/MMLU-PRO": 0.1533 + } + }, + { + "id": "lalainy/ECE-PRYMMAL-0.5B-SLERP-V4", + "name": "ECE-PRYMMAL-0.5B-SLERP-V4", + "developer": "lalainy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1564, + "hfopenllm_v2/BBH": 0.2894, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3789, + "hfopenllm_v2/MMLU-PRO": 0.1169 + } + }, + { + "id": "lalainy/ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1", + "name": "ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1", + "developer": "lalainy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1437, + "hfopenllm_v2/BBH": 0.3032, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2349, + "hfopenllm_v2/MUSR": 0.3646, + "hfopenllm_v2/MMLU-PRO": 0.1121 + } + }, + { + "id": "lalainy/ECE-PRYMMAL-YL-1B-SLERP-V3", + "name": "ECE-PRYMMAL-YL-1B-SLERP-V3", + "developer": "lalainy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.325, + "hfopenllm_v2/BBH": 0.4225, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4213, + "hfopenllm_v2/MMLU-PRO": 0.2931 + } + }, + { + "id": "lalainy/ECE-PRYMMAL-YL-1B-SLERP-V4", + "name": "ECE-PRYMMAL-YL-1B-SLERP-V4", + "developer": "lalainy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3324, + "hfopenllm_v2/BBH": 0.4171, + "hfopenllm_v2/MATH Level 5": 0.1005, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4306, + "hfopenllm_v2/MMLU-PRO": 0.2893 + } + }, + { + "id": "lalainy/ECE-PRYMMAL-YL-6B-SLERP-V1", + "name": "ECE-PRYMMAL-YL-6B-SLERP-V1", + "developer": "lalainy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3264, + "hfopenllm_v2/BBH": 0.4629, + "hfopenllm_v2/MATH Level 5": 0.1269, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4864, + "hfopenllm_v2/MMLU-PRO": 0.3214 + } + }, + { + "id": "lalainy/ECE-PRYMMAL-YL-6B-SLERP-V2", + "name": "ECE-PRYMMAL-YL-6B-SLERP-V2", + "developer": "lalainy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3249, + "hfopenllm_v2/BBH": 0.4629, + "hfopenllm_v2/MATH Level 5": 0.1269, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4864, + "hfopenllm_v2/MMLU-PRO": 0.3214 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/langgptai.json b/data/developers/langgptai.json new file mode 100644 index 0000000000000000000000000000000000000000..3f8c0e341080bbea317c93d76a30adf3cea0fbbf --- /dev/null +++ b/data/developers/langgptai.json @@ -0,0 +1,33 @@ +{ + "developer": "langgptai", + "models": [ + { + "id": "langgptai/Qwen-las-v0.1", + "name": "Qwen-las-v0.1", + "developer": "langgptai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3301, + "hfopenllm_v2/BBH": 0.3893, + "hfopenllm_v2/MATH Level 5": 0.037, + "hfopenllm_v2/GPQA": 0.2466, + "hfopenllm_v2/MUSR": 0.3701, + "hfopenllm_v2/MMLU-PRO": 0.2325 + } + }, + { + "id": "langgptai/qwen1.5-7b-chat-sa-v0.1", + "name": "qwen1.5-7b-chat-sa-v0.1", + "developer": "langgptai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4268, + "hfopenllm_v2/BBH": 0.4325, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.3551, + "hfopenllm_v2/MMLU-PRO": 0.2993 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/lars1234.json b/data/developers/lars1234.json new file mode 100644 index 0000000000000000000000000000000000000000..e1d6249b8cfa9325134174278aafeb88d68057c8 --- /dev/null +++ b/data/developers/lars1234.json @@ -0,0 +1,19 @@ +{ + "developer": "lars1234", + "models": [ + { + "id": "lars1234/Mistral-Small-24B-Instruct-2501-writer", + "name": "Mistral-Small-24B-Instruct-2501-writer", + "developer": "lars1234", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6565, + "hfopenllm_v2/BBH": 0.6733, + "hfopenllm_v2/MATH Level 5": 0.3557, + "hfopenllm_v2/GPQA": 0.3893, + "hfopenllm_v2/MUSR": 0.4645, + "hfopenllm_v2/MMLU-PRO": 0.5448 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/leafspark.json b/data/developers/leafspark.json new file mode 100644 index 0000000000000000000000000000000000000000..600febe95f303ade7fd5feeb906feac8e576ffc3 --- /dev/null +++ b/data/developers/leafspark.json @@ -0,0 +1,19 @@ +{ + "developer": "leafspark", + "models": [ + { + "id": "leafspark/Llama-3.1-8B-MultiReflection-Instruct", + "name": "Llama-3.1-8B-MultiReflection-Instruct", + "developer": "leafspark", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7125, + "hfopenllm_v2/BBH": 0.5009, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3682, + "hfopenllm_v2/MMLU-PRO": 0.3724 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/lemon07r.json b/data/developers/lemon07r.json new file mode 100644 index 0000000000000000000000000000000000000000..5916ffdb5a2cbbd332932220180cd47829bb2fa7 --- /dev/null +++ b/data/developers/lemon07r.json @@ -0,0 +1,243 @@ +{ + "developer": "lemon07r", + "models": [ + { + "id": "lemon07r/Gemma-2-Ataraxy-9B", + "name": "Gemma-2-Ataraxy-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3009, + "hfopenllm_v2/BBH": 0.5931, + "hfopenllm_v2/MATH Level 5": 0.0853, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4424, + "hfopenllm_v2/MMLU-PRO": 0.4226 + } + }, + { + "id": "lemon07r/Gemma-2-Ataraxy-Advanced-9B", + "name": "Gemma-2-Ataraxy-Advanced-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5516, + "hfopenllm_v2/BBH": 0.5889, + "hfopenllm_v2/MATH Level 5": 0.1979, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.3761, + "hfopenllm_v2/MMLU-PRO": 0.4244 + } + }, + { + "id": "lemon07r/Gemma-2-Ataraxy-Remix-9B", + "name": "Gemma-2-Ataraxy-Remix-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7083, + "hfopenllm_v2/BBH": 0.5892, + "hfopenllm_v2/MATH Level 5": 0.2017, + "hfopenllm_v2/GPQA": 0.3389, + "hfopenllm_v2/MUSR": 0.4372, + "hfopenllm_v2/MMLU-PRO": 0.4239 + } + }, + { + "id": "lemon07r/Gemma-2-Ataraxy-v2-9B", + "name": "Gemma-2-Ataraxy-v2-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2136, + "hfopenllm_v2/BBH": 0.5766, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.3423, + "hfopenllm_v2/MUSR": 0.3484, + "hfopenllm_v2/MMLU-PRO": 0.4221 + } + }, + { + "id": "lemon07r/Gemma-2-Ataraxy-v2a-9B", + "name": "Gemma-2-Ataraxy-v2a-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1595, + "hfopenllm_v2/BBH": 0.5182, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.3398, + "hfopenllm_v2/MUSR": 0.3165, + "hfopenllm_v2/MMLU-PRO": 0.3515 + } + }, + { + "id": "lemon07r/Gemma-2-Ataraxy-v2f-9B", + "name": "Gemma-2-Ataraxy-v2f-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3791, + "hfopenllm_v2/BBH": 0.5193, + "hfopenllm_v2/MATH Level 5": 0.1163, + "hfopenllm_v2/GPQA": 0.3389, + "hfopenllm_v2/MUSR": 0.3231, + "hfopenllm_v2/MMLU-PRO": 0.3503 + } + }, + { + "id": "lemon07r/Gemma-2-Ataraxy-v3-Advanced-9B", + "name": "Gemma-2-Ataraxy-v3-Advanced-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6602, + "hfopenllm_v2/BBH": 0.5935, + "hfopenllm_v2/MATH Level 5": 0.1873, + "hfopenllm_v2/GPQA": 0.3364, + "hfopenllm_v2/MUSR": 0.445, + "hfopenllm_v2/MMLU-PRO": 0.4196 + } + }, + { + "id": "lemon07r/Gemma-2-Ataraxy-v3b-9B", + "name": "Gemma-2-Ataraxy-v3b-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6809, + "hfopenllm_v2/BBH": 0.5908, + "hfopenllm_v2/MATH Level 5": 0.2153, + "hfopenllm_v2/GPQA": 0.3331, + "hfopenllm_v2/MUSR": 0.4489, + "hfopenllm_v2/MMLU-PRO": 0.4205 + } + }, + { + "id": "lemon07r/Gemma-2-Ataraxy-v3i-9B", + "name": "Gemma-2-Ataraxy-v3i-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4203, + "hfopenllm_v2/BBH": 0.5626, + "hfopenllm_v2/MATH Level 5": 0.1533, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.3181, + "hfopenllm_v2/MMLU-PRO": 0.4166 + } + }, + { + "id": "lemon07r/Gemma-2-Ataraxy-v3j-9B", + "name": "Gemma-2-Ataraxy-v3j-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4169, + "hfopenllm_v2/BBH": 0.5632, + "hfopenllm_v2/MATH Level 5": 0.1692, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.318, + "hfopenllm_v2/MMLU-PRO": 0.4134 + } + }, + { + "id": "lemon07r/Gemma-2-Ataraxy-v4-Advanced-9B", + "name": "Gemma-2-Ataraxy-v4-Advanced-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7015, + "hfopenllm_v2/BBH": 0.6024, + "hfopenllm_v2/MATH Level 5": 0.2153, + "hfopenllm_v2/GPQA": 0.3389, + "hfopenllm_v2/MUSR": 0.4581, + "hfopenllm_v2/MMLU-PRO": 0.4367 + } + }, + { + "id": "lemon07r/Gemma-2-Ataraxy-v4a-Advanced-9B", + "name": "Gemma-2-Ataraxy-v4a-Advanced-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7135, + "hfopenllm_v2/BBH": 0.5988, + "hfopenllm_v2/MATH Level 5": 0.2115, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.4489, + "hfopenllm_v2/MMLU-PRO": 0.4309 + } + }, + { + "id": "lemon07r/Gemma-2-Ataraxy-v4b-9B", + "name": "Gemma-2-Ataraxy-v4b-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6878, + "hfopenllm_v2/BBH": 0.6039, + "hfopenllm_v2/MATH Level 5": 0.2334, + "hfopenllm_v2/GPQA": 0.3406, + "hfopenllm_v2/MUSR": 0.4555, + "hfopenllm_v2/MMLU-PRO": 0.4357 + } + }, + { + "id": "lemon07r/Gemma-2-Ataraxy-v4c-9B", + "name": "Gemma-2-Ataraxy-v4c-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6945, + "hfopenllm_v2/BBH": 0.6084, + "hfopenllm_v2/MATH Level 5": 0.2266, + "hfopenllm_v2/GPQA": 0.3339, + "hfopenllm_v2/MUSR": 0.4528, + "hfopenllm_v2/MMLU-PRO": 0.4395 + } + }, + { + "id": "lemon07r/Gemma-2-Ataraxy-v4d-9B", + "name": "Gemma-2-Ataraxy-v4d-9B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.725, + "hfopenllm_v2/BBH": 0.6054, + "hfopenllm_v2/MATH Level 5": 0.2334, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4541, + "hfopenllm_v2/MMLU-PRO": 0.4346 + } + }, + { + "id": "lemon07r/Llama-3-RedMagic4-8B", + "name": "Llama-3-RedMagic4-8B", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4864, + "hfopenllm_v2/BBH": 0.4256, + "hfopenllm_v2/MATH Level 5": 0.0899, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3766, + "hfopenllm_v2/MMLU-PRO": 0.3676 + } + }, + { + "id": "lemon07r/llama-3-NeuralMahou-8b", + "name": "llama-3-NeuralMahou-8b", + "developer": "lemon07r", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4901, + "hfopenllm_v2/BBH": 0.4184, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3873, + "hfopenllm_v2/MMLU-PRO": 0.369 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/lesubra.json b/data/developers/lesubra.json new file mode 100644 index 0000000000000000000000000000000000000000..69fc58639a13523de36e07336784157369037d4d --- /dev/null +++ b/data/developers/lesubra.json @@ -0,0 +1,117 @@ +{ + "developer": "lesubra", + "models": [ + { + "id": "lesubra/ECE-EIFFEL-3B", + "name": "ECE-EIFFEL-3B", + "developer": "lesubra", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3469, + "hfopenllm_v2/BBH": 0.5102, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.3314, + "hfopenllm_v2/MUSR": 0.4362, + "hfopenllm_v2/MMLU-PRO": 0.3821 + } + }, + { + "id": "lesubra/ECE-EIFFEL-3Bv2", + "name": "ECE-EIFFEL-3Bv2", + "developer": "lesubra", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3013, + "hfopenllm_v2/BBH": 0.5424, + "hfopenllm_v2/MATH Level 5": 0.1186, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.4443, + "hfopenllm_v2/MMLU-PRO": 0.3999 + } + }, + { + "id": "lesubra/ECE-EIFFEL-3Bv3", + "name": "ECE-EIFFEL-3Bv3", + "developer": "lesubra", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3786, + "hfopenllm_v2/BBH": 0.5469, + "hfopenllm_v2/MATH Level 5": 0.1669, + "hfopenllm_v2/GPQA": 0.3297, + "hfopenllm_v2/MUSR": 0.4675, + "hfopenllm_v2/MMLU-PRO": 0.3975 + } + }, + { + "id": "lesubra/ECE-PRYMMAL-3B-SLERP-V1", + "name": "ECE-PRYMMAL-3B-SLERP-V1", + "developer": "lesubra", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2933, + "hfopenllm_v2/BBH": 0.5341, + "hfopenllm_v2/MATH Level 5": 0.1662, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4595, + "hfopenllm_v2/MMLU-PRO": 0.39 + } + }, + { + "id": "lesubra/ECE-PRYMMAL-3B-SLERP-V2", + "name": "ECE-PRYMMAL-3B-SLERP-V2", + "developer": "lesubra", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2933, + "hfopenllm_v2/BBH": 0.5341, + "hfopenllm_v2/MATH Level 5": 0.1662, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4595, + "hfopenllm_v2/MMLU-PRO": 0.39 + } + }, + { + "id": "lesubra/ECE-PRYMMAL-3B-SLERP_2-V1", + "name": "ECE-PRYMMAL-3B-SLERP_2-V1", + "developer": "lesubra", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3649, + "hfopenllm_v2/BBH": 0.5411, + "hfopenllm_v2/MATH Level 5": 0.1677, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4661, + "hfopenllm_v2/MMLU-PRO": 0.399 + } + }, + { + "id": "lesubra/ECE-PRYMMAL-3B-SLERP_2-V2", + "name": "ECE-PRYMMAL-3B-SLERP_2-V2", + "developer": "lesubra", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3664, + "hfopenllm_v2/BBH": 0.5411, + "hfopenllm_v2/MATH Level 5": 0.1677, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4661, + "hfopenllm_v2/MMLU-PRO": 0.399 + } + }, + { + "id": "lesubra/merge-test", + "name": "merge-test", + "developer": "lesubra", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5383, + "hfopenllm_v2/BBH": 0.524, + "hfopenllm_v2/MATH Level 5": 0.1208, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4419, + "hfopenllm_v2/MMLU-PRO": 0.3874 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/lightblue.json b/data/developers/lightblue.json new file mode 100644 index 0000000000000000000000000000000000000000..f33eda917b9f6e9ecf0fdaa52393b108e888d013 --- /dev/null +++ b/data/developers/lightblue.json @@ -0,0 +1,75 @@ +{ + "developer": "lightblue", + "models": [ + { + "id": "lightblue/suzume-llama-3-8B-multilingual", + "name": "suzume-llama-3-8B-multilingual", + "developer": "lightblue", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6678, + "hfopenllm_v2/BBH": 0.495, + "hfopenllm_v2/MATH Level 5": 0.0944, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3977, + "hfopenllm_v2/MMLU-PRO": 0.3383 + } + }, + { + "id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-full", + "name": "suzume-llama-3-8B-multilingual-orpo-borda-full", + "developer": "lightblue", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5817, + "hfopenllm_v2/BBH": 0.4714, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3222, + "hfopenllm_v2/MMLU-PRO": 0.331 + } + }, + { + "id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-half", + "name": "suzume-llama-3-8B-multilingual-orpo-borda-half", + "developer": "lightblue", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6249, + "hfopenllm_v2/BBH": 0.4707, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.245, + "hfopenllm_v2/MUSR": 0.3516, + "hfopenllm_v2/MMLU-PRO": 0.3614 + } + }, + { + "id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top25", + "name": "suzume-llama-3-8B-multilingual-orpo-borda-top25", + "developer": "lightblue", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6637, + "hfopenllm_v2/BBH": 0.4865, + "hfopenllm_v2/MATH Level 5": 0.1042, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3566, + "hfopenllm_v2/MMLU-PRO": 0.3684 + } + }, + { + "id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top75", + "name": "suzume-llama-3-8B-multilingual-orpo-borda-top75", + "developer": "lightblue", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6687, + "hfopenllm_v2/BBH": 0.4833, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3817, + "hfopenllm_v2/MMLU-PRO": 0.3769 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/lkoenig.json b/data/developers/lkoenig.json new file mode 100644 index 0000000000000000000000000000000000000000..dd120e45fb33ede50cb300ddc13f0d793fe882cc --- /dev/null +++ b/data/developers/lkoenig.json @@ -0,0 +1,159 @@ +{ + "developer": "lkoenig", + "models": [ + { + "id": "lkoenig/BBAI_145_", + "name": "BBAI_145_", + "developer": "lkoenig", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.445, + "hfopenllm_v2/BBH": 0.5567, + "hfopenllm_v2/MATH Level 5": 0.361, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4382, + "hfopenllm_v2/MMLU-PRO": 0.449 + } + }, + { + "id": "lkoenig/BBAI_200_Gemma", + "name": "BBAI_200_Gemma", + "developer": "lkoenig", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0705, + "hfopenllm_v2/BBH": 0.3449, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3631, + "hfopenllm_v2/MMLU-PRO": 0.1679 + } + }, + { + "id": "lkoenig/BBAI_212_QwenLawLo", + "name": "BBAI_212_QwenLawLo", + "developer": "lkoenig", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4566, + "hfopenllm_v2/BBH": 0.5574, + "hfopenllm_v2/MATH Level 5": 0.3603, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.437, + "hfopenllm_v2/MMLU-PRO": 0.4489 + } + }, + { + "id": "lkoenig/BBAI_212_Qwencore", + "name": "BBAI_212_Qwencore", + "developer": "lkoenig", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4384, + "hfopenllm_v2/BBH": 0.5569, + "hfopenllm_v2/MATH Level 5": 0.3489, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4343, + "hfopenllm_v2/MMLU-PRO": 0.449 + } + }, + { + "id": "lkoenig/BBAI_230_Xiaqwen", + "name": "BBAI_230_Xiaqwen", + "developer": "lkoenig", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4649, + "hfopenllm_v2/BBH": 0.5578, + "hfopenllm_v2/MATH Level 5": 0.3663, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4422, + "hfopenllm_v2/MMLU-PRO": 0.4481 + } + }, + { + "id": "lkoenig/BBAI_375_QwenDyancabs", + "name": "BBAI_375_QwenDyancabs", + "developer": "lkoenig", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4566, + "hfopenllm_v2/BBH": 0.5571, + "hfopenllm_v2/MATH Level 5": 0.3776, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4462, + "hfopenllm_v2/MMLU-PRO": 0.4476 + } + }, + { + "id": "lkoenig/BBAI_456_QwenKoen", + "name": "BBAI_456_QwenKoen", + "developer": "lkoenig", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4529, + "hfopenllm_v2/BBH": 0.5553, + "hfopenllm_v2/MATH Level 5": 0.3686, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4395, + "hfopenllm_v2/MMLU-PRO": 0.4469 + } + }, + { + "id": "lkoenig/BBAI_7B_KoenQwenDyan", + "name": "BBAI_7B_KoenQwenDyan", + "developer": "lkoenig", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5807, + "hfopenllm_v2/BBH": 0.5537, + "hfopenllm_v2/MATH Level 5": 0.3739, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.4369, + "hfopenllm_v2/MMLU-PRO": 0.446 + } + }, + { + "id": "lkoenig/BBAI_7B_Qwen2.5koen", + "name": "BBAI_7B_Qwen2.5koen", + "developer": "lkoenig", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.46, + "hfopenllm_v2/BBH": 0.5544, + "hfopenllm_v2/MATH Level 5": 0.3656, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4369, + "hfopenllm_v2/MMLU-PRO": 0.4485 + } + }, + { + "id": "lkoenig/BBAI_7B_QwenDyanKoenLo", + "name": "BBAI_7B_QwenDyanKoenLo", + "developer": "lkoenig", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4663, + "hfopenllm_v2/BBH": 0.5562, + "hfopenllm_v2/MATH Level 5": 0.364, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4343, + "hfopenllm_v2/MMLU-PRO": 0.4465 + } + }, + { + "id": "lkoenig/BBAI_7B_QwenDyancabsLAW", + "name": "BBAI_7B_QwenDyancabsLAW", + "developer": "lkoenig", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.555, + "hfopenllm_v2/BBH": 0.5579, + "hfopenllm_v2/MATH Level 5": 0.3678, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4461, + "hfopenllm_v2/MMLU-PRO": 0.4471 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/llm-blender.json b/data/developers/llm-blender.json new file mode 100644 index 0000000000000000000000000000000000000000..20669383aaa6959bf51fde926169f3d0693eaf73 --- /dev/null +++ b/data/developers/llm-blender.json @@ -0,0 +1,19 @@ +{ + "developer": "llm-blender", + "models": [ + { + "id": "llm-blender/PairRM-hf", + "name": "llm-blender/PairRM-hf", + "developer": "llm-blender", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6087, + "reward-bench/Chat": 0.9022, + "reward-bench/Chat Hard": 0.5219, + "reward-bench/Safety": 0.477, + "reward-bench/Reasoning": 0.4898, + "reward-bench/Prior Sets (0.5 weight)": 0.6961 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/llmat.json b/data/developers/llmat.json new file mode 100644 index 0000000000000000000000000000000000000000..d073eb81547c22e04e5363702192b3a9654d7362 --- /dev/null +++ b/data/developers/llmat.json @@ -0,0 +1,19 @@ +{ + "developer": "llmat", + "models": [ + { + "id": "llmat/Mistral-v0.3-7B-ORPO", + "name": "Mistral-v0.3-7B-ORPO", + "developer": "llmat", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.377, + "hfopenllm_v2/BBH": 0.3978, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3555, + "hfopenllm_v2/MMLU-PRO": 0.2278 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/llnYou.json b/data/developers/llnYou.json new file mode 100644 index 0000000000000000000000000000000000000000..78c52c5cd89fc6383489e38a5b917a1b06465248 --- /dev/null +++ b/data/developers/llnYou.json @@ -0,0 +1,75 @@ +{ + "developer": "llnYou", + "models": [ + { + "id": "llnYou/ECE-PRYMMAL-YL-1B-SLERP-V5", + "name": "ECE-PRYMMAL-YL-1B-SLERP-V5", + "developer": "llnYou", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3313, + "hfopenllm_v2/BBH": 0.4233, + "hfopenllm_v2/MATH Level 5": 0.111, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.3868, + "hfopenllm_v2/MMLU-PRO": 0.2931 + } + }, + { + "id": "llnYou/ECE-PRYMMAL-YL-1B-SLERP-V6", + "name": "ECE-PRYMMAL-YL-1B-SLERP-V6", + "developer": "llnYou", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1388, + "hfopenllm_v2/BBH": 0.3944, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3928, + "hfopenllm_v2/MMLU-PRO": 0.235 + } + }, + { + "id": "llnYou/ECE-PRYMMAL-YL-3B-SLERP-V1", + "name": "ECE-PRYMMAL-YL-3B-SLERP-V1", + "developer": "llnYou", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2346, + "hfopenllm_v2/BBH": 0.4018, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.3364, + "hfopenllm_v2/MMLU-PRO": 0.285 + } + }, + { + "id": "llnYou/ECE-PRYMMAL-YL-3B-SLERP-V2", + "name": "ECE-PRYMMAL-YL-3B-SLERP-V2", + "developer": "llnYou", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2309, + "hfopenllm_v2/BBH": 0.399, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.3588, + "hfopenllm_v2/MMLU-PRO": 0.29 + } + }, + { + "id": "llnYou/ECE-PRYMMAL-YL-3B-SLERP-V3", + "name": "ECE-PRYMMAL-YL-3B-SLERP-V3", + "developer": "llnYou", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3581, + "hfopenllm_v2/BBH": 0.5473, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4361, + "hfopenllm_v2/MMLU-PRO": 0.4043 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/lmsys.json b/data/developers/lmsys.json new file mode 100644 index 0000000000000000000000000000000000000000..96ae1114d73cf5e858c50a6264de31049a4871d1 --- /dev/null +++ b/data/developers/lmsys.json @@ -0,0 +1,93 @@ +{ + "developer": "lmsys", + "models": [ + { + "id": "lmsys/Vicuna-v1.3-13B", + "name": "Vicuna v1.3 13B", + "developer": "lmsys", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.706, + "helm_classic/MMLU": 0.462, + "helm_classic/BoolQ": 0.808, + "helm_classic/NarrativeQA": 0.691, + "helm_classic/NaturalQuestions (open-book)": 0.686, + "helm_classic/QuAC": 0.403, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.385, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.762, + "helm_classic/CivilComments": 0.645, + "helm_classic/RAFT": 0.657 + } + }, + { + "id": "lmsys/Vicuna-v1.3-7B", + "name": "Vicuna v1.3 7B", + "developer": "lmsys", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.625, + "helm_classic/MMLU": 0.434, + "helm_classic/BoolQ": 0.76, + "helm_classic/NarrativeQA": 0.643, + "helm_classic/NaturalQuestions (open-book)": 0.634, + "helm_classic/QuAC": 0.392, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.292, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.916, + "helm_classic/CivilComments": 0.62, + "helm_classic/RAFT": 0.693 + } + }, + { + "id": "lmsys/vicuna-13b-v1.3", + "name": "vicuna-13b-v1.3", + "developer": "lmsys", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3344, + "hfopenllm_v2/BBH": 0.3384, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3727, + "hfopenllm_v2/MMLU-PRO": 0.2243 + } + }, + { + "id": "lmsys/vicuna-7b-v1.3", + "name": "vicuna-7b-v1.3", + "developer": "lmsys", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2909, + "hfopenllm_v2/BBH": 0.3298, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2424, + "hfopenllm_v2/MUSR": 0.3793, + "hfopenllm_v2/MMLU-PRO": 0.1838 + } + }, + { + "id": "lmsys/vicuna-7b-v1.5", + "name": "vicuna-7b-v1.5", + "developer": "lmsys", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2352, + "hfopenllm_v2/BBH": 0.3947, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.4231, + "hfopenllm_v2/MMLU-PRO": 0.2147 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/lodrick-the-lafted.json b/data/developers/lodrick-the-lafted.json new file mode 100644 index 0000000000000000000000000000000000000000..a81c387a5f606d1877cf135dda080cad901adb49 --- /dev/null +++ b/data/developers/lodrick-the-lafted.json @@ -0,0 +1,19 @@ +{ + "developer": "lodrick-the-lafted", + "models": [ + { + "id": "lodrick-the-lafted/llama-3.1-8b-instruct-ortho-v7", + "name": "llama-3.1-8b-instruct-ortho-v7", + "developer": "lodrick-the-lafted", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3515, + "hfopenllm_v2/BBH": 0.3907, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3616, + "hfopenllm_v2/MMLU-PRO": 0.1974 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/lordjia.json b/data/developers/lordjia.json new file mode 100644 index 0000000000000000000000000000000000000000..f199a770c703a5000ae4040a1ad666f3c4cf3c52 --- /dev/null +++ b/data/developers/lordjia.json @@ -0,0 +1,33 @@ +{ + "developer": "lordjia", + "models": [ + { + "id": "lordjia/Llama-3-Cantonese-8B-Instruct", + "name": "Llama-3-Cantonese-8B-Instruct", + "developer": "lordjia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6669, + "hfopenllm_v2/BBH": 0.4814, + "hfopenllm_v2/MATH Level 5": 0.0891, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4046, + "hfopenllm_v2/MMLU-PRO": 0.3515 + } + }, + { + "id": "lordjia/Qwen2-Cantonese-7B-Instruct", + "name": "Qwen2-Cantonese-7B-Instruct", + "developer": "lordjia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5435, + "hfopenllm_v2/BBH": 0.5215, + "hfopenllm_v2/MATH Level 5": 0.256, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4004, + "hfopenllm_v2/MMLU-PRO": 0.3843 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/lt-asset.json b/data/developers/lt-asset.json new file mode 100644 index 0000000000000000000000000000000000000000..0bbb1e10736981f0b0824b3c6f95c492cc8700c4 --- /dev/null +++ b/data/developers/lt-asset.json @@ -0,0 +1,19 @@ +{ + "developer": "lt-asset", + "models": [ + { + "id": "lt-asset/nova-1.3b", + "name": "nova-1.3b", + "developer": "lt-asset", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1214, + "hfopenllm_v2/BBH": 0.317, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.3698, + "hfopenllm_v2/MMLU-PRO": 0.1142 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/lunahr.json b/data/developers/lunahr.json new file mode 100644 index 0000000000000000000000000000000000000000..77a095e0a71c400736fd57c940aee25f3924e8c7 --- /dev/null +++ b/data/developers/lunahr.json @@ -0,0 +1,33 @@ +{ + "developer": "lunahr", + "models": [ + { + "id": "lunahr/thea-3b-50r-u1", + "name": "thea-3b-50r-u1", + "developer": "lunahr", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.603, + "hfopenllm_v2/BBH": 0.4105, + "hfopenllm_v2/MATH Level 5": 0.1042, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3182, + "hfopenllm_v2/MMLU-PRO": 0.2808 + } + }, + { + "id": "lunahr/thea-v2-3b-50r", + "name": "thea-v2-3b-50r", + "developer": "lunahr", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3704, + "hfopenllm_v2/BBH": 0.4194, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3222, + "hfopenllm_v2/MMLU-PRO": 0.2409 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/m42-health.json b/data/developers/m42-health.json new file mode 100644 index 0000000000000000000000000000000000000000..acda9f22f160f934ca72856d70298d1e988d23a6 --- /dev/null +++ b/data/developers/m42-health.json @@ -0,0 +1,19 @@ +{ + "developer": "m42-health", + "models": [ + { + "id": "m42-health/Llama3-Med42-70B", + "name": "Llama3-Med42-70B", + "developer": "m42-health", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6291, + "hfopenllm_v2/BBH": 0.6688, + "hfopenllm_v2/MATH Level 5": 0.2258, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4629, + "hfopenllm_v2/MMLU-PRO": 0.4963 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/macadeliccc.json b/data/developers/macadeliccc.json new file mode 100644 index 0000000000000000000000000000000000000000..88d79cbd6ec61017ed931dd9e4e20bd30d94154f --- /dev/null +++ b/data/developers/macadeliccc.json @@ -0,0 +1,47 @@ +{ + "developer": "macadeliccc", + "models": [ + { + "id": "macadeliccc/Samantha-Qwen-2-7B", + "name": "Samantha-Qwen-2-7B", + "developer": "macadeliccc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4377, + "hfopenllm_v2/BBH": 0.5082, + "hfopenllm_v2/MATH Level 5": 0.2115, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.4799, + "hfopenllm_v2/MMLU-PRO": 0.3779 + } + }, + { + "id": "macadeliccc/magistrate-3.2-3b-base", + "name": "magistrate-3.2-3b-base", + "developer": "macadeliccc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1159, + "hfopenllm_v2/BBH": 0.3343, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3976, + "hfopenllm_v2/MMLU-PRO": 0.1689 + } + }, + { + "id": "macadeliccc/magistrate-3.2-3b-it", + "name": "magistrate-3.2-3b-it", + "developer": "macadeliccc", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2292, + "hfopenllm_v2/BBH": 0.3257, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2475, + "hfopenllm_v2/MUSR": 0.3763, + "hfopenllm_v2/MMLU-PRO": 0.1592 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/magnifi.json b/data/developers/magnifi.json new file mode 100644 index 0000000000000000000000000000000000000000..fefbb71d537ed40535f4c3c5326d83fa9d2fe79a --- /dev/null +++ b/data/developers/magnifi.json @@ -0,0 +1,19 @@ +{ + "developer": "magnifi", + "models": [ + { + "id": "magnifi/Phi3_intent_v56_3_w_unknown_5_lr_0.002", + "name": "Phi3_intent_v56_3_w_unknown_5_lr_0.002", + "developer": "magnifi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2018, + "hfopenllm_v2/BBH": 0.3282, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.4123, + "hfopenllm_v2/MMLU-PRO": 0.1472 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/maldv.json b/data/developers/maldv.json new file mode 100644 index 0000000000000000000000000000000000000000..93e5895b319e228b31a59b6018a49e0d77bddbea --- /dev/null +++ b/data/developers/maldv.json @@ -0,0 +1,103 @@ +{ + "developer": "maldv", + "models": [ + { + "id": "maldv/Awqward2.5-32B-Instruct", + "name": "Awqward2.5-32B-Instruct", + "developer": "maldv", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8255, + "hfopenllm_v2/BBH": 0.6974, + "hfopenllm_v2/MATH Level 5": 0.6231, + "hfopenllm_v2/GPQA": 0.3406, + "hfopenllm_v2/MUSR": 0.4275, + "hfopenllm_v2/MMLU-PRO": 0.5723 + } + }, + { + "id": "maldv/Lytta2.5-32B-Instruct", + "name": "Lytta2.5-32B-Instruct", + "developer": "maldv", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2508, + "hfopenllm_v2/BBH": 0.56, + "hfopenllm_v2/MATH Level 5": 0.3444, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3769, + "hfopenllm_v2/MMLU-PRO": 0.5048 + } + }, + { + "id": "maldv/Qwentile2.5-32B-Instruct", + "name": "Qwentile2.5-32B-Instruct", + "developer": "maldv", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7393, + "hfopenllm_v2/BBH": 0.6963, + "hfopenllm_v2/MATH Level 5": 0.5219, + "hfopenllm_v2/GPQA": 0.3842, + "hfopenllm_v2/MUSR": 0.4682, + "hfopenllm_v2/MMLU-PRO": 0.5879 + } + }, + { + "id": "maldv/badger-kappa-llama-3-8b", + "name": "badger-kappa-llama-3-8b", + "developer": "maldv", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4695, + "hfopenllm_v2/BBH": 0.5085, + "hfopenllm_v2/MATH Level 5": 0.0861, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.3765, + "hfopenllm_v2/MMLU-PRO": 0.3695 + } + }, + { + "id": "maldv/badger-lambda-llama-3-8b", + "name": "badger-lambda-llama-3-8b", + "developer": "maldv", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4861, + "hfopenllm_v2/BBH": 0.4963, + "hfopenllm_v2/MATH Level 5": 0.0944, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.3754, + "hfopenllm_v2/MMLU-PRO": 0.3767 + } + }, + { + "id": "maldv/badger-mu-llama-3-8b", + "name": "badger-mu-llama-3-8b", + "developer": "maldv", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4919, + "hfopenllm_v2/BBH": 0.5143, + "hfopenllm_v2/MATH Level 5": 0.0559, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3555, + "hfopenllm_v2/MMLU-PRO": 0.3674 + } + }, + { + "id": "maldv/badger-writer-llama-3-8b", + "name": "badger-writer-llama-3-8b", + "developer": "maldv", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5303, + "hfopenllm_v2/BBH": 0.4864, + "hfopenllm_v2/MATH Level 5": 0.0755, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3581, + "hfopenllm_v2/MMLU-PRO": 0.376 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/marcuscedricridia.json b/data/developers/marcuscedricridia.json new file mode 100644 index 0000000000000000000000000000000000000000..42b9b90d849d08c39bd80b4a0b17e9b79a9284fe --- /dev/null +++ b/data/developers/marcuscedricridia.json @@ -0,0 +1,565 @@ +{ + "developer": "marcuscedricridia", + "models": [ + { + "id": "marcuscedricridia/Cheng-1", + "name": "Cheng-1", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7789, + "hfopenllm_v2/BBH": 0.5525, + "hfopenllm_v2/MATH Level 5": 0.4894, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4073, + "hfopenllm_v2/MMLU-PRO": 0.4349 + } + }, + { + "id": "marcuscedricridia/Cheng-2", + "name": "Cheng-2", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8337, + "hfopenllm_v2/BBH": 0.6499, + "hfopenllm_v2/MATH Level 5": 0.5438, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4193, + "hfopenllm_v2/MMLU-PRO": 0.5013 + } + }, + { + "id": "marcuscedricridia/Cheng-2-v1.1", + "name": "Cheng-2-v1.1", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.827, + "hfopenllm_v2/BBH": 0.651, + "hfopenllm_v2/MATH Level 5": 0.5393, + "hfopenllm_v2/GPQA": 0.3431, + "hfopenllm_v2/MUSR": 0.4167, + "hfopenllm_v2/MMLU-PRO": 0.5076 + } + }, + { + "id": "marcuscedricridia/Hush-Qwen2.5-7B-MST", + "name": "Hush-Qwen2.5-7B-MST", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7488, + "hfopenllm_v2/BBH": 0.5458, + "hfopenllm_v2/MATH Level 5": 0.4245, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.3914, + "hfopenllm_v2/MMLU-PRO": 0.4163 + } + }, + { + "id": "marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.1", + "name": "Hush-Qwen2.5-7B-MST-v1.1", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7445, + "hfopenllm_v2/BBH": 0.5559, + "hfopenllm_v2/MATH Level 5": 0.4653, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4073, + "hfopenllm_v2/MMLU-PRO": 0.4299 + } + }, + { + "id": "marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.3", + "name": "Hush-Qwen2.5-7B-MST-v1.3", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7043, + "hfopenllm_v2/BBH": 0.5516, + "hfopenllm_v2/MATH Level 5": 0.4758, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4311, + "hfopenllm_v2/MMLU-PRO": 0.444 + } + }, + { + "id": "marcuscedricridia/Hush-Qwen2.5-7B-Preview", + "name": "Hush-Qwen2.5-7B-Preview", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7962, + "hfopenllm_v2/BBH": 0.5431, + "hfopenllm_v2/MATH Level 5": 0.3754, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4298, + "hfopenllm_v2/MMLU-PRO": 0.4364 + } + }, + { + "id": "marcuscedricridia/Hush-Qwen2.5-7B-RP-v1.4-1M", + "name": "Hush-Qwen2.5-7B-RP-v1.4-1M", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7728, + "hfopenllm_v2/BBH": 0.5295, + "hfopenllm_v2/MATH Level 5": 0.3369, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4433, + "hfopenllm_v2/MMLU-PRO": 0.4135 + } + }, + { + "id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.1", + "name": "Hush-Qwen2.5-7B-v1.1", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7889, + "hfopenllm_v2/BBH": 0.5384, + "hfopenllm_v2/MATH Level 5": 0.4381, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4179, + "hfopenllm_v2/MMLU-PRO": 0.4227 + } + }, + { + "id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.2", + "name": "Hush-Qwen2.5-7B-v1.2", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7865, + "hfopenllm_v2/BBH": 0.5403, + "hfopenllm_v2/MATH Level 5": 0.4403, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4219, + "hfopenllm_v2/MMLU-PRO": 0.4197 + } + }, + { + "id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.3", + "name": "Hush-Qwen2.5-7B-v1.3", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7856, + "hfopenllm_v2/BBH": 0.5327, + "hfopenllm_v2/MATH Level 5": 0.3323, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4246, + "hfopenllm_v2/MMLU-PRO": 0.4345 + } + }, + { + "id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.4", + "name": "Hush-Qwen2.5-7B-v1.4", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7835, + "hfopenllm_v2/BBH": 0.5423, + "hfopenllm_v2/MATH Level 5": 0.426, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4232, + "hfopenllm_v2/MMLU-PRO": 0.4195 + } + }, + { + "id": "marcuscedricridia/Qwen2.5-7B-Preview", + "name": "Qwen2.5-7B-Preview", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7679, + "hfopenllm_v2/BBH": 0.536, + "hfopenllm_v2/MATH Level 5": 0.3444, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.414, + "hfopenllm_v2/MMLU-PRO": 0.4258 + } + }, + { + "id": "marcuscedricridia/Yell-Qwen2.5-7B-Preview", + "name": "Yell-Qwen2.5-7B-Preview", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5839, + "hfopenllm_v2/BBH": 0.5371, + "hfopenllm_v2/MATH Level 5": 0.1926, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4046, + "hfopenllm_v2/MMLU-PRO": 0.3798 + } + }, + { + "id": "marcuscedricridia/Yell-Qwen2.5-7B-Preview-v1.1", + "name": "Yell-Qwen2.5-7B-Preview-v1.1", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5757, + "hfopenllm_v2/BBH": 0.5348, + "hfopenllm_v2/MATH Level 5": 0.1896, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4059, + "hfopenllm_v2/MMLU-PRO": 0.3831 + } + }, + { + "id": "marcuscedricridia/absolute-o1-7b", + "name": "absolute-o1-7b", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7516, + "hfopenllm_v2/BBH": 0.5469, + "hfopenllm_v2/MATH Level 5": 0.5083, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4114, + "hfopenllm_v2/MMLU-PRO": 0.4413 + } + }, + { + "id": "marcuscedricridia/cursa-o1-7b", + "name": "cursa-o1-7b", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7628, + "hfopenllm_v2/BBH": 0.5466, + "hfopenllm_v2/MATH Level 5": 0.4955, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4301, + "hfopenllm_v2/MMLU-PRO": 0.4392 + } + }, + { + "id": "marcuscedricridia/cursa-o1-7b-2-28-2025", + "name": "cursa-o1-7b-2-28-2025", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7467, + "hfopenllm_v2/BBH": 0.5384, + "hfopenllm_v2/MATH Level 5": 0.4811, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4273, + "hfopenllm_v2/MMLU-PRO": 0.4365 + } + }, + { + "id": "marcuscedricridia/cursa-o1-7b-v1.1", + "name": "cursa-o1-7b-v1.1", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7528, + "hfopenllm_v2/BBH": 0.5493, + "hfopenllm_v2/MATH Level 5": 0.4985, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4259, + "hfopenllm_v2/MMLU-PRO": 0.4392 + } + }, + { + "id": "marcuscedricridia/cursa-o1-7b-v1.2-normalize-false", + "name": "cursa-o1-7b-v1.2-normalize-false", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7616, + "hfopenllm_v2/BBH": 0.5492, + "hfopenllm_v2/MATH Level 5": 0.4992, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4273, + "hfopenllm_v2/MMLU-PRO": 0.4436 + } + }, + { + "id": "marcuscedricridia/cursor-o1-7b", + "name": "cursor-o1-7b", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4107, + "hfopenllm_v2/BBH": 0.5007, + "hfopenllm_v2/MATH Level 5": 0.1412, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4101, + "hfopenllm_v2/MMLU-PRO": 0.3251 + } + }, + { + "id": "marcuscedricridia/cursorr-o1.2-7b", + "name": "cursorr-o1.2-7b", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.166, + "hfopenllm_v2/BBH": 0.3068, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3538, + "hfopenllm_v2/MMLU-PRO": 0.108 + } + }, + { + "id": "marcuscedricridia/etr1o-explicit-v1.1", + "name": "etr1o-explicit-v1.1", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.288, + "hfopenllm_v2/BBH": 0.3132, + "hfopenllm_v2/MATH Level 5": 0.0045, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4111, + "hfopenllm_v2/MMLU-PRO": 0.1195 + } + }, + { + "id": "marcuscedricridia/etr1o-explicit-v1.2", + "name": "etr1o-explicit-v1.2", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1504, + "hfopenllm_v2/BBH": 0.295, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.4031, + "hfopenllm_v2/MMLU-PRO": 0.1126 + } + }, + { + "id": "marcuscedricridia/etr1o-v1.1", + "name": "etr1o-v1.1", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1597, + "hfopenllm_v2/BBH": 0.31, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.4017, + "hfopenllm_v2/MMLU-PRO": 0.1157 + } + }, + { + "id": "marcuscedricridia/etr1o-v1.2", + "name": "etr1o-v1.2", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7287, + "hfopenllm_v2/BBH": 0.6349, + "hfopenllm_v2/MATH Level 5": 0.3588, + "hfopenllm_v2/GPQA": 0.3758, + "hfopenllm_v2/MUSR": 0.4714, + "hfopenllm_v2/MMLU-PRO": 0.5316 + } + }, + { + "id": "marcuscedricridia/fan-o1-7b", + "name": "fan-o1-7b", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4456, + "hfopenllm_v2/BBH": 0.4849, + "hfopenllm_v2/MATH Level 5": 0.1616, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.3834, + "hfopenllm_v2/MMLU-PRO": 0.3274 + } + }, + { + "id": "marcuscedricridia/olmner-7b", + "name": "olmner-7b", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7254, + "hfopenllm_v2/BBH": 0.5472, + "hfopenllm_v2/MATH Level 5": 0.463, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.438, + "hfopenllm_v2/MMLU-PRO": 0.4309 + } + }, + { + "id": "marcuscedricridia/olmner-della-7b", + "name": "olmner-della-7b", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7637, + "hfopenllm_v2/BBH": 0.5491, + "hfopenllm_v2/MATH Level 5": 0.4962, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4208, + "hfopenllm_v2/MMLU-PRO": 0.4386 + } + }, + { + "id": "marcuscedricridia/olmner-o1-7b", + "name": "olmner-o1-7b", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7528, + "hfopenllm_v2/BBH": 0.5481, + "hfopenllm_v2/MATH Level 5": 0.4924, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4299, + "hfopenllm_v2/MMLU-PRO": 0.4386 + } + }, + { + "id": "marcuscedricridia/olmner-sbr-7b", + "name": "olmner-sbr-7b", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.76, + "hfopenllm_v2/BBH": 0.5462, + "hfopenllm_v2/MATH Level 5": 0.4947, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4154, + "hfopenllm_v2/MMLU-PRO": 0.4412 + } + }, + { + "id": "marcuscedricridia/post-cursa-o1", + "name": "post-cursa-o1", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7628, + "hfopenllm_v2/BBH": 0.548, + "hfopenllm_v2/MATH Level 5": 0.4872, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4351, + "hfopenllm_v2/MMLU-PRO": 0.4361 + } + }, + { + "id": "marcuscedricridia/pre-cursa-o1", + "name": "pre-cursa-o1", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7409, + "hfopenllm_v2/BBH": 0.5462, + "hfopenllm_v2/MATH Level 5": 0.5038, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.426, + "hfopenllm_v2/MMLU-PRO": 0.4424 + } + }, + { + "id": "marcuscedricridia/pre-cursa-o1-v1.2", + "name": "pre-cursa-o1-v1.2", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7549, + "hfopenllm_v2/BBH": 0.5487, + "hfopenllm_v2/MATH Level 5": 0.5068, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4272, + "hfopenllm_v2/MMLU-PRO": 0.4402 + } + }, + { + "id": "marcuscedricridia/pre-cursa-o1-v1.3", + "name": "pre-cursa-o1-v1.3", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7507, + "hfopenllm_v2/BBH": 0.5455, + "hfopenllm_v2/MATH Level 5": 0.5076, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4271, + "hfopenllm_v2/MMLU-PRO": 0.442 + } + }, + { + "id": "marcuscedricridia/pre-cursa-o1-v1.4", + "name": "pre-cursa-o1-v1.4", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7488, + "hfopenllm_v2/BBH": 0.5493, + "hfopenllm_v2/MATH Level 5": 0.4834, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4285, + "hfopenllm_v2/MMLU-PRO": 0.4436 + } + }, + { + "id": "marcuscedricridia/pre-cursa-o1-v1.6", + "name": "pre-cursa-o1-v1.6", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7528, + "hfopenllm_v2/BBH": 0.5473, + "hfopenllm_v2/MATH Level 5": 0.5, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4234, + "hfopenllm_v2/MMLU-PRO": 0.4413 + } + }, + { + "id": "marcuscedricridia/r1o-et", + "name": "r1o-et", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3597, + "hfopenllm_v2/BBH": 0.4209, + "hfopenllm_v2/MATH Level 5": 0.0793, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3579, + "hfopenllm_v2/MMLU-PRO": 0.258 + } + }, + { + "id": "marcuscedricridia/sbr-o1-7b", + "name": "sbr-o1-7b", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7455, + "hfopenllm_v2/BBH": 0.5479, + "hfopenllm_v2/MATH Level 5": 0.4985, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4404, + "hfopenllm_v2/MMLU-PRO": 0.4355 + } + }, + { + "id": "marcuscedricridia/stray-r1o-et", + "name": "stray-r1o-et", + "developer": "marcuscedricridia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1562, + "hfopenllm_v2/BBH": 0.2967, + "hfopenllm_v2/MATH Level 5": 0.0045, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.4086, + "hfopenllm_v2/MMLU-PRO": 0.1094 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/marin-community.json b/data/developers/marin-community.json new file mode 100644 index 0000000000000000000000000000000000000000..ca6eb59d93bda99084a5861cffc99217581f03ee --- /dev/null +++ b/data/developers/marin-community.json @@ -0,0 +1,19 @@ +{ + "developer": "marin-community", + "models": [ + { + "id": "marin-community/marin-8b-instruct", + "name": "Marin 8B Instruct", + "developer": "marin-community", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.325, + "helm_capabilities/MMLU-Pro": 0.188, + "helm_capabilities/GPQA": 0.168, + "helm_capabilities/IFEval": 0.632, + "helm_capabilities/WildBench": 0.477, + "helm_capabilities/Omni-MATH": 0.16 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/matouLeLoup.json b/data/developers/matouLeLoup.json new file mode 100644 index 0000000000000000000000000000000000000000..32ce94b71fbd9954a39bf98b1d00aabe591f2d98 --- /dev/null +++ b/data/developers/matouLeLoup.json @@ -0,0 +1,75 @@ +{ + "developer": "matouLeLoup", + "models": [ + { + "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3", + "name": "ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3", + "developer": "matouLeLoup", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1873, + "hfopenllm_v2/BBH": 0.3239, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3752, + "hfopenllm_v2/MMLU-PRO": 0.172 + } + }, + { + "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis", + "name": "ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis", + "developer": "matouLeLoup", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1873, + "hfopenllm_v2/BBH": 0.3239, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3752, + "hfopenllm_v2/MMLU-PRO": 0.172 + } + }, + { + "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis", + "name": "ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis", + "developer": "matouLeLoup", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1873, + "hfopenllm_v2/BBH": 0.3239, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3752, + "hfopenllm_v2/MMLU-PRO": 0.172 + } + }, + { + "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis", + "name": "ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis", + "developer": "matouLeLoup", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1882, + "hfopenllm_v2/BBH": 0.3233, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3685, + "hfopenllm_v2/MMLU-PRO": 0.172 + } + }, + { + "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", + "name": "ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", + "developer": "matouLeLoup", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1652, + "hfopenllm_v2/BBH": 0.3024, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.4273, + "hfopenllm_v2/MMLU-PRO": 0.1116 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mattshumer.json b/data/developers/mattshumer.json new file mode 100644 index 0000000000000000000000000000000000000000..52e185b44b676e08faebee759b87d1f5beb28242 --- /dev/null +++ b/data/developers/mattshumer.json @@ -0,0 +1,46 @@ +{ + "developer": "mattshumer", + "models": [ + { + "id": "mattshumer/Reflection-70B", + "name": "mattshumer/Reflection-70B", + "developer": "mattshumer", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8422, + "reward-bench/Chat": 0.9749, + "reward-bench/Chat Hard": 0.7061, + "reward-bench/Safety": 0.8318, + "reward-bench/Reasoning": 0.8562 + } + }, + { + "id": "mattshumer/Reflection-Llama-3.1-70B", + "name": "Reflection-Llama-3.1-70B", + "developer": "mattshumer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0045, + "hfopenllm_v2/BBH": 0.645, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.3633, + "hfopenllm_v2/MUSR": 0.4577, + "hfopenllm_v2/MMLU-PRO": 0.4955 + } + }, + { + "id": "mattshumer/ref_70_e3", + "name": "ref_70_e3", + "developer": "mattshumer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6294, + "hfopenllm_v2/BBH": 0.6501, + "hfopenllm_v2/MATH Level 5": 0.2795, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.4328, + "hfopenllm_v2/MMLU-PRO": 0.5303 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/maywell.json b/data/developers/maywell.json new file mode 100644 index 0000000000000000000000000000000000000000..0c4576902a7f911314c03825ca7c9d050bfeaf4f --- /dev/null +++ b/data/developers/maywell.json @@ -0,0 +1,19 @@ +{ + "developer": "maywell", + "models": [ + { + "id": "maywell/Qwen2-7B-Multilingual-RP", + "name": "Qwen2-7B-Multilingual-RP", + "developer": "maywell", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4347, + "hfopenllm_v2/BBH": 0.5062, + "hfopenllm_v2/MATH Level 5": 0.2243, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.3696, + "hfopenllm_v2/MMLU-PRO": 0.3859 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/meditsolutions.json b/data/developers/meditsolutions.json new file mode 100644 index 0000000000000000000000000000000000000000..673c1a5477f36cda6913a10c8f2fc3d71440c0f2 --- /dev/null +++ b/data/developers/meditsolutions.json @@ -0,0 +1,173 @@ +{ + "developer": "meditsolutions", + "models": [ + { + "id": "meditsolutions/Llama-3.1-MedIT-SUN-8B", + "name": "Llama-3.1-MedIT-SUN-8B", + "developer": "meditsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7837, + "hfopenllm_v2/BBH": 0.5187, + "hfopenllm_v2/MATH Level 5": 0.2092, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4056, + "hfopenllm_v2/MMLU-PRO": 0.3916 + } + }, + { + "id": "meditsolutions/Llama-3.2-SUN-1B-Instruct", + "name": "Llama-3.2-SUN-1B-Instruct", + "developer": "meditsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6413, + "hfopenllm_v2/BBH": 0.3474, + "hfopenllm_v2/MATH Level 5": 0.071, + "hfopenllm_v2/GPQA": 0.2424, + "hfopenllm_v2/MUSR": 0.3514, + "hfopenllm_v2/MMLU-PRO": 0.1781 + } + }, + { + "id": "meditsolutions/Llama-3.2-SUN-1B-chat", + "name": "Llama-3.2-SUN-1B-chat", + "developer": "meditsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5482, + "hfopenllm_v2/BBH": 0.3514, + "hfopenllm_v2/MATH Level 5": 0.0642, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3249, + "hfopenllm_v2/MMLU-PRO": 0.1838 + } + }, + { + "id": "meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-26000", + "name": "Llama-3.2-SUN-2.4B-checkpoint-26000", + "developer": "meditsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2814, + "hfopenllm_v2/BBH": 0.3018, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4103, + "hfopenllm_v2/MMLU-PRO": 0.1345 + } + }, + { + "id": "meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-34800", + "name": "Llama-3.2-SUN-2.4B-checkpoint-34800", + "developer": "meditsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2501, + "hfopenllm_v2/BBH": 0.3161, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4022, + "hfopenllm_v2/MMLU-PRO": 0.1357 + } + }, + { + "id": "meditsolutions/Llama-3.2-SUN-2.4B-v1.0.0", + "name": "Llama-3.2-SUN-2.4B-v1.0.0", + "developer": "meditsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5637, + "hfopenllm_v2/BBH": 0.3391, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3209, + "hfopenllm_v2/MMLU-PRO": 0.1543 + } + }, + { + "id": "meditsolutions/Llama-3.2-SUN-2.5B-chat", + "name": "Llama-3.2-SUN-2.5B-chat", + "developer": "meditsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5604, + "hfopenllm_v2/BBH": 0.3575, + "hfopenllm_v2/MATH Level 5": 0.071, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3155, + "hfopenllm_v2/MMLU-PRO": 0.1813 + } + }, + { + "id": "meditsolutions/Llama-3.2-SUN-HDIC-1B-Instruct", + "name": "Llama-3.2-SUN-HDIC-1B-Instruct", + "developer": "meditsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6827, + "hfopenllm_v2/BBH": 0.3508, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.2366, + "hfopenllm_v2/MUSR": 0.3594, + "hfopenllm_v2/MMLU-PRO": 0.1687 + } + }, + { + "id": "meditsolutions/MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune", + "name": "MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune", + "developer": "meditsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3655, + "hfopenllm_v2/BBH": 0.4035, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4253, + "hfopenllm_v2/MMLU-PRO": 0.219 + } + }, + { + "id": "meditsolutions/MSH-v1-Bielik-v2.3-Instruct-MedIT-merge", + "name": "MSH-v1-Bielik-v2.3-Instruct-MedIT-merge", + "developer": "meditsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5814, + "hfopenllm_v2/BBH": 0.5672, + "hfopenllm_v2/MATH Level 5": 0.2077, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4385, + "hfopenllm_v2/MMLU-PRO": 0.35 + } + }, + { + "id": "meditsolutions/MedIT-Mesh-3B-Instruct", + "name": "MedIT-Mesh-3B-Instruct", + "developer": "meditsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5814, + "hfopenllm_v2/BBH": 0.5576, + "hfopenllm_v2/MATH Level 5": 0.2032, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4048, + "hfopenllm_v2/MMLU-PRO": 0.4012 + } + }, + { + "id": "meditsolutions/SmolLM2-MedIT-Upscale-2B", + "name": "SmolLM2-MedIT-Upscale-2B", + "developer": "meditsolutions", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6429, + "hfopenllm_v2/BBH": 0.3551, + "hfopenllm_v2/MATH Level 5": 0.0559, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3314, + "hfopenllm_v2/MMLU-PRO": 0.1971 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/meetkai.json b/data/developers/meetkai.json new file mode 100644 index 0000000000000000000000000000000000000000..d762477ec57ff7e5a541f046ef34cd69f3fb74b9 --- /dev/null +++ b/data/developers/meetkai.json @@ -0,0 +1,19 @@ +{ + "developer": "meetkai", + "models": [ + { + "id": "meetkai/functionary-small-v3.1", + "name": "functionary-small-v3.1", + "developer": "meetkai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6275, + "hfopenllm_v2/BBH": 0.4982, + "hfopenllm_v2/MATH Level 5": 0.1571, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3834, + "hfopenllm_v2/MMLU-PRO": 0.3349 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/meraGPT.json b/data/developers/meraGPT.json new file mode 100644 index 0000000000000000000000000000000000000000..c87574afa87391bfd37d8e239fc87676a126cfb3 --- /dev/null +++ b/data/developers/meraGPT.json @@ -0,0 +1,19 @@ +{ + "developer": "meraGPT", + "models": [ + { + "id": "meraGPT/mera-mix-4x7B", + "name": "mera-mix-4x7B", + "developer": "meraGPT", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4832, + "hfopenllm_v2/BBH": 0.4019, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4057, + "hfopenllm_v2/MMLU-PRO": 0.2748 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mergekit-community.json b/data/developers/mergekit-community.json new file mode 100644 index 0000000000000000000000000000000000000000..a3bc732a5017ba341385820eeec21b196150948d --- /dev/null +++ b/data/developers/mergekit-community.json @@ -0,0 +1,159 @@ +{ + "developer": "mergekit-community", + "models": [ + { + "id": "mergekit-community/JAJUKA-WEWILLNEVERFORGETYOU-3B", + "name": "JAJUKA-WEWILLNEVERFORGETYOU-3B", + "developer": "mergekit-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4941, + "hfopenllm_v2/BBH": 0.437, + "hfopenllm_v2/MATH Level 5": 0.1246, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3656, + "hfopenllm_v2/MMLU-PRO": 0.3033 + } + }, + { + "id": "mergekit-community/SuperQwen-2.5-1.5B", + "name": "SuperQwen-2.5-1.5B", + "developer": "mergekit-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1336, + "hfopenllm_v2/BBH": 0.2907, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3355, + "hfopenllm_v2/MMLU-PRO": 0.1075 + } + }, + { + "id": "mergekit-community/VirtuosoSmall-InstructModelStock", + "name": "VirtuosoSmall-InstructModelStock", + "developer": "mergekit-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5238, + "hfopenllm_v2/BBH": 0.6518, + "hfopenllm_v2/MATH Level 5": 0.4094, + "hfopenllm_v2/GPQA": 0.3826, + "hfopenllm_v2/MUSR": 0.4756, + "hfopenllm_v2/MMLU-PRO": 0.5421 + } + }, + { + "id": "mergekit-community/diabolic6045_ELN-AOC-CAIN", + "name": "diabolic6045_ELN-AOC-CAIN", + "developer": "mergekit-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0862, + "hfopenllm_v2/BBH": 0.3126, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3658, + "hfopenllm_v2/MMLU-PRO": 0.1191 + } + }, + { + "id": "mergekit-community/mergekit-dare_ties-ajgjgea", + "name": "mergekit-dare_ties-ajgjgea", + "developer": "mergekit-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5263, + "hfopenllm_v2/BBH": 0.3495, + "hfopenllm_v2/MATH Level 5": 0.0642, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3289, + "hfopenllm_v2/MMLU-PRO": 0.1744 + } + }, + { + "id": "mergekit-community/mergekit-della-zgowfmf", + "name": "mergekit-della-zgowfmf", + "developer": "mergekit-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4828, + "hfopenllm_v2/BBH": 0.6591, + "hfopenllm_v2/MATH Level 5": 0.3618, + "hfopenllm_v2/GPQA": 0.3901, + "hfopenllm_v2/MUSR": 0.4834, + "hfopenllm_v2/MMLU-PRO": 0.5415 + } + }, + { + "id": "mergekit-community/mergekit-model_stock-azgztvm", + "name": "mergekit-model_stock-azgztvm", + "developer": "mergekit-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5062, + "hfopenllm_v2/BBH": 0.6543, + "hfopenllm_v2/MATH Level 5": 0.4373, + "hfopenllm_v2/GPQA": 0.3817, + "hfopenllm_v2/MUSR": 0.473, + "hfopenllm_v2/MMLU-PRO": 0.5406 + } + }, + { + "id": "mergekit-community/mergekit-slerp-fmrazcr", + "name": "mergekit-slerp-fmrazcr", + "developer": "mergekit-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4174, + "hfopenllm_v2/BBH": 0.5342, + "hfopenllm_v2/MATH Level 5": 0.1193, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4105, + "hfopenllm_v2/MMLU-PRO": 0.3777 + } + }, + { + "id": "mergekit-community/mergekit-ties-rraxdhv", + "name": "mergekit-ties-rraxdhv", + "developer": "mergekit-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1123, + "hfopenllm_v2/BBH": 0.5184, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4202, + "hfopenllm_v2/MMLU-PRO": 0.391 + } + }, + { + "id": "mergekit-community/mergekit-ties-ykqemwr", + "name": "mergekit-ties-ykqemwr", + "developer": "mergekit-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.36, + "hfopenllm_v2/BBH": 0.5455, + "hfopenllm_v2/MATH Level 5": 0.1224, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4198, + "hfopenllm_v2/MMLU-PRO": 0.3734 + } + }, + { + "id": "mergekit-community/sexeh_time_testing", + "name": "sexeh_time_testing", + "developer": "mergekit-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7329, + "hfopenllm_v2/BBH": 0.5241, + "hfopenllm_v2/MATH Level 5": 0.0899, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.3619, + "hfopenllm_v2/MMLU-PRO": 0.3667 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/meta-llama.json b/data/developers/meta-llama.json new file mode 100644 index 0000000000000000000000000000000000000000..a28b7c096e670398e8b74d7b30002bc56044221b --- /dev/null +++ b/data/developers/meta-llama.json @@ -0,0 +1,335 @@ +{ + "developer": "meta-llama", + "models": [ + { + "id": "meta-llama/Llama-2-13b-chat-hf", + "name": "Llama-2-13b-chat-hf", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3985, + "hfopenllm_v2/BBH": 0.3343, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2315, + "hfopenllm_v2/MUSR": 0.4007, + "hfopenllm_v2/MMLU-PRO": 0.1923 + } + }, + { + "id": "meta-llama/Llama-2-13b-hf", + "name": "Llama-2-13b-hf", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2482, + "hfopenllm_v2/BBH": 0.4126, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3538, + "hfopenllm_v2/MMLU-PRO": 0.2378 + } + }, + { + "id": "meta-llama/Llama-2-70b-chat-hf", + "name": "Llama-2-70b-chat-hf", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4958, + "hfopenllm_v2/BBH": 0.3042, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3687, + "hfopenllm_v2/MMLU-PRO": 0.2433 + } + }, + { + "id": "meta-llama/Llama-2-70b-hf", + "name": "Llama-2-70b-hf", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2407, + "hfopenllm_v2/BBH": 0.5473, + "hfopenllm_v2/MATH Level 5": 0.0325, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4124, + "hfopenllm_v2/MMLU-PRO": 0.3718 + } + }, + { + "id": "meta-llama/Llama-2-7b-chat-hf", + "name": "Llama-2-7b-chat-hf", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3986, + "hfopenllm_v2/BBH": 0.3114, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3676, + "hfopenllm_v2/MMLU-PRO": 0.1688 + } + }, + { + "id": "meta-llama/Llama-2-7b-hf", + "name": "Llama-2-7b-hf", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2519, + "hfopenllm_v2/BBH": 0.3496, + "hfopenllm_v2/MATH Level 5": 0.0174, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3701, + "hfopenllm_v2/MMLU-PRO": 0.1861 + } + }, + { + "id": "meta-llama/Llama-3.1-70B", + "name": "Llama-3.1-70B", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1684, + "hfopenllm_v2/BBH": 0.626, + "hfopenllm_v2/MATH Level 5": 0.1843, + "hfopenllm_v2/GPQA": 0.3876, + "hfopenllm_v2/MUSR": 0.4572, + "hfopenllm_v2/MMLU-PRO": 0.4654 + } + }, + { + "id": "meta-llama/Llama-3.1-70B-Instruct", + "name": "Llama-3.1-70B-Instruct", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8669, + "hfopenllm_v2/BBH": 0.6917, + "hfopenllm_v2/MATH Level 5": 0.3807, + "hfopenllm_v2/GPQA": 0.3565, + "hfopenllm_v2/MUSR": 0.4581, + "hfopenllm_v2/MMLU-PRO": 0.5309 + } + }, + { + "id": "meta-llama/Llama-3.1-8B", + "name": "Llama-3.1-8B", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1246, + "hfopenllm_v2/BBH": 0.466, + "hfopenllm_v2/MATH Level 5": 0.0657, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.3812, + "hfopenllm_v2/MMLU-PRO": 0.3288 + } + }, + { + "id": "meta-llama/Llama-3.1-8B-Instruct", + "name": "Llama-3.1-8B-Instruct", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4922, + "hfopenllm_v2/BBH": 0.5087, + "hfopenllm_v2/MATH Level 5": 0.1556, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.3972, + "hfopenllm_v2/MMLU-PRO": 0.3798 + } + }, + { + "id": "meta-llama/Llama-3.2-1B", + "name": "Llama-3.2-1B", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1478, + "hfopenllm_v2/BBH": 0.3115, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2282, + "hfopenllm_v2/MUSR": 0.3447, + "hfopenllm_v2/MMLU-PRO": 0.1203 + } + }, + { + "id": "meta-llama/Llama-3.2-1B-Instruct", + "name": "Llama-3.2-1B-Instruct", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5698, + "hfopenllm_v2/BBH": 0.3497, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3329, + "hfopenllm_v2/MMLU-PRO": 0.1682 + } + }, + { + "id": "meta-llama/Llama-3.2-3B", + "name": "Llama-3.2-3B", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1337, + "hfopenllm_v2/BBH": 0.3905, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3577, + "hfopenllm_v2/MMLU-PRO": 0.2488 + } + }, + { + "id": "meta-llama/Llama-3.2-3B-Instruct", + "name": "Llama-3.2-3B-Instruct", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7393, + "hfopenllm_v2/BBH": 0.461, + "hfopenllm_v2/MATH Level 5": 0.1767, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.3529, + "hfopenllm_v2/MMLU-PRO": 0.3195 + } + }, + { + "id": "meta-llama/Llama-3.3-70B-Instruct", + "name": "Llama-3.3-70B-Instruct", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8998, + "hfopenllm_v2/BBH": 0.6919, + "hfopenllm_v2/MATH Level 5": 0.4834, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4461, + "hfopenllm_v2/MMLU-PRO": 0.5332 + } + }, + { + "id": "meta-llama/Meta-Llama-3-70B", + "name": "Meta-Llama-3-70B", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1603, + "hfopenllm_v2/BBH": 0.6461, + "hfopenllm_v2/MATH Level 5": 0.1858, + "hfopenllm_v2/GPQA": 0.3977, + "hfopenllm_v2/MUSR": 0.4518, + "hfopenllm_v2/MMLU-PRO": 0.4709 + } + }, + { + "id": "meta-llama/Meta-Llama-3-70B-Instruct", + "name": "Meta-Llama-3-70B-Instruct", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8099, + "hfopenllm_v2/BBH": 0.6547, + "hfopenllm_v2/MATH Level 5": 0.2447, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4154, + "hfopenllm_v2/MMLU-PRO": 0.5207, + "reward-bench/Score": 0.7627, + "reward-bench/Chat": 0.9763, + "reward-bench/Chat Hard": 0.5888, + "reward-bench/Safety": 0.7297, + "reward-bench/Reasoning": 0.7854, + "reward-bench/Prior Sets (0.5 weight)": 0.7035 + } + }, + { + "id": "meta-llama/Meta-Llama-3-8B", + "name": "Meta-Llama-3-8B", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1455, + "hfopenllm_v2/BBH": 0.4598, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.3614, + "hfopenllm_v2/MMLU-PRO": 0.321 + } + }, + { + "id": "meta-llama/Meta-Llama-3-8B-Instruct", + "name": "Meta-Llama-3-8B-Instruct", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4782, + "hfopenllm_v2/BBH": 0.491, + "hfopenllm_v2/MATH Level 5": 0.0914, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3805, + "hfopenllm_v2/MMLU-PRO": 0.3591, + "reward-bench/Score": 0.645, + "reward-bench/Chat": 0.8547, + "reward-bench/Chat Hard": 0.4156, + "reward-bench/Safety": 0.6797, + "reward-bench/Reasoning": 0.6482, + "reward-bench/Prior Sets (0.5 weight)": 0.6082 + } + }, + { + "id": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", + "name": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8412, + "reward-bench/Chat": 0.9721, + "reward-bench/Chat Hard": 0.7456, + "reward-bench/Safety": 0.7757, + "reward-bench/Reasoning": 0.8715 + } + }, + { + "id": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "name": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8405, + "reward-bench/Chat": 0.9721, + "reward-bench/Chat Hard": 0.7018, + "reward-bench/Safety": 0.8284, + "reward-bench/Reasoning": 0.8599 + } + }, + { + "id": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", + "name": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7808, + "reward-bench/Chat": 0.8757, + "reward-bench/Chat Hard": 0.6689, + "reward-bench/Safety": 0.7507, + "reward-bench/Reasoning": 0.828 + } + }, + { + "id": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "name": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "developer": "meta-llama", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6565, + "reward-bench/Chat": 0.8073, + "reward-bench/Chat Hard": 0.4978, + "reward-bench/Safety": 0.6399, + "reward-bench/Reasoning": 0.6811 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/meta-metrics.json b/data/developers/meta-metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..de6bbc909c82311f9b8856a2e528b221f7eb309b --- /dev/null +++ b/data/developers/meta-metrics.json @@ -0,0 +1,18 @@ +{ + "developer": "meta-metrics", + "models": [ + { + "id": "meta-metrics/MetaMetrics-RM-v1.0", + "name": "meta-metrics/MetaMetrics-RM-v1.0", + "developer": "meta-metrics", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9342, + "reward-bench/Chat": 0.9832, + "reward-bench/Chat Hard": 0.864, + "reward-bench/Safety": 0.9081, + "reward-bench/Reasoning": 0.9816 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/meta.json b/data/developers/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..de736e49b4d41c787178b80ac40ea100d79fc1c1 --- /dev/null +++ b/data/developers/meta.json @@ -0,0 +1,870 @@ +{ + "developer": "meta", + "models": [ + { + "id": "meta/LLaMA-13B", + "name": "LLaMA 13B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.595, + "helm_classic/MMLU": 0.422, + "helm_classic/BoolQ": 0.714, + "helm_classic/NarrativeQA": 0.711, + "helm_classic/NaturalQuestions (open-book)": 0.614, + "helm_classic/QuAC": 0.347, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.324, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.928, + "helm_classic/CivilComments": 0.6, + "helm_classic/RAFT": 0.643 + } + }, + { + "id": "meta/LLaMA-30B", + "name": "LLaMA 30B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.781, + "helm_classic/MMLU": 0.531, + "helm_classic/BoolQ": 0.861, + "helm_classic/NarrativeQA": 0.752, + "helm_classic/NaturalQuestions (open-book)": 0.666, + "helm_classic/QuAC": 0.39, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.344, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.927, + "helm_classic/CivilComments": 0.549, + "helm_classic/RAFT": 0.752 + } + }, + { + "id": "meta/LLaMA-65B", + "name": "LLaMA 65B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.908, + "helm_classic/MMLU": 0.584, + "helm_classic/BoolQ": 0.871, + "helm_classic/NarrativeQA": 0.755, + "helm_classic/NaturalQuestions (open-book)": 0.672, + "helm_classic/QuAC": 0.401, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.508, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.962, + "helm_classic/CivilComments": 0.655, + "helm_classic/RAFT": 0.702 + } + }, + { + "id": "meta/LLaMA-7B", + "name": "LLaMA 7B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.533, + "helm_classic/MMLU": 0.321, + "helm_classic/BoolQ": 0.756, + "helm_classic/NarrativeQA": 0.669, + "helm_classic/NaturalQuestions (open-book)": 0.589, + "helm_classic/QuAC": 0.338, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.28, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.947, + "helm_classic/CivilComments": 0.563, + "helm_classic/RAFT": 0.573 + } + }, + { + "id": "meta/Llama-2-13B", + "name": "Llama 2 13B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.823, + "helm_classic/MMLU": 0.507, + "helm_classic/BoolQ": 0.811, + "helm_classic/NarrativeQA": 0.744, + "helm_classic/NaturalQuestions (open-book)": 0.637, + "helm_classic/QuAC": 0.424, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.33, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.962, + "helm_classic/CivilComments": 0.588, + "helm_classic/RAFT": 0.707 + } + }, + { + "id": "meta/Llama-2-70B", + "name": "Llama 2 70B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.944, + "helm_classic/MMLU": 0.582, + "helm_classic/BoolQ": 0.886, + "helm_classic/NarrativeQA": 0.77, + "helm_classic/NaturalQuestions (open-book)": 0.674, + "helm_classic/QuAC": 0.484, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.554, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.961, + "helm_classic/CivilComments": 0.652, + "helm_classic/RAFT": 0.727 + } + }, + { + "id": "meta/Llama-2-7B", + "name": "Llama 2 7B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.607, + "helm_classic/MMLU": 0.431, + "helm_classic/BoolQ": 0.762, + "helm_classic/NarrativeQA": 0.691, + "helm_classic/NaturalQuestions (open-book)": 0.611, + "helm_classic/QuAC": 0.406, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.272, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.907, + "helm_classic/CivilComments": 0.562, + "helm_classic/RAFT": 0.643 + } + }, + { + "id": "meta/OPT-175B", + "name": "OPT 175B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.609, + "helm_classic/MMLU": 0.318, + "helm_classic/BoolQ": 0.793, + "helm_classic/NarrativeQA": 0.671, + "helm_classic/NaturalQuestions (open-book)": 0.615, + "helm_classic/QuAC": 0.36, + "helm_classic/HellaSwag": 0.791, + "helm_classic/OpenbookQA": 0.586, + "helm_classic/TruthfulQA": 0.25, + "helm_classic/MS MARCO (TREC)": 0.448, + "helm_classic/CNN/DailyMail": 0.146, + "helm_classic/XSUM": 0.155, + "helm_classic/IMDB": 0.947, + "helm_classic/CivilComments": 0.505, + "helm_classic/RAFT": 0.606 + } + }, + { + "id": "meta/OPT-66B", + "name": "OPT 66B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.448, + "helm_classic/MMLU": 0.276, + "helm_classic/BoolQ": 0.76, + "helm_classic/NarrativeQA": 0.638, + "helm_classic/NaturalQuestions (open-book)": 0.596, + "helm_classic/QuAC": 0.357, + "helm_classic/HellaSwag": 0.745, + "helm_classic/OpenbookQA": 0.534, + "helm_classic/TruthfulQA": 0.201, + "helm_classic/MS MARCO (TREC)": 0.482, + "helm_classic/CNN/DailyMail": 0.136, + "helm_classic/XSUM": 0.126, + "helm_classic/IMDB": 0.917, + "helm_classic/CivilComments": 0.506, + "helm_classic/RAFT": 0.557 + } + }, + { + "id": "meta/llama-2-13b", + "name": "Llama 2 13B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.233, + "helm_lite/NarrativeQA": 0.741, + "helm_lite/NaturalQuestions (closed-book)": 0.371, + "helm_lite/OpenbookQA": 0.634, + "helm_lite/MMLU": 0.505, + "helm_lite/MATH": 0.102, + "helm_lite/GSM8K": 0.266, + "helm_lite/LegalBench": 0.591, + "helm_lite/MedQA": 0.392, + "helm_lite/WMT 2014": 0.167, + "helm_mmlu/MMLU All Subjects": 0.554, + "helm_mmlu/Abstract Algebra": 0.27, + "helm_mmlu/Anatomy": 0.496, + "helm_mmlu/College Physics": 0.235, + "helm_mmlu/Computer Security": 0.69, + "helm_mmlu/Econometrics": 0.307, + "helm_mmlu/Global Facts": 0.38, + "helm_mmlu/Jurisprudence": 0.704, + "helm_mmlu/Philosophy": 0.672, + "helm_mmlu/Professional Psychology": 0.567, + "helm_mmlu/Us Foreign Policy": 0.83, + "helm_mmlu/Astronomy": 0.546, + "helm_mmlu/Business Ethics": 0.55, + "helm_mmlu/Clinical Knowledge": 0.592, + "helm_mmlu/Conceptual Physics": 0.413, + "helm_mmlu/Electrical Engineering": 0.49, + "helm_mmlu/Elementary Mathematics": 0.307, + "helm_mmlu/Formal Logic": 0.381, + "helm_mmlu/High School World History": 0.705, + "helm_mmlu/Human Sexuality": 0.618, + "helm_mmlu/International Law": 0.752, + "helm_mmlu/Logical Fallacies": 0.687, + "helm_mmlu/Machine Learning": 0.286, + "helm_mmlu/Management": 0.738, + "helm_mmlu/Marketing": 0.786, + "helm_mmlu/Medical Genetics": 0.57, + "helm_mmlu/Miscellaneous": 0.748, + "helm_mmlu/Moral Scenarios": 0.407, + "helm_mmlu/Nutrition": 0.627, + "helm_mmlu/Prehistory": 0.654, + "helm_mmlu/Public Relations": 0.6, + "helm_mmlu/Security Studies": 0.608, + "helm_mmlu/Sociology": 0.761, + "helm_mmlu/Virology": 0.476, + "helm_mmlu/World Religions": 0.76, + "helm_mmlu/Mean win rate": 0.502 + } + }, + { + "id": "meta/llama-2-70b", + "name": "Llama 2 70B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.482, + "helm_lite/NarrativeQA": 0.763, + "helm_lite/NaturalQuestions (closed-book)": 0.46, + "helm_lite/OpenbookQA": 0.838, + "helm_lite/MMLU": 0.58, + "helm_lite/MATH": 0.323, + "helm_lite/GSM8K": 0.567, + "helm_lite/LegalBench": 0.673, + "helm_lite/MedQA": 0.618, + "helm_lite/WMT 2014": 0.196, + "helm_mmlu/MMLU All Subjects": 0.695, + "helm_mmlu/Abstract Algebra": 0.31, + "helm_mmlu/Anatomy": 0.607, + "helm_mmlu/College Physics": 0.363, + "helm_mmlu/Computer Security": 0.77, + "helm_mmlu/Econometrics": 0.43, + "helm_mmlu/Global Facts": 0.47, + "helm_mmlu/Jurisprudence": 0.824, + "helm_mmlu/Philosophy": 0.791, + "helm_mmlu/Professional Psychology": 0.76, + "helm_mmlu/Us Foreign Policy": 0.92, + "helm_mmlu/Astronomy": 0.829, + "helm_mmlu/Business Ethics": 0.73, + "helm_mmlu/Clinical Knowledge": 0.717, + "helm_mmlu/Conceptual Physics": 0.668, + "helm_mmlu/Electrical Engineering": 0.634, + "helm_mmlu/Elementary Mathematics": 0.421, + "helm_mmlu/Formal Logic": 0.468, + "helm_mmlu/High School World History": 0.882, + "helm_mmlu/Human Sexuality": 0.84, + "helm_mmlu/International Law": 0.868, + "helm_mmlu/Logical Fallacies": 0.791, + "helm_mmlu/Machine Learning": 0.491, + "helm_mmlu/Management": 0.845, + "helm_mmlu/Marketing": 0.889, + "helm_mmlu/Medical Genetics": 0.72, + "helm_mmlu/Miscellaneous": 0.857, + "helm_mmlu/Moral Scenarios": 0.45, + "helm_mmlu/Nutrition": 0.758, + "helm_mmlu/Prehistory": 0.84, + "helm_mmlu/Public Relations": 0.745, + "helm_mmlu/Security Studies": 0.796, + "helm_mmlu/Sociology": 0.9, + "helm_mmlu/Virology": 0.53, + "helm_mmlu/World Religions": 0.854, + "helm_mmlu/Mean win rate": 0.508 + } + }, + { + "id": "meta/llama-2-7b", + "name": "Llama 2 7B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.152, + "helm_lite/NarrativeQA": 0.686, + "helm_lite/NaturalQuestions (closed-book)": 0.333, + "helm_lite/OpenbookQA": 0.544, + "helm_lite/MMLU": 0.425, + "helm_lite/MATH": 0.097, + "helm_lite/GSM8K": 0.154, + "helm_lite/LegalBench": 0.502, + "helm_lite/MedQA": 0.392, + "helm_lite/WMT 2014": 0.144, + "helm_mmlu/MMLU All Subjects": 0.458, + "helm_mmlu/Abstract Algebra": 0.29, + "helm_mmlu/Anatomy": 0.452, + "helm_mmlu/College Physics": 0.196, + "helm_mmlu/Computer Security": 0.59, + "helm_mmlu/Econometrics": 0.316, + "helm_mmlu/Global Facts": 0.29, + "helm_mmlu/Jurisprudence": 0.519, + "helm_mmlu/Philosophy": 0.592, + "helm_mmlu/Professional Psychology": 0.459, + "helm_mmlu/Us Foreign Policy": 0.64, + "helm_mmlu/Astronomy": 0.408, + "helm_mmlu/Business Ethics": 0.48, + "helm_mmlu/Clinical Knowledge": 0.453, + "helm_mmlu/Conceptual Physics": 0.434, + "helm_mmlu/Electrical Engineering": 0.407, + "helm_mmlu/Elementary Mathematics": 0.254, + "helm_mmlu/Formal Logic": 0.27, + "helm_mmlu/High School World History": 0.662, + "helm_mmlu/Human Sexuality": 0.557, + "helm_mmlu/International Law": 0.628, + "helm_mmlu/Logical Fallacies": 0.466, + "helm_mmlu/Machine Learning": 0.402, + "helm_mmlu/Management": 0.563, + "helm_mmlu/Marketing": 0.697, + "helm_mmlu/Medical Genetics": 0.53, + "helm_mmlu/Miscellaneous": 0.632, + "helm_mmlu/Moral Scenarios": 0.238, + "helm_mmlu/Nutrition": 0.497, + "helm_mmlu/Prehistory": 0.503, + "helm_mmlu/Public Relations": 0.509, + "helm_mmlu/Security Studies": 0.433, + "helm_mmlu/Sociology": 0.617, + "helm_mmlu/Virology": 0.392, + "helm_mmlu/World Religions": 0.713, + "helm_mmlu/Mean win rate": 0.681 + } + }, + { + "id": "meta/llama-3-70b", + "name": "Llama 3 70B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.793, + "helm_lite/NarrativeQA": 0.798, + "helm_lite/NaturalQuestions (closed-book)": 0.475, + "helm_lite/OpenbookQA": 0.934, + "helm_lite/MMLU": 0.695, + "helm_lite/MATH": 0.663, + "helm_lite/GSM8K": 0.805, + "helm_lite/LegalBench": 0.733, + "helm_lite/MedQA": 0.777, + "helm_lite/WMT 2014": 0.225, + "helm_mmlu/MMLU All Subjects": 0.793, + "helm_mmlu/Abstract Algebra": 0.43, + "helm_mmlu/Anatomy": 0.785, + "helm_mmlu/College Physics": 0.529, + "helm_mmlu/Computer Security": 0.85, + "helm_mmlu/Econometrics": 0.693, + "helm_mmlu/Global Facts": 0.49, + "helm_mmlu/Jurisprudence": 0.861, + "helm_mmlu/Philosophy": 0.865, + "helm_mmlu/Professional Psychology": 0.871, + "helm_mmlu/Us Foreign Policy": 0.94, + "helm_mmlu/Astronomy": 0.921, + "helm_mmlu/Business Ethics": 0.83, + "helm_mmlu/Clinical Knowledge": 0.845, + "helm_mmlu/Conceptual Physics": 0.838, + "helm_mmlu/Electrical Engineering": 0.766, + "helm_mmlu/Elementary Mathematics": 0.632, + "helm_mmlu/Formal Logic": 0.651, + "helm_mmlu/High School World History": 0.941, + "helm_mmlu/Human Sexuality": 0.878, + "helm_mmlu/International Law": 0.901, + "helm_mmlu/Logical Fallacies": 0.865, + "helm_mmlu/Machine Learning": 0.714, + "helm_mmlu/Management": 0.913, + "helm_mmlu/Marketing": 0.94, + "helm_mmlu/Medical Genetics": 0.89, + "helm_mmlu/Miscellaneous": 0.917, + "helm_mmlu/Moral Scenarios": 0.598, + "helm_mmlu/Nutrition": 0.876, + "helm_mmlu/Prehistory": 0.91, + "helm_mmlu/Public Relations": 0.727, + "helm_mmlu/Security Studies": 0.833, + "helm_mmlu/Sociology": 0.93, + "helm_mmlu/Virology": 0.59, + "helm_mmlu/World Religions": 0.906, + "helm_mmlu/Mean win rate": 0.524 + } + }, + { + "id": "meta/llama-3-8b", + "name": "Llama 3 8B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.387, + "helm_lite/NarrativeQA": 0.754, + "helm_lite/NaturalQuestions (closed-book)": 0.378, + "helm_lite/OpenbookQA": 0.766, + "helm_lite/MMLU": 0.602, + "helm_lite/MATH": 0.391, + "helm_lite/GSM8K": 0.499, + "helm_lite/LegalBench": 0.637, + "helm_lite/MedQA": 0.581, + "helm_lite/WMT 2014": 0.183, + "helm_mmlu/MMLU All Subjects": 0.668, + "helm_mmlu/Abstract Algebra": 0.33, + "helm_mmlu/Anatomy": 0.696, + "helm_mmlu/College Physics": 0.451, + "helm_mmlu/Computer Security": 0.8, + "helm_mmlu/Econometrics": 0.518, + "helm_mmlu/Global Facts": 0.34, + "helm_mmlu/Jurisprudence": 0.741, + "helm_mmlu/Philosophy": 0.743, + "helm_mmlu/Professional Psychology": 0.711, + "helm_mmlu/Us Foreign Policy": 0.88, + "helm_mmlu/Astronomy": 0.711, + "helm_mmlu/Business Ethics": 0.65, + "helm_mmlu/Clinical Knowledge": 0.751, + "helm_mmlu/Conceptual Physics": 0.557, + "helm_mmlu/Electrical Engineering": 0.669, + "helm_mmlu/Elementary Mathematics": 0.426, + "helm_mmlu/Formal Logic": 0.468, + "helm_mmlu/High School World History": 0.823, + "helm_mmlu/Human Sexuality": 0.748, + "helm_mmlu/International Law": 0.843, + "helm_mmlu/Logical Fallacies": 0.755, + "helm_mmlu/Machine Learning": 0.545, + "helm_mmlu/Management": 0.874, + "helm_mmlu/Marketing": 0.885, + "helm_mmlu/Medical Genetics": 0.83, + "helm_mmlu/Miscellaneous": 0.831, + "helm_mmlu/Moral Scenarios": 0.416, + "helm_mmlu/Nutrition": 0.761, + "helm_mmlu/Prehistory": 0.738, + "helm_mmlu/Public Relations": 0.736, + "helm_mmlu/Security Studies": 0.771, + "helm_mmlu/Sociology": 0.866, + "helm_mmlu/Virology": 0.566, + "helm_mmlu/World Religions": 0.819, + "helm_mmlu/Mean win rate": 0.733 + } + }, + { + "id": "meta/llama-3.1-405b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 405B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.618, + "helm_capabilities/MMLU-Pro": 0.723, + "helm_capabilities/GPQA": 0.522, + "helm_capabilities/IFEval": 0.811, + "helm_capabilities/WildBench": 0.783, + "helm_capabilities/Omni-MATH": 0.249, + "helm_lite/Mean win rate": 0.854, + "helm_lite/NarrativeQA": 0.749, + "helm_lite/NaturalQuestions (closed-book)": 0.456, + "helm_lite/OpenbookQA": 0.94, + "helm_lite/MMLU": 0.759, + "helm_lite/MATH": 0.827, + "helm_lite/GSM8K": 0.949, + "helm_lite/LegalBench": 0.707, + "helm_lite/MedQA": 0.805, + "helm_lite/WMT 2014": 0.238, + "helm_mmlu/MMLU All Subjects": 0.845, + "helm_mmlu/Abstract Algebra": 0.7, + "helm_mmlu/Anatomy": 0.822, + "helm_mmlu/College Physics": 0.696, + "helm_mmlu/Computer Security": 0.81, + "helm_mmlu/Econometrics": 0.746, + "helm_mmlu/Global Facts": 0.71, + "helm_mmlu/Jurisprudence": 0.87, + "helm_mmlu/Philosophy": 0.878, + "helm_mmlu/Professional Psychology": 0.861, + "helm_mmlu/Us Foreign Policy": 0.94, + "helm_mmlu/Astronomy": 0.921, + "helm_mmlu/Business Ethics": 0.81, + "helm_mmlu/Clinical Knowledge": 0.879, + "helm_mmlu/Conceptual Physics": 0.877, + "helm_mmlu/Electrical Engineering": 0.821, + "helm_mmlu/Elementary Mathematics": 0.828, + "helm_mmlu/Formal Logic": 0.698, + "helm_mmlu/High School World History": 0.941, + "helm_mmlu/Human Sexuality": 0.855, + "helm_mmlu/International Law": 0.95, + "helm_mmlu/Logical Fallacies": 0.92, + "helm_mmlu/Machine Learning": 0.795, + "helm_mmlu/Management": 0.893, + "helm_mmlu/Marketing": 0.962, + "helm_mmlu/Medical Genetics": 0.93, + "helm_mmlu/Miscellaneous": 0.939, + "helm_mmlu/Moral Scenarios": 0.876, + "helm_mmlu/Nutrition": 0.928, + "helm_mmlu/Prehistory": 0.929, + "helm_mmlu/Public Relations": 0.818, + "helm_mmlu/Security Studies": 0.857, + "helm_mmlu/Sociology": 0.94, + "helm_mmlu/Virology": 0.572, + "helm_mmlu/World Religions": 0.906, + "helm_mmlu/Mean win rate": 0.33 + } + }, + { + "id": "meta/llama-3.1-70b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 70B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.574, + "helm_capabilities/MMLU-Pro": 0.653, + "helm_capabilities/GPQA": 0.426, + "helm_capabilities/IFEval": 0.821, + "helm_capabilities/WildBench": 0.758, + "helm_capabilities/Omni-MATH": 0.21, + "helm_lite/Mean win rate": 0.808, + "helm_lite/NarrativeQA": 0.772, + "helm_lite/NaturalQuestions (closed-book)": 0.452, + "helm_lite/OpenbookQA": 0.938, + "helm_lite/MMLU": 0.709, + "helm_lite/MATH": 0.783, + "helm_lite/GSM8K": 0.938, + "helm_lite/LegalBench": 0.687, + "helm_lite/MedQA": 0.769, + "helm_lite/WMT 2014": 0.223, + "helm_mmlu/MMLU All Subjects": 0.801, + "helm_mmlu/Abstract Algebra": 0.55, + "helm_mmlu/Anatomy": 0.8, + "helm_mmlu/College Physics": 0.559, + "helm_mmlu/Computer Security": 0.8, + "helm_mmlu/Econometrics": 0.675, + "helm_mmlu/Global Facts": 0.61, + "helm_mmlu/Jurisprudence": 0.889, + "helm_mmlu/Philosophy": 0.833, + "helm_mmlu/Professional Psychology": 0.846, + "helm_mmlu/Us Foreign Policy": 0.93, + "helm_mmlu/Astronomy": 0.908, + "helm_mmlu/Business Ethics": 0.72, + "helm_mmlu/Clinical Knowledge": 0.845, + "helm_mmlu/Conceptual Physics": 0.834, + "helm_mmlu/Electrical Engineering": 0.745, + "helm_mmlu/Elementary Mathematics": 0.701, + "helm_mmlu/Formal Logic": 0.675, + "helm_mmlu/High School World History": 0.937, + "helm_mmlu/Human Sexuality": 0.855, + "helm_mmlu/International Law": 0.926, + "helm_mmlu/Logical Fallacies": 0.84, + "helm_mmlu/Machine Learning": 0.696, + "helm_mmlu/Management": 0.913, + "helm_mmlu/Marketing": 0.936, + "helm_mmlu/Medical Genetics": 0.93, + "helm_mmlu/Miscellaneous": 0.913, + "helm_mmlu/Moral Scenarios": 0.834, + "helm_mmlu/Nutrition": 0.889, + "helm_mmlu/Prehistory": 0.88, + "helm_mmlu/Public Relations": 0.709, + "helm_mmlu/Security Studies": 0.849, + "helm_mmlu/Sociology": 0.92, + "helm_mmlu/Virology": 0.578, + "helm_mmlu/World Religions": 0.895, + "helm_mmlu/Mean win rate": 0.021 + } + }, + { + "id": "meta/llama-3.1-8b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 8B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.444, + "helm_capabilities/MMLU-Pro": 0.406, + "helm_capabilities/GPQA": 0.247, + "helm_capabilities/IFEval": 0.743, + "helm_capabilities/WildBench": 0.686, + "helm_capabilities/Omni-MATH": 0.137, + "helm_lite/Mean win rate": 0.303, + "helm_lite/NarrativeQA": 0.756, + "helm_lite/NaturalQuestions (closed-book)": 0.209, + "helm_lite/OpenbookQA": 0.74, + "helm_lite/MMLU": 0.5, + "helm_lite/MATH": 0.703, + "helm_lite/GSM8K": 0.798, + "helm_lite/LegalBench": 0.342, + "helm_lite/MedQA": 0.245, + "helm_lite/WMT 2014": 0.181, + "helm_mmlu/MMLU All Subjects": 0.561, + "helm_mmlu/Abstract Algebra": 0.26, + "helm_mmlu/Anatomy": 0.459, + "helm_mmlu/College Physics": 0.363, + "helm_mmlu/Computer Security": 0.71, + "helm_mmlu/Econometrics": 0.351, + "helm_mmlu/Global Facts": 0.26, + "helm_mmlu/Jurisprudence": 0.731, + "helm_mmlu/Philosophy": 0.64, + "helm_mmlu/Professional Psychology": 0.649, + "helm_mmlu/Us Foreign Policy": 0.79, + "helm_mmlu/Astronomy": 0.645, + "helm_mmlu/Business Ethics": 0.65, + "helm_mmlu/Clinical Knowledge": 0.615, + "helm_mmlu/Conceptual Physics": 0.528, + "helm_mmlu/Electrical Engineering": 0.441, + "helm_mmlu/Elementary Mathematics": 0.429, + "helm_mmlu/Formal Logic": 0.444, + "helm_mmlu/High School World History": 0.515, + "helm_mmlu/Human Sexuality": 0.733, + "helm_mmlu/International Law": 0.694, + "helm_mmlu/Logical Fallacies": 0.742, + "helm_mmlu/Machine Learning": 0.384, + "helm_mmlu/Management": 0.709, + "helm_mmlu/Marketing": 0.833, + "helm_mmlu/Medical Genetics": 0.66, + "helm_mmlu/Miscellaneous": 0.653, + "helm_mmlu/Moral Scenarios": 0.368, + "helm_mmlu/Nutrition": 0.712, + "helm_mmlu/Prehistory": 0.728, + "helm_mmlu/Public Relations": 0.664, + "helm_mmlu/Security Studies": 0.576, + "helm_mmlu/Sociology": 0.701, + "helm_mmlu/Virology": 0.446, + "helm_mmlu/World Religions": 0.789, + "helm_mmlu/Mean win rate": 0.475 + } + }, + { + "id": "meta/llama-3.2-11b-vision-instruct-turbo", + "name": "Llama 3.2 Vision Instruct Turbo 11B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.325, + "helm_lite/NarrativeQA": 0.756, + "helm_lite/NaturalQuestions (closed-book)": 0.234, + "helm_lite/OpenbookQA": 0.724, + "helm_lite/MMLU": 0.511, + "helm_lite/MATH": 0.739, + "helm_lite/GSM8K": 0.823, + "helm_lite/LegalBench": 0.435, + "helm_lite/MedQA": 0.27, + "helm_lite/WMT 2014": 0.179, + "helm_mmlu/MMLU All Subjects": 0.565, + "helm_mmlu/Abstract Algebra": 0.28, + "helm_mmlu/Anatomy": 0.533, + "helm_mmlu/College Physics": 0.333, + "helm_mmlu/Computer Security": 0.71, + "helm_mmlu/Econometrics": 0.395, + "helm_mmlu/Global Facts": 0.25, + "helm_mmlu/Jurisprudence": 0.722, + "helm_mmlu/Philosophy": 0.646, + "helm_mmlu/Professional Psychology": 0.649, + "helm_mmlu/Us Foreign Policy": 0.78, + "helm_mmlu/Astronomy": 0.671, + "helm_mmlu/Business Ethics": 0.64, + "helm_mmlu/Clinical Knowledge": 0.638, + "helm_mmlu/Conceptual Physics": 0.536, + "helm_mmlu/Electrical Engineering": 0.51, + "helm_mmlu/Elementary Mathematics": 0.458, + "helm_mmlu/Formal Logic": 0.46, + "helm_mmlu/High School World History": 0.502, + "helm_mmlu/Human Sexuality": 0.763, + "helm_mmlu/International Law": 0.711, + "helm_mmlu/Logical Fallacies": 0.742, + "helm_mmlu/Machine Learning": 0.375, + "helm_mmlu/Management": 0.728, + "helm_mmlu/Marketing": 0.838, + "helm_mmlu/Medical Genetics": 0.7, + "helm_mmlu/Miscellaneous": 0.644, + "helm_mmlu/Moral Scenarios": 0.328, + "helm_mmlu/Nutrition": 0.752, + "helm_mmlu/Prehistory": 0.744, + "helm_mmlu/Public Relations": 0.645, + "helm_mmlu/Security Studies": 0.567, + "helm_mmlu/Sociology": 0.627, + "helm_mmlu/Virology": 0.446, + "helm_mmlu/World Religions": 0.696, + "helm_mmlu/Mean win rate": 0.897 + } + }, + { + "id": "meta/llama-3.2-90b-vision-instruct-turbo", + "name": "Llama 3.2 Vision Instruct Turbo 90B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.819, + "helm_lite/NarrativeQA": 0.777, + "helm_lite/NaturalQuestions (closed-book)": 0.457, + "helm_lite/OpenbookQA": 0.942, + "helm_lite/MMLU": 0.703, + "helm_lite/MATH": 0.791, + "helm_lite/GSM8K": 0.936, + "helm_lite/LegalBench": 0.68, + "helm_lite/MedQA": 0.769, + "helm_lite/WMT 2014": 0.224, + "helm_mmlu/MMLU All Subjects": 0.803, + "helm_mmlu/Abstract Algebra": 0.52, + "helm_mmlu/Anatomy": 0.8, + "helm_mmlu/College Physics": 0.539, + "helm_mmlu/Computer Security": 0.81, + "helm_mmlu/Econometrics": 0.684, + "helm_mmlu/Global Facts": 0.6, + "helm_mmlu/Jurisprudence": 0.88, + "helm_mmlu/Philosophy": 0.839, + "helm_mmlu/Professional Psychology": 0.843, + "helm_mmlu/Us Foreign Policy": 0.93, + "helm_mmlu/Astronomy": 0.921, + "helm_mmlu/Business Ethics": 0.76, + "helm_mmlu/Clinical Knowledge": 0.845, + "helm_mmlu/Conceptual Physics": 0.826, + "helm_mmlu/Electrical Engineering": 0.759, + "helm_mmlu/Elementary Mathematics": 0.688, + "helm_mmlu/Formal Logic": 0.683, + "helm_mmlu/High School World History": 0.941, + "helm_mmlu/Human Sexuality": 0.87, + "helm_mmlu/International Law": 0.934, + "helm_mmlu/Logical Fallacies": 0.834, + "helm_mmlu/Machine Learning": 0.688, + "helm_mmlu/Management": 0.913, + "helm_mmlu/Marketing": 0.944, + "helm_mmlu/Medical Genetics": 0.92, + "helm_mmlu/Miscellaneous": 0.913, + "helm_mmlu/Moral Scenarios": 0.841, + "helm_mmlu/Nutrition": 0.889, + "helm_mmlu/Prehistory": 0.886, + "helm_mmlu/Public Relations": 0.718, + "helm_mmlu/Security Studies": 0.853, + "helm_mmlu/Sociology": 0.92, + "helm_mmlu/Virology": 0.584, + "helm_mmlu/World Religions": 0.901, + "helm_mmlu/Mean win rate": 0.773 + } + }, + { + "id": "meta/llama-3.3-70b-instruct-turbo", + "name": "Llama 3.3 Instruct Turbo 70B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.812, + "helm_lite/NarrativeQA": 0.791, + "helm_lite/NaturalQuestions (closed-book)": 0.431, + "helm_lite/OpenbookQA": 0.928, + "helm_lite/MMLU": 0.7, + "helm_lite/MATH": 0.808, + "helm_lite/GSM8K": 0.942, + "helm_lite/LegalBench": 0.725, + "helm_lite/MedQA": 0.761, + "helm_lite/WMT 2014": 0.219, + "helm_mmlu/MMLU All Subjects": 0.791, + "helm_mmlu/Abstract Algebra": 0.5, + "helm_mmlu/Anatomy": 0.778, + "helm_mmlu/College Physics": 0.52, + "helm_mmlu/Computer Security": 0.8, + "helm_mmlu/Econometrics": 0.719, + "helm_mmlu/Global Facts": 0.58, + "helm_mmlu/Jurisprudence": 0.87, + "helm_mmlu/Philosophy": 0.83, + "helm_mmlu/Professional Psychology": 0.845, + "helm_mmlu/Us Foreign Policy": 0.93, + "helm_mmlu/Astronomy": 0.888, + "helm_mmlu/Business Ethics": 0.8, + "helm_mmlu/Clinical Knowledge": 0.83, + "helm_mmlu/Conceptual Physics": 0.821, + "helm_mmlu/Electrical Engineering": 0.745, + "helm_mmlu/Elementary Mathematics": 0.672, + "helm_mmlu/Formal Logic": 0.675, + "helm_mmlu/High School World History": 0.907, + "helm_mmlu/Human Sexuality": 0.855, + "helm_mmlu/International Law": 0.884, + "helm_mmlu/Logical Fallacies": 0.816, + "helm_mmlu/Machine Learning": 0.714, + "helm_mmlu/Management": 0.903, + "helm_mmlu/Marketing": 0.927, + "helm_mmlu/Medical Genetics": 0.9, + "helm_mmlu/Miscellaneous": 0.914, + "helm_mmlu/Moral Scenarios": 0.698, + "helm_mmlu/Nutrition": 0.882, + "helm_mmlu/Prehistory": 0.895, + "helm_mmlu/Public Relations": 0.727, + "helm_mmlu/Security Studies": 0.845, + "helm_mmlu/Sociology": 0.92, + "helm_mmlu/Virology": 0.566, + "helm_mmlu/World Religions": 0.883, + "helm_mmlu/Mean win rate": 0.722 + } + }, + { + "id": "meta/llama-4-maverick-17b-128e-instruct-fp8", + "name": "Llama 4 Maverick 17Bx128E Instruct FP8", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.718, + "helm_capabilities/MMLU-Pro": 0.81, + "helm_capabilities/GPQA": 0.65, + "helm_capabilities/IFEval": 0.908, + "helm_capabilities/WildBench": 0.8, + "helm_capabilities/Omni-MATH": 0.422 + } + }, + { + "id": "meta/llama-4-scout-17b-16e-instruct", + "name": "Llama 4 Scout 17Bx16E Instruct", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.644, + "helm_capabilities/MMLU-Pro": 0.742, + "helm_capabilities/GPQA": 0.507, + "helm_capabilities/IFEval": 0.818, + "helm_capabilities/WildBench": 0.779, + "helm_capabilities/Omni-MATH": 0.373 + } + }, + { + "id": "meta/llama-65b", + "name": "LLaMA 65B", + "developer": "meta", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.345, + "helm_lite/NarrativeQA": 0.755, + "helm_lite/NaturalQuestions (closed-book)": 0.433, + "helm_lite/OpenbookQA": 0.754, + "helm_lite/MMLU": 0.584, + "helm_lite/MATH": 0.257, + "helm_lite/GSM8K": 0.489, + "helm_lite/LegalBench": 0.48, + "helm_lite/MedQA": 0.507, + "helm_lite/WMT 2014": 0.189 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mhl1.json b/data/developers/mhl1.json new file mode 100644 index 0000000000000000000000000000000000000000..936e3681876c7812d0d3b012b89250105f55a50b --- /dev/null +++ b/data/developers/mhl1.json @@ -0,0 +1,19 @@ +{ + "developer": "mhl1", + "models": [ + { + "id": "mhl1/Qwen2.5-0.5B-cinstruct-stage1", + "name": "Qwen2.5-0.5B-cinstruct-stage1", + "developer": "mhl1", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1482, + "hfopenllm_v2/BBH": 0.3256, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.35, + "hfopenllm_v2/MMLU-PRO": 0.1139 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/microsoft.json b/data/developers/microsoft.json new file mode 100644 index 0000000000000000000000000000000000000000..8f4a0b4aafd0e6a24d113b83750e80b0a606d1be --- /dev/null +++ b/data/developers/microsoft.json @@ -0,0 +1,429 @@ +{ + "developer": "microsoft", + "models": [ + { + "id": "microsoft/DialoGPT-medium", + "name": "DialoGPT-medium", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1479, + "hfopenllm_v2/BBH": 0.3014, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.4287, + "hfopenllm_v2/MMLU-PRO": 0.1119 + } + }, + { + "id": "microsoft/Orca-2-13b", + "name": "Orca-2-13b", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3128, + "hfopenllm_v2/BBH": 0.4884, + "hfopenllm_v2/MATH Level 5": 0.0317, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.513, + "hfopenllm_v2/MMLU-PRO": 0.2749 + } + }, + { + "id": "microsoft/Orca-2-7b", + "name": "Orca-2-7b", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2183, + "hfopenllm_v2/BBH": 0.4452, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.5026, + "hfopenllm_v2/MMLU-PRO": 0.2319 + } + }, + { + "id": "microsoft/Phi-3-medium-128k-instruct", + "name": "Phi-3-medium-128k-instruct", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.604, + "hfopenllm_v2/BBH": 0.6382, + "hfopenllm_v2/MATH Level 5": 0.1918, + "hfopenllm_v2/GPQA": 0.3364, + "hfopenllm_v2/MUSR": 0.4129, + "hfopenllm_v2/MMLU-PRO": 0.4712 + } + }, + { + "id": "microsoft/Phi-3-medium-4k-instruct", + "name": "Phi-3-medium-4k-instruct", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6423, + "hfopenllm_v2/BBH": 0.6412, + "hfopenllm_v2/MATH Level 5": 0.1956, + "hfopenllm_v2/GPQA": 0.3364, + "hfopenllm_v2/MUSR": 0.4258, + "hfopenllm_v2/MMLU-PRO": 0.4676 + } + }, + { + "id": "microsoft/Phi-3-mini-128k-instruct", + "name": "Phi-3-mini-128k-instruct", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5976, + "hfopenllm_v2/BBH": 0.5575, + "hfopenllm_v2/MATH Level 5": 0.1405, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.3937, + "hfopenllm_v2/MMLU-PRO": 0.3734 + } + }, + { + "id": "microsoft/Phi-3-mini-4k-instruct", + "name": "Phi-3-mini-4k-instruct", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5613, + "hfopenllm_v2/BBH": 0.5676, + "hfopenllm_v2/MATH Level 5": 0.1163, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.395, + "hfopenllm_v2/MMLU-PRO": 0.3866 + } + }, + { + "id": "microsoft/Phi-3-small-128k-instruct", + "name": "Phi-3-small-128k-instruct", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6368, + "hfopenllm_v2/BBH": 0.6202, + "hfopenllm_v2/MATH Level 5": 0.2026, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4378, + "hfopenllm_v2/MMLU-PRO": 0.4491 + } + }, + { + "id": "microsoft/Phi-3-small-8k-instruct", + "name": "Phi-3-small-8k-instruct", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6497, + "hfopenllm_v2/BBH": 0.6208, + "hfopenllm_v2/MATH Level 5": 0.1887, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4558, + "hfopenllm_v2/MMLU-PRO": 0.4506 + } + }, + { + "id": "microsoft/Phi-3.5-MoE-instruct", + "name": "Phi-3.5-MoE-instruct", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6925, + "hfopenllm_v2/BBH": 0.6408, + "hfopenllm_v2/MATH Level 5": 0.3119, + "hfopenllm_v2/GPQA": 0.3557, + "hfopenllm_v2/MUSR": 0.4565, + "hfopenllm_v2/MMLU-PRO": 0.4658 + } + }, + { + "id": "microsoft/Phi-3.5-mini-instruct", + "name": "Phi-3.5-mini-instruct", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5775, + "hfopenllm_v2/BBH": 0.5518, + "hfopenllm_v2/MATH Level 5": 0.1964, + "hfopenllm_v2/GPQA": 0.3398, + "hfopenllm_v2/MUSR": 0.4021, + "hfopenllm_v2/MMLU-PRO": 0.3962 + } + }, + { + "id": "microsoft/Phi-4-mini-instruct", + "name": "Phi-4-mini-instruct", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7378, + "hfopenllm_v2/BBH": 0.5689, + "hfopenllm_v2/MATH Level 5": 0.1699, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.3873, + "hfopenllm_v2/MMLU-PRO": 0.3932 + } + }, + { + "id": "microsoft/TNLG-v2-530B", + "name": "TNLG v2 530B", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.787, + "helm_classic/MMLU": 0.469, + "helm_classic/BoolQ": 0.809, + "helm_classic/NarrativeQA": 0.722, + "helm_classic/NaturalQuestions (open-book)": 0.642, + "helm_classic/QuAC": 0.39, + "helm_classic/HellaSwag": 0.799, + "helm_classic/OpenbookQA": 0.562, + "helm_classic/TruthfulQA": 0.251, + "helm_classic/MS MARCO (TREC)": 0.643, + "helm_classic/CNN/DailyMail": 0.161, + "helm_classic/XSUM": 0.169, + "helm_classic/IMDB": 0.941, + "helm_classic/CivilComments": 0.601, + "helm_classic/RAFT": 0.679 + } + }, + { + "id": "microsoft/TNLG-v2-6.7B", + "name": "TNLG v2 6.7B", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.309, + "helm_classic/MMLU": 0.242, + "helm_classic/BoolQ": 0.698, + "helm_classic/NarrativeQA": 0.631, + "helm_classic/NaturalQuestions (open-book)": 0.561, + "helm_classic/QuAC": 0.345, + "helm_classic/HellaSwag": 0.704, + "helm_classic/OpenbookQA": 0.478, + "helm_classic/TruthfulQA": 0.167, + "helm_classic/MS MARCO (TREC)": 0.332, + "helm_classic/CNN/DailyMail": 0.146, + "helm_classic/XSUM": 0.11, + "helm_classic/IMDB": 0.927, + "helm_classic/CivilComments": 0.532, + "helm_classic/RAFT": 0.525 + } + }, + { + "id": "microsoft/phi-1", + "name": "phi-1", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2068, + "hfopenllm_v2/BBH": 0.3139, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3525, + "hfopenllm_v2/MMLU-PRO": 0.1162 + } + }, + { + "id": "microsoft/phi-1_5", + "name": "phi-1_5", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2033, + "hfopenllm_v2/BBH": 0.336, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3404, + "hfopenllm_v2/MMLU-PRO": 0.1691 + } + }, + { + "id": "microsoft/phi-2", + "name": "Phi-2", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.169, + "helm_lite/NarrativeQA": 0.703, + "helm_lite/NaturalQuestions (closed-book)": 0.155, + "helm_lite/OpenbookQA": 0.798, + "helm_lite/MMLU": 0.518, + "helm_lite/MATH": 0.255, + "helm_lite/GSM8K": 0.581, + "helm_lite/LegalBench": 0.334, + "helm_lite/MedQA": 0.41, + "helm_lite/WMT 2014": 0.038, + "helm_mmlu/MMLU All Subjects": 0.584, + "helm_mmlu/Abstract Algebra": 0.31, + "helm_mmlu/Anatomy": 0.437, + "helm_mmlu/College Physics": 0.382, + "helm_mmlu/Computer Security": 0.73, + "helm_mmlu/Econometrics": 0.342, + "helm_mmlu/Global Facts": 0.35, + "helm_mmlu/Jurisprudence": 0.694, + "helm_mmlu/Philosophy": 0.598, + "helm_mmlu/Professional Psychology": 0.572, + "helm_mmlu/Us Foreign Policy": 0.78, + "helm_mmlu/Astronomy": 0.605, + "helm_mmlu/Business Ethics": 0.59, + "helm_mmlu/Clinical Knowledge": 0.619, + "helm_mmlu/Conceptual Physics": 0.519, + "helm_mmlu/Electrical Engineering": 0.545, + "helm_mmlu/Elementary Mathematics": 0.463, + "helm_mmlu/Formal Logic": 0.389, + "helm_mmlu/High School World History": 0.73, + "helm_mmlu/Human Sexuality": 0.733, + "helm_mmlu/International Law": 0.752, + "helm_mmlu/Logical Fallacies": 0.767, + "helm_mmlu/Machine Learning": 0.5, + "helm_mmlu/Management": 0.748, + "helm_mmlu/Marketing": 0.833, + "helm_mmlu/Medical Genetics": 0.62, + "helm_mmlu/Miscellaneous": 0.688, + "helm_mmlu/Moral Scenarios": 0.231, + "helm_mmlu/Nutrition": 0.627, + "helm_mmlu/Prehistory": 0.605, + "helm_mmlu/Public Relations": 0.673, + "helm_mmlu/Security Studies": 0.702, + "helm_mmlu/Sociology": 0.816, + "helm_mmlu/Virology": 0.47, + "helm_mmlu/World Religions": 0.702, + "helm_mmlu/Mean win rate": 0.824, + "hfopenllm_v2/IFEval": 0.2739, + "hfopenllm_v2/BBH": 0.4881, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.4099, + "hfopenllm_v2/MMLU-PRO": 0.2628 + } + }, + { + "id": "microsoft/phi-3-medium-4k-instruct", + "name": "Phi-3 14B", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.509, + "helm_lite/NarrativeQA": 0.724, + "helm_lite/NaturalQuestions (closed-book)": 0.278, + "helm_lite/OpenbookQA": 0.916, + "helm_lite/MMLU": 0.675, + "helm_lite/MATH": 0.611, + "helm_lite/GSM8K": 0.878, + "helm_lite/LegalBench": 0.593, + "helm_lite/MedQA": 0.696, + "helm_lite/WMT 2014": 0.17, + "helm_mmlu/MMLU All Subjects": 0.775, + "helm_mmlu/Abstract Algebra": 0.5, + "helm_mmlu/Anatomy": 0.719, + "helm_mmlu/College Physics": 0.529, + "helm_mmlu/Computer Security": 0.79, + "helm_mmlu/Econometrics": 0.614, + "helm_mmlu/Global Facts": 0.5, + "helm_mmlu/Jurisprudence": 0.88, + "helm_mmlu/Philosophy": 0.804, + "helm_mmlu/Professional Psychology": 0.835, + "helm_mmlu/Us Foreign Policy": 0.95, + "helm_mmlu/Astronomy": 0.849, + "helm_mmlu/Business Ethics": 0.8, + "helm_mmlu/Clinical Knowledge": 0.826, + "helm_mmlu/Conceptual Physics": 0.809, + "helm_mmlu/Electrical Engineering": 0.683, + "helm_mmlu/Elementary Mathematics": 0.709, + "helm_mmlu/Formal Logic": 0.587, + "helm_mmlu/High School World History": 0.903, + "helm_mmlu/Human Sexuality": 0.863, + "helm_mmlu/International Law": 0.934, + "helm_mmlu/Logical Fallacies": 0.828, + "helm_mmlu/Machine Learning": 0.696, + "helm_mmlu/Management": 0.864, + "helm_mmlu/Marketing": 0.919, + "helm_mmlu/Medical Genetics": 0.91, + "helm_mmlu/Miscellaneous": 0.894, + "helm_mmlu/Moral Scenarios": 0.639, + "helm_mmlu/Nutrition": 0.837, + "helm_mmlu/Prehistory": 0.867, + "helm_mmlu/Public Relations": 0.755, + "helm_mmlu/Security Studies": 0.829, + "helm_mmlu/Sociology": 0.891, + "helm_mmlu/Virology": 0.554, + "helm_mmlu/World Religions": 0.865, + "helm_mmlu/Mean win rate": 0.015 + } + }, + { + "id": "microsoft/phi-3-small-8k-instruct", + "name": "Phi-3 7B", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.473, + "helm_lite/NarrativeQA": 0.754, + "helm_lite/NaturalQuestions (closed-book)": 0.324, + "helm_lite/OpenbookQA": 0.912, + "helm_lite/MMLU": 0.659, + "helm_lite/MATH": 0.703, + "helm_lite/GSM8K": -1.0, + "helm_lite/LegalBench": 0.584, + "helm_lite/MedQA": 0.672, + "helm_lite/WMT 2014": 0.154, + "helm_mmlu/MMLU All Subjects": 0.757, + "helm_mmlu/Abstract Algebra": 0.44, + "helm_mmlu/Anatomy": 0.726, + "helm_mmlu/College Physics": 0.559, + "helm_mmlu/Computer Security": 0.77, + "helm_mmlu/Econometrics": 0.596, + "helm_mmlu/Global Facts": 0.52, + "helm_mmlu/Jurisprudence": 0.843, + "helm_mmlu/Philosophy": 0.82, + "helm_mmlu/Professional Psychology": 0.835, + "helm_mmlu/Us Foreign Policy": 0.95, + "helm_mmlu/Astronomy": 0.849, + "helm_mmlu/Business Ethics": 0.77, + "helm_mmlu/Clinical Knowledge": 0.83, + "helm_mmlu/Conceptual Physics": 0.779, + "helm_mmlu/Electrical Engineering": 0.69, + "helm_mmlu/Elementary Mathematics": 0.619, + "helm_mmlu/Formal Logic": 0.595, + "helm_mmlu/High School World History": 0.848, + "helm_mmlu/Human Sexuality": 0.817, + "helm_mmlu/International Law": 0.851, + "helm_mmlu/Logical Fallacies": 0.81, + "helm_mmlu/Machine Learning": 0.652, + "helm_mmlu/Management": 0.903, + "helm_mmlu/Marketing": 0.897, + "helm_mmlu/Medical Genetics": 0.84, + "helm_mmlu/Miscellaneous": 0.871, + "helm_mmlu/Moral Scenarios": 0.711, + "helm_mmlu/Nutrition": 0.833, + "helm_mmlu/Prehistory": 0.858, + "helm_mmlu/Public Relations": 0.727, + "helm_mmlu/Security Studies": 0.804, + "helm_mmlu/Sociology": 0.886, + "helm_mmlu/Virology": 0.548, + "helm_mmlu/World Religions": 0.825, + "helm_mmlu/Mean win rate": 0.708 + } + }, + { + "id": "microsoft/phi-4", + "name": "phi-4", + "developer": "microsoft", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0585, + "hfopenllm_v2/BBH": 0.6691, + "hfopenllm_v2/MATH Level 5": 0.3165, + "hfopenllm_v2/GPQA": 0.406, + "hfopenllm_v2/MUSR": 0.5034, + "hfopenllm_v2/MMLU-PRO": 0.5287 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mightbe.json b/data/developers/mightbe.json new file mode 100644 index 0000000000000000000000000000000000000000..ad73c924f7d1a8257a57195d8aa42a6544ae6187 --- /dev/null +++ b/data/developers/mightbe.json @@ -0,0 +1,19 @@ +{ + "developer": "mightbe", + "models": [ + { + "id": "mightbe/Better-PairRM", + "name": "mightbe/Better-PairRM", + "developer": "mightbe", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.673, + "reward-bench/Chat": 0.9553, + "reward-bench/Chat Hard": 0.3925, + "reward-bench/Safety": 0.8203, + "reward-bench/Reasoning": 0.4983, + "reward-bench/Prior Sets (0.5 weight)": 0.724 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/migtissera.json b/data/developers/migtissera.json new file mode 100644 index 0000000000000000000000000000000000000000..758dddd03daa89044ae5d24105c8c1c0de081425 --- /dev/null +++ b/data/developers/migtissera.json @@ -0,0 +1,117 @@ +{ + "developer": "migtissera", + "models": [ + { + "id": "migtissera/Llama-3-70B-Synthia-v3.5", + "name": "Llama-3-70B-Synthia-v3.5", + "developer": "migtissera", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6076, + "hfopenllm_v2/BBH": 0.6489, + "hfopenllm_v2/MATH Level 5": 0.2115, + "hfopenllm_v2/GPQA": 0.3876, + "hfopenllm_v2/MUSR": 0.4922, + "hfopenllm_v2/MMLU-PRO": 0.4658 + } + }, + { + "id": "migtissera/Llama-3-8B-Synthia-v3.5", + "name": "Llama-3-8B-Synthia-v3.5", + "developer": "migtissera", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.507, + "hfopenllm_v2/BBH": 0.4888, + "hfopenllm_v2/MATH Level 5": 0.0657, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.4044, + "hfopenllm_v2/MMLU-PRO": 0.303 + } + }, + { + "id": "migtissera/Tess-3-7B-SFT", + "name": "Tess-3-7B-SFT", + "developer": "migtissera", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3946, + "hfopenllm_v2/BBH": 0.4607, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.4113, + "hfopenllm_v2/MMLU-PRO": 0.3034 + } + }, + { + "id": "migtissera/Tess-3-Mistral-Nemo-12B", + "name": "Tess-3-Mistral-Nemo-12B", + "developer": "migtissera", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3355, + "hfopenllm_v2/BBH": 0.4899, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.4458, + "hfopenllm_v2/MMLU-PRO": 0.2565 + } + }, + { + "id": "migtissera/Tess-v2.5-Phi-3-medium-128k-14B", + "name": "Tess-v2.5-Phi-3-medium-128k-14B", + "developer": "migtissera", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4539, + "hfopenllm_v2/BBH": 0.6207, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4113, + "hfopenllm_v2/MMLU-PRO": 0.3732 + } + }, + { + "id": "migtissera/Tess-v2.5.2-Qwen2-72B", + "name": "Tess-v2.5.2-Qwen2-72B", + "developer": "migtissera", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4494, + "hfopenllm_v2/BBH": 0.6647, + "hfopenllm_v2/MATH Level 5": 0.2938, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4188, + "hfopenllm_v2/MMLU-PRO": 0.5561 + } + }, + { + "id": "migtissera/Trinity-2-Codestral-22B", + "name": "Trinity-2-Codestral-22B", + "developer": "migtissera", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4202, + "hfopenllm_v2/BBH": 0.5593, + "hfopenllm_v2/MATH Level 5": 0.0967, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4111, + "hfopenllm_v2/MMLU-PRO": 0.3308 + } + }, + { + "id": "migtissera/Trinity-2-Codestral-22B-v0.2", + "name": "Trinity-2-Codestral-22B-v0.2", + "developer": "migtissera", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.443, + "hfopenllm_v2/BBH": 0.5706, + "hfopenllm_v2/MATH Level 5": 0.0869, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4031, + "hfopenllm_v2/MMLU-PRO": 0.3354 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mindw96.json b/data/developers/mindw96.json new file mode 100644 index 0000000000000000000000000000000000000000..849de50cbe76021240babe9df68c9bad5c76d358 --- /dev/null +++ b/data/developers/mindw96.json @@ -0,0 +1,19 @@ +{ + "developer": "mindw96", + "models": [ + { + "id": "mindw96/DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3", + "name": "DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3", + "developer": "mindw96", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1388, + "hfopenllm_v2/BBH": 0.3068, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3792, + "hfopenllm_v2/MMLU-PRO": 0.1106 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/minghaowu.json b/data/developers/minghaowu.json new file mode 100644 index 0000000000000000000000000000000000000000..5b01f9cd3a0fe33648498ba5463b6bc8b16a60b7 --- /dev/null +++ b/data/developers/minghaowu.json @@ -0,0 +1,19 @@ +{ + "developer": "minghaowu", + "models": [ + { + "id": "minghaowu/Qwen1.5-1.8B-OpenHermes-2.5", + "name": "Qwen1.5-1.8B-OpenHermes-2.5", + "developer": "minghaowu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2778, + "hfopenllm_v2/BBH": 0.3375, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3529, + "hfopenllm_v2/MMLU-PRO": 0.1792 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/minimax.json b/data/developers/minimax.json new file mode 100644 index 0000000000000000000000000000000000000000..4109f62ad72b732d20b6d28221c6abd3e74fcf6a --- /dev/null +++ b/data/developers/minimax.json @@ -0,0 +1,14 @@ +{ + "developer": "minimax", + "models": [ + { + "id": "minimax/Minimax-2.5", + "name": "Minimax-2.5", + "developer": "minimax", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Corporate Lawyer Mean Score": 0.339 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ministral.json b/data/developers/ministral.json new file mode 100644 index 0000000000000000000000000000000000000000..6955cf075a7bf1b3ace7fe1d40d931e0250d87d9 --- /dev/null +++ b/data/developers/ministral.json @@ -0,0 +1,19 @@ +{ + "developer": "ministral", + "models": [ + { + "id": "ministral/Ministral-3b-instruct", + "name": "Ministral-3b-instruct", + "developer": "ministral", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1358, + "hfopenllm_v2/BBH": 0.3192, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3382, + "hfopenllm_v2/MMLU-PRO": 0.1093 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mistral-community.json b/data/developers/mistral-community.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5d6738731770955843d6671665c5e716c0d4 --- /dev/null +++ b/data/developers/mistral-community.json @@ -0,0 +1,47 @@ +{ + "developer": "mistral-community", + "models": [ + { + "id": "mistral-community/Mistral-7B-v0.2", + "name": "Mistral-7B-v0.2", + "developer": "mistral-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2266, + "hfopenllm_v2/BBH": 0.451, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4032, + "hfopenllm_v2/MMLU-PRO": 0.2953 + } + }, + { + "id": "mistral-community/Mixtral-8x22B-v0.1", + "name": "Mixtral-8x22B-v0.1", + "developer": "mistral-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3167, + "hfopenllm_v2/BBH": 0.38, + "hfopenllm_v2/MATH Level 5": 0.1543, + "hfopenllm_v2/GPQA": 0.33, + "hfopenllm_v2/MUSR": 0.3533, + "hfopenllm_v2/MMLU-PRO": 0.36 + } + }, + { + "id": "mistral-community/mixtral-8x22B-v0.3", + "name": "mixtral-8x22B-v0.3", + "developer": "mistral-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2583, + "hfopenllm_v2/BBH": 0.625, + "hfopenllm_v2/MATH Level 5": 0.1835, + "hfopenllm_v2/GPQA": 0.3775, + "hfopenllm_v2/MUSR": 0.4037, + "hfopenllm_v2/MMLU-PRO": 0.4639 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mistralai.json b/data/developers/mistralai.json new file mode 100644 index 0000000000000000000000000000000000000000..49ed42eb9fa248702f2e286105ac1814c51dacf3 --- /dev/null +++ b/data/developers/mistralai.json @@ -0,0 +1,816 @@ +{ + "developer": "mistralai", + "models": [ + { + "id": "mistralai/Codestral-22B-v0.1", + "name": "Codestral-22B-v0.1", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5772, + "hfopenllm_v2/BBH": 0.5139, + "hfopenllm_v2/MATH Level 5": 0.1005, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4187, + "hfopenllm_v2/MMLU-PRO": 0.3156 + } + }, + { + "id": "mistralai/Ministral-8B-Instruct-2410", + "name": "Ministral-8B-Instruct-2410", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5896, + "hfopenllm_v2/BBH": 0.4762, + "hfopenllm_v2/MATH Level 5": 0.1956, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4138, + "hfopenllm_v2/MMLU-PRO": 0.3291 + } + }, + { + "id": "mistralai/Mistral-7B-Instruct-v0.1", + "name": "Mistral-7B-Instruct-v0.1", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4487, + "hfopenllm_v2/BBH": 0.3355, + "hfopenllm_v2/MATH Level 5": 0.0227, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3848, + "hfopenllm_v2/MMLU-PRO": 0.2414 + } + }, + { + "id": "mistralai/Mistral-7B-Instruct-v0.2", + "name": "Mistral-7B-Instruct-v0.2", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5496, + "hfopenllm_v2/BBH": 0.446, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3966, + "hfopenllm_v2/MMLU-PRO": 0.2717 + } + }, + { + "id": "mistralai/Mistral-7B-Instruct-v0.3", + "name": "Mistral-7B-Instruct-v0.3", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5465, + "hfopenllm_v2/BBH": 0.4722, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3739, + "hfopenllm_v2/MMLU-PRO": 0.3075 + } + }, + { + "id": "mistralai/Mistral-7B-v0.1", + "name": "Mistral-7B-v0.1", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2386, + "hfopenllm_v2/BBH": 0.4419, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4139, + "hfopenllm_v2/MMLU-PRO": 0.3013 + } + }, + { + "id": "mistralai/Mistral-7B-v0.3", + "name": "Mistral-7B-v0.3", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2266, + "hfopenllm_v2/BBH": 0.4517, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4032, + "hfopenllm_v2/MMLU-PRO": 0.2953 + } + }, + { + "id": "mistralai/Mistral-Large-Instruct-2411", + "name": "Mistral-Large-Instruct-2411", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8401, + "hfopenllm_v2/BBH": 0.6747, + "hfopenllm_v2/MATH Level 5": 0.4955, + "hfopenllm_v2/GPQA": 0.4371, + "hfopenllm_v2/MUSR": 0.454, + "hfopenllm_v2/MMLU-PRO": 0.5562 + } + }, + { + "id": "mistralai/Mistral-Nemo-Base-2407", + "name": "Mistral-Nemo-Base-2407", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.163, + "hfopenllm_v2/BBH": 0.5035, + "hfopenllm_v2/MATH Level 5": 0.0597, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.3921, + "hfopenllm_v2/MMLU-PRO": 0.3472 + } + }, + { + "id": "mistralai/Mistral-Nemo-Instruct-2407", + "name": "Mistral-Nemo-Instruct-2407", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.638, + "hfopenllm_v2/BBH": 0.5037, + "hfopenllm_v2/MATH Level 5": 0.1269, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.39, + "hfopenllm_v2/MMLU-PRO": 0.3517 + } + }, + { + "id": "mistralai/Mistral-Small-24B-Base-2501", + "name": "Mistral-Small-24B-Base-2501", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1672, + "hfopenllm_v2/BBH": 0.6442, + "hfopenllm_v2/MATH Level 5": 0.1971, + "hfopenllm_v2/GPQA": 0.3876, + "hfopenllm_v2/MUSR": 0.4237, + "hfopenllm_v2/MMLU-PRO": 0.5406 + } + }, + { + "id": "mistralai/Mistral-Small-Instruct-2409", + "name": "Mistral-Small-Instruct-2409", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6283, + "hfopenllm_v2/BBH": 0.583, + "hfopenllm_v2/MATH Level 5": 0.2039, + "hfopenllm_v2/GPQA": 0.3331, + "hfopenllm_v2/MUSR": 0.4063, + "hfopenllm_v2/MMLU-PRO": 0.4099 + } + }, + { + "id": "mistralai/Mistral-v0.1-7B", + "name": "Mistral v0.1 7B", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.884, + "helm_classic/MMLU": 0.572, + "helm_classic/BoolQ": 0.874, + "helm_classic/NarrativeQA": 0.716, + "helm_classic/NaturalQuestions (open-book)": 0.687, + "helm_classic/QuAC": 0.423, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.422, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.962, + "helm_classic/CivilComments": 0.624, + "helm_classic/RAFT": 0.707 + } + }, + { + "id": "mistralai/Mixtral-8x22B-Instruct-v0.1", + "name": "Mixtral-8x22B-Instruct-v0.1", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7184, + "hfopenllm_v2/BBH": 0.6125, + "hfopenllm_v2/MATH Level 5": 0.1873, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.4311, + "hfopenllm_v2/MMLU-PRO": 0.4483 + } + }, + { + "id": "mistralai/Mixtral-8x22B-v0.1", + "name": "Mixtral-8x22B-v0.1", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2583, + "hfopenllm_v2/BBH": 0.624, + "hfopenllm_v2/MATH Level 5": 0.1835, + "hfopenllm_v2/GPQA": 0.3758, + "hfopenllm_v2/MUSR": 0.4037, + "hfopenllm_v2/MMLU-PRO": 0.4639 + } + }, + { + "id": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "name": "Mixtral-8x7B-Instruct-v0.1", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5599, + "hfopenllm_v2/BBH": 0.4962, + "hfopenllm_v2/MATH Level 5": 0.0914, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4203, + "hfopenllm_v2/MMLU-PRO": 0.3692, + "reward-bench/Score": 0.7455, + "reward-bench/Chat": 0.9497, + "reward-bench/Chat Hard": 0.6404, + "reward-bench/Safety": 0.7257, + "reward-bench/Reasoning": 0.7872, + "reward-bench/Prior Sets (0.5 weight)": 0.5033 + } + }, + { + "id": "mistralai/Mixtral-8x7B-v0.1", + "name": "Mixtral-8x7B-v0.1", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2326, + "hfopenllm_v2/BBH": 0.5098, + "hfopenllm_v2/MATH Level 5": 0.0937, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4413, + "hfopenllm_v2/MMLU-PRO": 0.3871 + } + }, + { + "id": "mistralai/mistral-7b-instruct-v0.3", + "name": "Mistral Instruct v0.3 7B", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.376, + "helm_capabilities/MMLU-Pro": 0.277, + "helm_capabilities/GPQA": 0.303, + "helm_capabilities/IFEval": 0.567, + "helm_capabilities/WildBench": 0.66, + "helm_capabilities/Omni-MATH": 0.072, + "helm_lite/Mean win rate": 0.196, + "helm_lite/NarrativeQA": 0.716, + "helm_lite/NaturalQuestions (closed-book)": 0.253, + "helm_lite/OpenbookQA": 0.79, + "helm_lite/MMLU": 0.51, + "helm_lite/MATH": 0.289, + "helm_lite/GSM8K": 0.538, + "helm_lite/LegalBench": 0.331, + "helm_lite/MedQA": 0.517, + "helm_lite/WMT 2014": 0.142, + "helm_mmlu/MMLU All Subjects": 0.599, + "helm_mmlu/Abstract Algebra": 0.27, + "helm_mmlu/Anatomy": 0.585, + "helm_mmlu/College Physics": 0.343, + "helm_mmlu/Computer Security": 0.7, + "helm_mmlu/Econometrics": 0.421, + "helm_mmlu/Global Facts": 0.33, + "helm_mmlu/Jurisprudence": 0.713, + "helm_mmlu/Philosophy": 0.659, + "helm_mmlu/Professional Psychology": 0.641, + "helm_mmlu/Us Foreign Policy": 0.79, + "helm_mmlu/Astronomy": 0.638, + "helm_mmlu/Business Ethics": 0.57, + "helm_mmlu/Clinical Knowledge": 0.687, + "helm_mmlu/Conceptual Physics": 0.549, + "helm_mmlu/Electrical Engineering": 0.572, + "helm_mmlu/Elementary Mathematics": 0.402, + "helm_mmlu/Formal Logic": 0.397, + "helm_mmlu/High School World History": 0.759, + "helm_mmlu/Human Sexuality": 0.702, + "helm_mmlu/International Law": 0.76, + "helm_mmlu/Logical Fallacies": 0.712, + "helm_mmlu/Machine Learning": 0.455, + "helm_mmlu/Management": 0.767, + "helm_mmlu/Marketing": 0.842, + "helm_mmlu/Medical Genetics": 0.75, + "helm_mmlu/Miscellaneous": 0.785, + "helm_mmlu/Moral Scenarios": 0.393, + "helm_mmlu/Nutrition": 0.676, + "helm_mmlu/Prehistory": 0.673, + "helm_mmlu/Public Relations": 0.636, + "helm_mmlu/Security Studies": 0.682, + "helm_mmlu/Sociology": 0.806, + "helm_mmlu/Virology": 0.47, + "helm_mmlu/World Religions": 0.825, + "helm_mmlu/Mean win rate": 0.509 + } + }, + { + "id": "mistralai/mistral-7b-v0.1", + "name": "Mistral v0.1 7B", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.292, + "helm_lite/NarrativeQA": 0.716, + "helm_lite/NaturalQuestions (closed-book)": 0.367, + "helm_lite/OpenbookQA": 0.776, + "helm_lite/MMLU": 0.584, + "helm_lite/MATH": 0.297, + "helm_lite/GSM8K": 0.377, + "helm_lite/LegalBench": 0.58, + "helm_lite/MedQA": 0.525, + "helm_lite/WMT 2014": 0.16, + "helm_mmlu/MMLU All Subjects": 0.566, + "helm_mmlu/Abstract Algebra": 0.25, + "helm_mmlu/Anatomy": 0.467, + "helm_mmlu/College Physics": 0.314, + "helm_mmlu/Computer Security": 0.69, + "helm_mmlu/Econometrics": 0.351, + "helm_mmlu/Global Facts": 0.29, + "helm_mmlu/Jurisprudence": 0.667, + "helm_mmlu/Philosophy": 0.63, + "helm_mmlu/Professional Psychology": 0.578, + "helm_mmlu/Us Foreign Policy": 0.79, + "helm_mmlu/Astronomy": 0.599, + "helm_mmlu/Business Ethics": 0.56, + "helm_mmlu/Clinical Knowledge": 0.653, + "helm_mmlu/Conceptual Physics": 0.451, + "helm_mmlu/Electrical Engineering": 0.538, + "helm_mmlu/Elementary Mathematics": 0.32, + "helm_mmlu/Formal Logic": 0.365, + "helm_mmlu/High School World History": 0.726, + "helm_mmlu/Human Sexuality": 0.702, + "helm_mmlu/International Law": 0.76, + "helm_mmlu/Logical Fallacies": 0.693, + "helm_mmlu/Machine Learning": 0.438, + "helm_mmlu/Management": 0.709, + "helm_mmlu/Marketing": 0.833, + "helm_mmlu/Medical Genetics": 0.68, + "helm_mmlu/Miscellaneous": 0.72, + "helm_mmlu/Moral Scenarios": 0.33, + "helm_mmlu/Nutrition": 0.657, + "helm_mmlu/Prehistory": 0.642, + "helm_mmlu/Public Relations": 0.6, + "helm_mmlu/Security Studies": 0.731, + "helm_mmlu/Sociology": 0.831, + "helm_mmlu/Virology": 0.44, + "helm_mmlu/World Religions": 0.789, + "helm_mmlu/Mean win rate": 0.213 + } + }, + { + "id": "mistralai/mistral-large-2402", + "name": "Mistral Large 2402", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.328, + "helm_lite/NarrativeQA": 0.454, + "helm_lite/NaturalQuestions (closed-book)": 0.311, + "helm_lite/OpenbookQA": 0.894, + "helm_lite/MMLU": 0.638, + "helm_lite/MATH": 0.75, + "helm_lite/GSM8K": 0.694, + "helm_lite/LegalBench": 0.479, + "helm_lite/MedQA": 0.499, + "helm_lite/WMT 2014": 0.182, + "helm_mmlu/MMLU All Subjects": 0.688, + "helm_mmlu/Abstract Algebra": 0.45, + "helm_mmlu/Anatomy": 0.674, + "helm_mmlu/College Physics": 0.373, + "helm_mmlu/Computer Security": 0.8, + "helm_mmlu/Econometrics": 0.64, + "helm_mmlu/Global Facts": 0.34, + "helm_mmlu/Jurisprudence": 0.815, + "helm_mmlu/Philosophy": 0.794, + "helm_mmlu/Professional Psychology": 0.809, + "helm_mmlu/Us Foreign Policy": 0.92, + "helm_mmlu/Astronomy": 0.842, + "helm_mmlu/Business Ethics": 0.67, + "helm_mmlu/Clinical Knowledge": 0.751, + "helm_mmlu/Conceptual Physics": 0.574, + "helm_mmlu/Electrical Engineering": 0.545, + "helm_mmlu/Elementary Mathematics": 0.508, + "helm_mmlu/Formal Logic": 0.532, + "helm_mmlu/High School World History": 0.886, + "helm_mmlu/Human Sexuality": 0.847, + "helm_mmlu/International Law": 0.868, + "helm_mmlu/Logical Fallacies": 0.81, + "helm_mmlu/Machine Learning": 0.562, + "helm_mmlu/Management": 0.854, + "helm_mmlu/Marketing": 0.897, + "helm_mmlu/Medical Genetics": 0.74, + "helm_mmlu/Miscellaneous": 0.9, + "helm_mmlu/Moral Scenarios": 0.579, + "helm_mmlu/Nutrition": 0.791, + "helm_mmlu/Prehistory": 0.904, + "helm_mmlu/Public Relations": 0.709, + "helm_mmlu/Security Studies": 0.824, + "helm_mmlu/Sociology": 0.93, + "helm_mmlu/Virology": 0.554, + "helm_mmlu/World Religions": 0.883, + "helm_mmlu/Mean win rate": 0.464 + } + }, + { + "id": "mistralai/mistral-large-2407", + "name": "Mistral Large 2 2407", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.744, + "helm_lite/NarrativeQA": 0.779, + "helm_lite/NaturalQuestions (closed-book)": 0.453, + "helm_lite/OpenbookQA": 0.932, + "helm_lite/MMLU": 0.725, + "helm_lite/MATH": 0.677, + "helm_lite/GSM8K": 0.912, + "helm_lite/LegalBench": 0.646, + "helm_lite/MedQA": 0.775, + "helm_lite/WMT 2014": 0.192, + "helm_mmlu/MMLU All Subjects": 0.8, + "helm_mmlu/Abstract Algebra": 0.7, + "helm_mmlu/Anatomy": 0.785, + "helm_mmlu/College Physics": 0.559, + "helm_mmlu/Computer Security": 0.81, + "helm_mmlu/Econometrics": 0.693, + "helm_mmlu/Global Facts": 0.56, + "helm_mmlu/Jurisprudence": 0.861, + "helm_mmlu/Philosophy": 0.826, + "helm_mmlu/Professional Psychology": 0.861, + "helm_mmlu/Us Foreign Policy": 0.9, + "helm_mmlu/Astronomy": 0.921, + "helm_mmlu/Business Ethics": 0.79, + "helm_mmlu/Clinical Knowledge": 0.864, + "helm_mmlu/Conceptual Physics": 0.864, + "helm_mmlu/Electrical Engineering": 0.793, + "helm_mmlu/Elementary Mathematics": 0.799, + "helm_mmlu/Formal Logic": 0.579, + "helm_mmlu/High School World History": 0.92, + "helm_mmlu/Human Sexuality": 0.924, + "helm_mmlu/International Law": 0.926, + "helm_mmlu/Logical Fallacies": 0.847, + "helm_mmlu/Machine Learning": 0.661, + "helm_mmlu/Management": 0.883, + "helm_mmlu/Marketing": 0.94, + "helm_mmlu/Medical Genetics": 0.9, + "helm_mmlu/Miscellaneous": 0.936, + "helm_mmlu/Moral Scenarios": 0.839, + "helm_mmlu/Nutrition": 0.827, + "helm_mmlu/Prehistory": 0.92, + "helm_mmlu/Public Relations": 0.764, + "helm_mmlu/Security Studies": 0.865, + "helm_mmlu/Sociology": 0.91, + "helm_mmlu/Virology": 0.59, + "helm_mmlu/World Religions": 0.865, + "helm_mmlu/Mean win rate": 0.24 + } + }, + { + "id": "mistralai/mistral-large-2411", + "name": "Mistral Large 2411", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.598, + "helm_capabilities/MMLU-Pro": 0.599, + "helm_capabilities/GPQA": 0.435, + "helm_capabilities/IFEval": 0.876, + "helm_capabilities/WildBench": 0.801, + "helm_capabilities/Omni-MATH": 0.281 + } + }, + { + "id": "mistralai/mistral-medium-2312", + "name": "Mistral Medium 2312", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.268, + "helm_lite/NarrativeQA": 0.449, + "helm_lite/NaturalQuestions (closed-book)": 0.29, + "helm_lite/OpenbookQA": 0.83, + "helm_lite/MMLU": 0.618, + "helm_lite/MATH": 0.565, + "helm_lite/GSM8K": 0.706, + "helm_lite/LegalBench": 0.452, + "helm_lite/MedQA": 0.61, + "helm_lite/WMT 2014": 0.169 + } + }, + { + "id": "mistralai/mistral-medium-3", + "name": "mistral-medium-3", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.5511, + "global-mmlu-lite/Culturally Sensitive": 0.5391, + "global-mmlu-lite/Culturally Agnostic": 0.5631, + "global-mmlu-lite/Arabic": 0.455, + "global-mmlu-lite/English": 0.38, + "global-mmlu-lite/Bengali": 0.5175, + "global-mmlu-lite/German": 0.4775, + "global-mmlu-lite/French": 0.41, + "global-mmlu-lite/Hindi": 0.555, + "global-mmlu-lite/Indonesian": 0.515, + "global-mmlu-lite/Italian": 0.535, + "global-mmlu-lite/Japanese": 0.58, + "global-mmlu-lite/Korean": 0.595, + "global-mmlu-lite/Portuguese": 0.5175, + "global-mmlu-lite/Spanish": 0.5375, + "global-mmlu-lite/Swahili": 0.7075, + "global-mmlu-lite/Yoruba": 0.7675, + "global-mmlu-lite/Chinese": 0.535, + "global-mmlu-lite/Burmese": 0.7325 + } + }, + { + "id": "mistralai/mistral-small-2402", + "name": "Mistral Small 2402", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.288, + "helm_lite/NarrativeQA": 0.519, + "helm_lite/NaturalQuestions (closed-book)": 0.304, + "helm_lite/OpenbookQA": 0.862, + "helm_lite/MMLU": 0.593, + "helm_lite/MATH": 0.621, + "helm_lite/GSM8K": 0.734, + "helm_lite/LegalBench": 0.389, + "helm_lite/MedQA": 0.616, + "helm_lite/WMT 2014": 0.169, + "helm_mmlu/MMLU All Subjects": 0.687, + "helm_mmlu/Abstract Algebra": 0.26, + "helm_mmlu/Anatomy": 0.674, + "helm_mmlu/College Physics": 0.402, + "helm_mmlu/Computer Security": 0.77, + "helm_mmlu/Econometrics": 0.614, + "helm_mmlu/Global Facts": 0.45, + "helm_mmlu/Jurisprudence": 0.833, + "helm_mmlu/Philosophy": 0.765, + "helm_mmlu/Professional Psychology": 0.768, + "helm_mmlu/Us Foreign Policy": 0.89, + "helm_mmlu/Astronomy": 0.77, + "helm_mmlu/Business Ethics": 0.71, + "helm_mmlu/Clinical Knowledge": 0.766, + "helm_mmlu/Conceptual Physics": 0.685, + "helm_mmlu/Electrical Engineering": 0.628, + "helm_mmlu/Elementary Mathematics": 0.415, + "helm_mmlu/Formal Logic": 0.516, + "helm_mmlu/High School World History": 0.857, + "helm_mmlu/Human Sexuality": 0.824, + "helm_mmlu/International Law": 0.826, + "helm_mmlu/Logical Fallacies": 0.804, + "helm_mmlu/Machine Learning": 0.562, + "helm_mmlu/Management": 0.786, + "helm_mmlu/Marketing": 0.906, + "helm_mmlu/Medical Genetics": 0.75, + "helm_mmlu/Miscellaneous": 0.844, + "helm_mmlu/Moral Scenarios": 0.575, + "helm_mmlu/Nutrition": 0.761, + "helm_mmlu/Prehistory": 0.802, + "helm_mmlu/Public Relations": 0.773, + "helm_mmlu/Security Studies": 0.788, + "helm_mmlu/Sociology": 0.871, + "helm_mmlu/Virology": 0.542, + "helm_mmlu/World Religions": 0.848, + "helm_mmlu/Mean win rate": 0.54 + } + }, + { + "id": "mistralai/mistral-small-2503", + "name": "mistral-small-2503", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.7852, + "global-mmlu-lite/Culturally Sensitive": 0.7537, + "global-mmlu-lite/Culturally Agnostic": 0.8166, + "global-mmlu-lite/Arabic": 0.7875, + "global-mmlu-lite/English": 0.8, + "global-mmlu-lite/Bengali": 0.7725, + "global-mmlu-lite/German": 0.7975, + "global-mmlu-lite/French": 0.8, + "global-mmlu-lite/Hindi": 0.795, + "global-mmlu-lite/Indonesian": 0.785, + "global-mmlu-lite/Italian": 0.805, + "global-mmlu-lite/Japanese": 0.77, + "global-mmlu-lite/Korean": 0.79, + "global-mmlu-lite/Portuguese": 0.7925, + "global-mmlu-lite/Spanish": 0.7825, + "global-mmlu-lite/Swahili": 0.775, + "global-mmlu-lite/Yoruba": 0.735, + "global-mmlu-lite/Chinese": 0.7925, + "global-mmlu-lite/Burmese": 0.7825, + "helm_capabilities/Mean score": 0.558, + "helm_capabilities/MMLU-Pro": 0.61, + "helm_capabilities/GPQA": 0.392, + "helm_capabilities/IFEval": 0.75, + "helm_capabilities/WildBench": 0.788, + "helm_capabilities/Omni-MATH": 0.248 + } + }, + { + "id": "mistralai/mixtral-8x22b", + "name": "Mixtral 8x22B", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.705, + "helm_lite/NarrativeQA": 0.779, + "helm_lite/NaturalQuestions (closed-book)": 0.478, + "helm_lite/OpenbookQA": 0.882, + "helm_lite/MMLU": 0.701, + "helm_lite/MATH": 0.656, + "helm_lite/GSM8K": 0.8, + "helm_lite/LegalBench": 0.708, + "helm_lite/MedQA": 0.704, + "helm_lite/WMT 2014": 0.209, + "helm_mmlu/MMLU All Subjects": 0.778, + "helm_mmlu/Abstract Algebra": 0.48, + "helm_mmlu/Anatomy": 0.741, + "helm_mmlu/College Physics": 0.569, + "helm_mmlu/Computer Security": 0.84, + "helm_mmlu/Econometrics": 0.667, + "helm_mmlu/Global Facts": 0.56, + "helm_mmlu/Jurisprudence": 0.852, + "helm_mmlu/Philosophy": 0.842, + "helm_mmlu/Professional Psychology": 0.845, + "helm_mmlu/Us Foreign Policy": 0.95, + "helm_mmlu/Astronomy": 0.882, + "helm_mmlu/Business Ethics": 0.74, + "helm_mmlu/Clinical Knowledge": 0.819, + "helm_mmlu/Conceptual Physics": 0.796, + "helm_mmlu/Electrical Engineering": 0.766, + "helm_mmlu/Elementary Mathematics": 0.622, + "helm_mmlu/Formal Logic": 0.627, + "helm_mmlu/High School World History": 0.895, + "helm_mmlu/Human Sexuality": 0.885, + "helm_mmlu/International Law": 0.917, + "helm_mmlu/Logical Fallacies": 0.877, + "helm_mmlu/Machine Learning": 0.661, + "helm_mmlu/Management": 0.883, + "helm_mmlu/Marketing": 0.915, + "helm_mmlu/Medical Genetics": 0.85, + "helm_mmlu/Miscellaneous": 0.899, + "helm_mmlu/Moral Scenarios": 0.646, + "helm_mmlu/Nutrition": 0.866, + "helm_mmlu/Prehistory": 0.87, + "helm_mmlu/Public Relations": 0.755, + "helm_mmlu/Security Studies": 0.865, + "helm_mmlu/Sociology": 0.92, + "helm_mmlu/Virology": 0.596, + "helm_mmlu/World Religions": 0.901, + "helm_mmlu/Mean win rate": 0.598 + } + }, + { + "id": "mistralai/mixtral-8x22b-instruct-v0.1", + "name": "Mixtral Instruct 8x22B", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.478, + "helm_capabilities/MMLU-Pro": 0.46, + "helm_capabilities/GPQA": 0.334, + "helm_capabilities/IFEval": 0.724, + "helm_capabilities/WildBench": 0.711, + "helm_capabilities/Omni-MATH": 0.163 + } + }, + { + "id": "mistralai/mixtral-8x7b-32kseqlen", + "name": "Mixtral 8x7B 32K seqlen", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.51, + "helm_lite/NarrativeQA": 0.767, + "helm_lite/NaturalQuestions (closed-book)": 0.427, + "helm_lite/OpenbookQA": 0.868, + "helm_lite/MMLU": 0.649, + "helm_lite/MATH": 0.494, + "helm_lite/GSM8K": 0.622, + "helm_lite/LegalBench": 0.63, + "helm_lite/MedQA": 0.652, + "helm_lite/WMT 2014": 0.19, + "helm_mmlu/MMLU All Subjects": 0.717, + "helm_mmlu/Abstract Algebra": 0.38, + "helm_mmlu/Anatomy": 0.696, + "helm_mmlu/College Physics": 0.51, + "helm_mmlu/Computer Security": 0.81, + "helm_mmlu/Econometrics": 0.605, + "helm_mmlu/Global Facts": 0.46, + "helm_mmlu/Jurisprudence": 0.833, + "helm_mmlu/Philosophy": 0.797, + "helm_mmlu/Professional Psychology": 0.779, + "helm_mmlu/Us Foreign Policy": 0.93, + "helm_mmlu/Astronomy": 0.829, + "helm_mmlu/Business Ethics": 0.72, + "helm_mmlu/Clinical Knowledge": 0.785, + "helm_mmlu/Conceptual Physics": 0.681, + "helm_mmlu/Electrical Engineering": 0.676, + "helm_mmlu/Elementary Mathematics": 0.476, + "helm_mmlu/Formal Logic": 0.532, + "helm_mmlu/High School World History": 0.886, + "helm_mmlu/Human Sexuality": 0.87, + "helm_mmlu/International Law": 0.86, + "helm_mmlu/Logical Fallacies": 0.767, + "helm_mmlu/Machine Learning": 0.509, + "helm_mmlu/Management": 0.845, + "helm_mmlu/Marketing": 0.923, + "helm_mmlu/Medical Genetics": 0.76, + "helm_mmlu/Miscellaneous": 0.881, + "helm_mmlu/Moral Scenarios": 0.444, + "helm_mmlu/Nutrition": 0.83, + "helm_mmlu/Prehistory": 0.849, + "helm_mmlu/Public Relations": 0.682, + "helm_mmlu/Security Studies": 0.792, + "helm_mmlu/Sociology": 0.871, + "helm_mmlu/Virology": 0.506, + "helm_mmlu/World Religions": 0.871, + "helm_mmlu/Mean win rate": 0.689 + } + }, + { + "id": "mistralai/mixtral-8x7b-instruct-v0.1", + "name": "Mixtral Instruct 8x7B", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.397, + "helm_capabilities/MMLU-Pro": 0.335, + "helm_capabilities/GPQA": 0.296, + "helm_capabilities/IFEval": 0.575, + "helm_capabilities/WildBench": 0.673, + "helm_capabilities/Omni-MATH": 0.105 + } + }, + { + "id": "mistralai/open-mistral-nemo-2407", + "name": "Mistral NeMo 2402", + "developer": "mistralai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.333, + "helm_lite/NarrativeQA": 0.731, + "helm_lite/NaturalQuestions (closed-book)": 0.265, + "helm_lite/OpenbookQA": 0.822, + "helm_lite/MMLU": 0.604, + "helm_lite/MATH": 0.668, + "helm_lite/GSM8K": 0.782, + "helm_lite/LegalBench": 0.415, + "helm_lite/MedQA": 0.59, + "helm_lite/WMT 2014": 0.177, + "helm_mmlu/MMLU All Subjects": 0.653, + "helm_mmlu/Abstract Algebra": 0.29, + "helm_mmlu/Anatomy": 0.607, + "helm_mmlu/College Physics": 0.373, + "helm_mmlu/Computer Security": 0.81, + "helm_mmlu/Econometrics": 0.561, + "helm_mmlu/Global Facts": 0.4, + "helm_mmlu/Jurisprudence": 0.796, + "helm_mmlu/Philosophy": 0.733, + "helm_mmlu/Professional Psychology": 0.588, + "helm_mmlu/Us Foreign Policy": 0.89, + "helm_mmlu/Astronomy": 0.691, + "helm_mmlu/Business Ethics": 0.49, + "helm_mmlu/Clinical Knowledge": 0.736, + "helm_mmlu/Conceptual Physics": 0.647, + "helm_mmlu/Electrical Engineering": 0.531, + "helm_mmlu/Elementary Mathematics": 0.439, + "helm_mmlu/Formal Logic": 0.405, + "helm_mmlu/High School World History": 0.848, + "helm_mmlu/Human Sexuality": 0.702, + "helm_mmlu/International Law": 0.769, + "helm_mmlu/Logical Fallacies": 0.791, + "helm_mmlu/Machine Learning": 0.402, + "helm_mmlu/Management": 0.796, + "helm_mmlu/Marketing": 0.889, + "helm_mmlu/Medical Genetics": 0.78, + "helm_mmlu/Miscellaneous": 0.861, + "helm_mmlu/Moral Scenarios": 0.381, + "helm_mmlu/Nutrition": 0.709, + "helm_mmlu/Prehistory": 0.765, + "helm_mmlu/Public Relations": 0.718, + "helm_mmlu/Security Studies": 0.771, + "helm_mmlu/Sociology": 0.726, + "helm_mmlu/Virology": 0.56, + "helm_mmlu/World Religions": 0.789, + "helm_mmlu/Mean win rate": 0.215 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mixtao.json b/data/developers/mixtao.json new file mode 100644 index 0000000000000000000000000000000000000000..0d228718cf1a13d82689d129c9ec3a15e10653bd --- /dev/null +++ b/data/developers/mixtao.json @@ -0,0 +1,19 @@ +{ + "developer": "mixtao", + "models": [ + { + "id": "mixtao/MixTAO-7Bx2-MoE-v8.1", + "name": "MixTAO-7Bx2-MoE-v8.1", + "developer": "mixtao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4162, + "hfopenllm_v2/BBH": 0.5189, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4463, + "hfopenllm_v2/MMLU-PRO": 0.3123 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mkurman.json b/data/developers/mkurman.json new file mode 100644 index 0000000000000000000000000000000000000000..12ddb403e528047f75f58de4389102db97a272fe --- /dev/null +++ b/data/developers/mkurman.json @@ -0,0 +1,47 @@ +{ + "developer": "mkurman", + "models": [ + { + "id": "mkurman/llama-3.2-MEDIT-3B-o1", + "name": "llama-3.2-MEDIT-3B-o1", + "developer": "mkurman", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4382, + "hfopenllm_v2/BBH": 0.44, + "hfopenllm_v2/MATH Level 5": 0.1307, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3565, + "hfopenllm_v2/MMLU-PRO": 0.2741 + } + }, + { + "id": "mkurman/phi-4-MedIT-11B-exp-1", + "name": "phi-4-MedIT-11B-exp-1", + "developer": "mkurman", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5948, + "hfopenllm_v2/BBH": 0.5414, + "hfopenllm_v2/MATH Level 5": 0.0899, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.3848, + "hfopenllm_v2/MMLU-PRO": 0.3825 + } + }, + { + "id": "mkurman/phi4-MedIT-10B-o1", + "name": "phi4-MedIT-10B-o1", + "developer": "mkurman", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3463, + "hfopenllm_v2/BBH": 0.5198, + "hfopenllm_v2/MATH Level 5": 0.1148, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.3968, + "hfopenllm_v2/MMLU-PRO": 0.3507 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mkxu.json b/data/developers/mkxu.json new file mode 100644 index 0000000000000000000000000000000000000000..9d6fd0d4ad640a4d5ba72a7f3963c78e0818c27c --- /dev/null +++ b/data/developers/mkxu.json @@ -0,0 +1,33 @@ +{ + "developer": "mkxu", + "models": [ + { + "id": "mkxu/llama-3-8b-instruct-fpo", + "name": "llama-3-8b-instruct-fpo", + "developer": "mkxu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.679, + "hfopenllm_v2/BBH": 0.4959, + "hfopenllm_v2/MATH Level 5": 0.0733, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3658, + "hfopenllm_v2/MMLU-PRO": 0.3605 + } + }, + { + "id": "mkxu/llama-3-8b-po1", + "name": "llama-3-8b-po1", + "developer": "mkxu", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4081, + "hfopenllm_v2/BBH": 0.4976, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.3804, + "hfopenllm_v2/MMLU-PRO": 0.3562 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mlabonne.json b/data/developers/mlabonne.json new file mode 100644 index 0000000000000000000000000000000000000000..be86bd7fe732025f133b06dc7c412aa5e56b7119 --- /dev/null +++ b/data/developers/mlabonne.json @@ -0,0 +1,201 @@ +{ + "developer": "mlabonne", + "models": [ + { + "id": "mlabonne/AlphaMonarch-7B", + "name": "AlphaMonarch-7B", + "developer": "mlabonne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4939, + "hfopenllm_v2/BBH": 0.4626, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.4121, + "hfopenllm_v2/MMLU-PRO": 0.2473 + } + }, + { + "id": "mlabonne/Beyonder-4x7B-v3", + "name": "Beyonder-4x7B-v3", + "developer": "mlabonne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5608, + "hfopenllm_v2/BBH": 0.4671, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4045, + "hfopenllm_v2/MMLU-PRO": 0.2512 + } + }, + { + "id": "mlabonne/BigQwen2.5-52B-Instruct", + "name": "BigQwen2.5-52B-Instruct", + "developer": "mlabonne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7913, + "hfopenllm_v2/BBH": 0.7121, + "hfopenllm_v2/MATH Level 5": 0.5476, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4113, + "hfopenllm_v2/MMLU-PRO": 0.5519 + } + }, + { + "id": "mlabonne/BigQwen2.5-Echo-47B-Instruct", + "name": "BigQwen2.5-Echo-47B-Instruct", + "developer": "mlabonne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7357, + "hfopenllm_v2/BBH": 0.6125, + "hfopenllm_v2/MATH Level 5": 0.4381, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4125, + "hfopenllm_v2/MMLU-PRO": 0.4734 + } + }, + { + "id": "mlabonne/ChimeraLlama-3-8B-v2", + "name": "ChimeraLlama-3-8B-v2", + "developer": "mlabonne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4469, + "hfopenllm_v2/BBH": 0.5046, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.3791, + "hfopenllm_v2/MMLU-PRO": 0.3569 + } + }, + { + "id": "mlabonne/ChimeraLlama-3-8B-v3", + "name": "ChimeraLlama-3-8B-v3", + "developer": "mlabonne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4408, + "hfopenllm_v2/BBH": 0.4978, + "hfopenllm_v2/MATH Level 5": 0.0884, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4004, + "hfopenllm_v2/MMLU-PRO": 0.3669 + } + }, + { + "id": "mlabonne/Daredevil-8B", + "name": "Daredevil-8B", + "developer": "mlabonne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4548, + "hfopenllm_v2/BBH": 0.5194, + "hfopenllm_v2/MATH Level 5": 0.1065, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.3939, + "hfopenllm_v2/MMLU-PRO": 0.3831 + } + }, + { + "id": "mlabonne/Daredevil-8B-abliterated", + "name": "Daredevil-8B-abliterated", + "developer": "mlabonne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4426, + "hfopenllm_v2/BBH": 0.4254, + "hfopenllm_v2/MATH Level 5": 0.0944, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.407, + "hfopenllm_v2/MMLU-PRO": 0.3701 + } + }, + { + "id": "mlabonne/Hermes-3-Llama-3.1-70B-lorablated", + "name": "Hermes-3-Llama-3.1-70B-lorablated", + "developer": "mlabonne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3424, + "hfopenllm_v2/BBH": 0.6693, + "hfopenllm_v2/MATH Level 5": 0.2243, + "hfopenllm_v2/GPQA": 0.3658, + "hfopenllm_v2/MUSR": 0.5029, + "hfopenllm_v2/MMLU-PRO": 0.4679 + } + }, + { + "id": "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated", + "name": "Meta-Llama-3.1-8B-Instruct-abliterated", + "developer": "mlabonne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7329, + "hfopenllm_v2/BBH": 0.4874, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3649, + "hfopenllm_v2/MMLU-PRO": 0.3503 + } + }, + { + "id": "mlabonne/NeuralBeagle14-7B", + "name": "NeuralBeagle14-7B", + "developer": "mlabonne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4935, + "hfopenllm_v2/BBH": 0.4628, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4319, + "hfopenllm_v2/MMLU-PRO": 0.2601 + } + }, + { + "id": "mlabonne/NeuralDaredevil-8B-abliterated", + "name": "NeuralDaredevil-8B-abliterated", + "developer": "mlabonne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4162, + "hfopenllm_v2/BBH": 0.5124, + "hfopenllm_v2/MATH Level 5": 0.0853, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.415, + "hfopenllm_v2/MMLU-PRO": 0.3802 + } + }, + { + "id": "mlabonne/OrpoLlama-3-8B", + "name": "OrpoLlama-3-8B", + "developer": "mlabonne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3653, + "hfopenllm_v2/BBH": 0.4424, + "hfopenllm_v2/MATH Level 5": 0.0559, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3579, + "hfopenllm_v2/MMLU-PRO": 0.2705 + } + }, + { + "id": "mlabonne/phixtral-2x2_8", + "name": "phixtral-2x2_8", + "developer": "mlabonne", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3431, + "hfopenllm_v2/BBH": 0.4889, + "hfopenllm_v2/MATH Level 5": 0.0355, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3644, + "hfopenllm_v2/MMLU-PRO": 0.2551 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mlx-community.json b/data/developers/mlx-community.json new file mode 100644 index 0000000000000000000000000000000000000000..cd8003f96a2e0f9e711b90ad5a115cb92e0bbb0d --- /dev/null +++ b/data/developers/mlx-community.json @@ -0,0 +1,33 @@ +{ + "developer": "mlx-community", + "models": [ + { + "id": "mlx-community/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32", + "name": "Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32", + "developer": "mlx-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3369, + "hfopenllm_v2/BBH": 0.3292, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3249, + "hfopenllm_v2/MMLU-PRO": 0.1638 + } + }, + { + "id": "mlx-community/Mistral-Small-24B-Instruct-2501-bf16", + "name": "Mistral-Small-24B-Instruct-2501-bf16", + "developer": "mlx-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6283, + "hfopenllm_v2/BBH": 0.6713, + "hfopenllm_v2/MATH Level 5": 0.3225, + "hfopenllm_v2/GPQA": 0.3951, + "hfopenllm_v2/MUSR": 0.4618, + "hfopenllm_v2/MMLU-PRO": 0.5395 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mmnga.json b/data/developers/mmnga.json new file mode 100644 index 0000000000000000000000000000000000000000..094ed0afe1c425dd37754bed393f751628d653d7 --- /dev/null +++ b/data/developers/mmnga.json @@ -0,0 +1,19 @@ +{ + "developer": "mmnga", + "models": [ + { + "id": "mmnga/Llama-3-70B-japanese-suzume-vector-v0.1", + "name": "Llama-3-70B-japanese-suzume-vector-v0.1", + "developer": "mmnga", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4649, + "hfopenllm_v2/BBH": 0.6542, + "hfopenllm_v2/MATH Level 5": 0.2326, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4141, + "hfopenllm_v2/MMLU-PRO": 0.5224 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mobiuslabsgmbh.json b/data/developers/mobiuslabsgmbh.json new file mode 100644 index 0000000000000000000000000000000000000000..e3829cad0ef3f0085f581959646b05ecd692b6b0 --- /dev/null +++ b/data/developers/mobiuslabsgmbh.json @@ -0,0 +1,33 @@ +{ + "developer": "mobiuslabsgmbh", + "models": [ + { + "id": "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Llama3-8B-v1.1", + "name": "DeepSeek-R1-ReDistill-Llama3-8B-v1.1", + "developer": "mobiuslabsgmbh", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3704, + "hfopenllm_v2/BBH": 0.3473, + "hfopenllm_v2/MATH Level 5": 0.3285, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3396, + "hfopenllm_v2/MMLU-PRO": 0.2198 + } + }, + { + "id": "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-7B-v1.1", + "name": "DeepSeek-R1-ReDistill-Qwen-7B-v1.1", + "developer": "mobiuslabsgmbh", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3473, + "hfopenllm_v2/BBH": 0.3698, + "hfopenllm_v2/MATH Level 5": 0.3497, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.4009, + "hfopenllm_v2/MMLU-PRO": 0.2326 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/moeru-ai.json b/data/developers/moeru-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..66cb52131be19f67ec9f4a03862bca0081d107c9 --- /dev/null +++ b/data/developers/moeru-ai.json @@ -0,0 +1,47 @@ +{ + "developer": "moeru-ai", + "models": [ + { + "id": "moeru-ai/L3.1-Moe-2x8B-v0.2", + "name": "L3.1-Moe-2x8B-v0.2", + "developer": "moeru-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7348, + "hfopenllm_v2/BBH": 0.5256, + "hfopenllm_v2/MATH Level 5": 0.1699, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4199, + "hfopenllm_v2/MMLU-PRO": 0.3858 + } + }, + { + "id": "moeru-ai/L3.1-Moe-4x8B-v0.1", + "name": "L3.1-Moe-4x8B-v0.1", + "developer": "moeru-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4332, + "hfopenllm_v2/BBH": 0.4939, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3609, + "hfopenllm_v2/MMLU-PRO": 0.3454 + } + }, + { + "id": "moeru-ai/L3.1-Moe-4x8B-v0.2", + "name": "L3.1-Moe-4x8B-v0.2", + "developer": "moeru-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5407, + "hfopenllm_v2/BBH": 0.4466, + "hfopenllm_v2/MATH Level 5": 0.1035, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3234, + "hfopenllm_v2/MMLU-PRO": 0.2763 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/monsterapi.json b/data/developers/monsterapi.json new file mode 100644 index 0000000000000000000000000000000000000000..9b178fa2ef49ae640e325a4534d297a7266911fc --- /dev/null +++ b/data/developers/monsterapi.json @@ -0,0 +1,33 @@ +{ + "developer": "monsterapi", + "models": [ + { + "id": "monsterapi/Llama-3_1-8B-Instruct-orca-ORPO", + "name": "Llama-3_1-8B-Instruct-orca-ORPO", + "developer": "monsterapi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2273, + "hfopenllm_v2/BBH": 0.2865, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.3445, + "hfopenllm_v2/MMLU-PRO": 0.1168 + } + }, + { + "id": "monsterapi/gemma-2-2b-LoRA-MonsterInstruct", + "name": "gemma-2-2b-LoRA-MonsterInstruct", + "developer": "monsterapi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3903, + "hfopenllm_v2/BBH": 0.365, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3644, + "hfopenllm_v2/MMLU-PRO": 0.1987 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/moonshot.json b/data/developers/moonshot.json new file mode 100644 index 0000000000000000000000000000000000000000..69a15692e44ee14f98c4ae22bc5403fe4fd7daa6 --- /dev/null +++ b/data/developers/moonshot.json @@ -0,0 +1,29 @@ +{ + "developer": "moonshot", + "models": [ + { + "id": "moonshot/Kimi K2 Thinking", + "name": "Kimi K2 Thinking", + "developer": "moonshot", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Overall Pass@1": 0.04, + "apex-agents/Overall Pass@8": 0.144, + "apex-agents/Overall Mean Score": 0.115, + "apex-agents/Investment Banking Pass@1": 0.012, + "apex-agents/Management Consulting Pass@1": 0.029, + "apex-agents/Corporate Law Pass@1": 0.08, + "apex-agents/Corporate Lawyer Mean Score": 0.223 + } + }, + { + "id": "moonshot/Kimi K2.5", + "name": "Kimi K2.5", + "developer": "moonshot", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Corporate Lawyer Mean Score": 0.402 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/moonshotai.json b/data/developers/moonshotai.json new file mode 100644 index 0000000000000000000000000000000000000000..d6798af4c3c1812aa14e6e42710330691d1c7ec5 --- /dev/null +++ b/data/developers/moonshotai.json @@ -0,0 +1,19 @@ +{ + "developer": "moonshotai", + "models": [ + { + "id": "moonshotai/kimi-k2-instruct", + "name": "Kimi K2 Instruct", + "developer": "moonshotai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.768, + "helm_capabilities/MMLU-Pro": 0.819, + "helm_capabilities/GPQA": 0.652, + "helm_capabilities/IFEval": 0.85, + "helm_capabilities/WildBench": 0.862, + "helm_capabilities/Omni-MATH": 0.654 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mosaicml.json b/data/developers/mosaicml.json new file mode 100644 index 0000000000000000000000000000000000000000..9a3d1c335058663d023ddb48811a646eee39f004 --- /dev/null +++ b/data/developers/mosaicml.json @@ -0,0 +1,65 @@ +{ + "developer": "mosaicml", + "models": [ + { + "id": "mosaicml/MPT-30B", + "name": "MPT 30B", + "developer": "mosaicml", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.714, + "helm_classic/MMLU": 0.437, + "helm_classic/BoolQ": 0.704, + "helm_classic/NarrativeQA": 0.732, + "helm_classic/NaturalQuestions (open-book)": 0.673, + "helm_classic/QuAC": 0.393, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.231, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.959, + "helm_classic/CivilComments": 0.599, + "helm_classic/RAFT": 0.723 + } + }, + { + "id": "mosaicml/MPT-Instruct-30B", + "name": "MPT-Instruct 30B", + "developer": "mosaicml", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.716, + "helm_classic/MMLU": 0.444, + "helm_classic/BoolQ": 0.85, + "helm_classic/NarrativeQA": 0.733, + "helm_classic/NaturalQuestions (open-book)": 0.697, + "helm_classic/QuAC": 0.327, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.234, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.956, + "helm_classic/CivilComments": 0.573, + "helm_classic/RAFT": 0.68 + } + }, + { + "id": "mosaicml/mpt-7b", + "name": "mpt-7b", + "developer": "mosaicml", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2152, + "hfopenllm_v2/BBH": 0.33, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3672, + "hfopenllm_v2/MMLU-PRO": 0.1206 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mosama.json b/data/developers/mosama.json new file mode 100644 index 0000000000000000000000000000000000000000..1256a4044b4b150c3257e24c8598003fb436e5bc --- /dev/null +++ b/data/developers/mosama.json @@ -0,0 +1,19 @@ +{ + "developer": "mosama", + "models": [ + { + "id": "mosama/Qwen2.5-1.5B-Instruct-CoT-Reflection", + "name": "Qwen2.5-1.5B-Instruct-CoT-Reflection", + "developer": "mosama", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.287, + "hfopenllm_v2/BBH": 0.4109, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3212, + "hfopenllm_v2/MMLU-PRO": 0.2651 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mrdayl.json b/data/developers/mrdayl.json new file mode 100644 index 0000000000000000000000000000000000000000..40195f7da75002a2f4c8fe34a6b972163d303c5a --- /dev/null +++ b/data/developers/mrdayl.json @@ -0,0 +1,75 @@ +{ + "developer": "mrdayl", + "models": [ + { + "id": "mrdayl/OpenCogito", + "name": "OpenCogito", + "developer": "mrdayl", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3934, + "hfopenllm_v2/BBH": 0.472, + "hfopenllm_v2/MATH Level 5": 0.2183, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.424, + "hfopenllm_v2/MMLU-PRO": 0.3452 + } + }, + { + "id": "mrdayl/OpenCognito", + "name": "OpenCognito", + "developer": "mrdayl", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4062, + "hfopenllm_v2/BBH": 0.4706, + "hfopenllm_v2/MATH Level 5": 0.2115, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4293, + "hfopenllm_v2/MMLU-PRO": 0.3443 + } + }, + { + "id": "mrdayl/OpenCognito-r1", + "name": "OpenCognito-r1", + "developer": "mrdayl", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4241, + "hfopenllm_v2/BBH": 0.4673, + "hfopenllm_v2/MATH Level 5": 0.1903, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4241, + "hfopenllm_v2/MMLU-PRO": 0.3475 + } + }, + { + "id": "mrdayl/OpenCognito-r2", + "name": "OpenCognito-r2", + "developer": "mrdayl", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3959, + "hfopenllm_v2/BBH": 0.4688, + "hfopenllm_v2/MATH Level 5": 0.2024, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4202, + "hfopenllm_v2/MMLU-PRO": 0.3462 + } + }, + { + "id": "mrdayl/OpenThink", + "name": "OpenThink", + "developer": "mrdayl", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2054, + "hfopenllm_v2/BBH": 0.346, + "hfopenllm_v2/MATH Level 5": 0.2885, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3289, + "hfopenllm_v2/MMLU-PRO": 0.185 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mrm8488.json b/data/developers/mrm8488.json new file mode 100644 index 0000000000000000000000000000000000000000..c81e9cb9e2a78f011fe0d3a763e255886a8a8105 --- /dev/null +++ b/data/developers/mrm8488.json @@ -0,0 +1,33 @@ +{ + "developer": "mrm8488", + "models": [ + { + "id": "mrm8488/phi-4-14B-grpo-gsm8k-3e", + "name": "phi-4-14B-grpo-gsm8k-3e", + "developer": "mrm8488", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6885, + "hfopenllm_v2/BBH": 0.6805, + "hfopenllm_v2/MATH Level 5": 0.4524, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.3994, + "hfopenllm_v2/MMLU-PRO": 0.5268 + } + }, + { + "id": "mrm8488/phi-4-14B-grpo-limo", + "name": "phi-4-14B-grpo-limo", + "developer": "mrm8488", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6812, + "hfopenllm_v2/BBH": 0.6785, + "hfopenllm_v2/MATH Level 5": 0.4569, + "hfopenllm_v2/GPQA": 0.3364, + "hfopenllm_v2/MUSR": 0.3981, + "hfopenllm_v2/MMLU-PRO": 0.5261 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/mukaj.json b/data/developers/mukaj.json new file mode 100644 index 0000000000000000000000000000000000000000..386d271068ea1cac3eb6da1ae9eed760f0975f1f --- /dev/null +++ b/data/developers/mukaj.json @@ -0,0 +1,19 @@ +{ + "developer": "mukaj", + "models": [ + { + "id": "mukaj/Llama-3.1-Hawkish-8B", + "name": "Llama-3.1-Hawkish-8B", + "developer": "mukaj", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.672, + "hfopenllm_v2/BBH": 0.4884, + "hfopenllm_v2/MATH Level 5": 0.2432, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3967, + "hfopenllm_v2/MMLU-PRO": 0.3331 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/my_model.json b/data/developers/my_model.json new file mode 100644 index 0000000000000000000000000000000000000000..5fd12b02258e615d31e894de4bd584d76311e1e7 --- /dev/null +++ b/data/developers/my_model.json @@ -0,0 +1,18 @@ +{ + "developer": "my_model", + "models": [ + { + "id": "my_model/", + "name": "my_model/", + "developer": "my_model", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5267, + "reward-bench/Chat": 0.4553, + "reward-bench/Chat Hard": 0.5592, + "reward-bench/Safety": 0.4392, + "reward-bench/Reasoning": 0.6532 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/natong19.json b/data/developers/natong19.json new file mode 100644 index 0000000000000000000000000000000000000000..ea57a03181f4077ece6207733989872ede378a46 --- /dev/null +++ b/data/developers/natong19.json @@ -0,0 +1,33 @@ +{ + "developer": "natong19", + "models": [ + { + "id": "natong19/Mistral-Nemo-Instruct-2407-abliterated", + "name": "Mistral-Nemo-Instruct-2407-abliterated", + "developer": "natong19", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6392, + "hfopenllm_v2/BBH": 0.5048, + "hfopenllm_v2/MATH Level 5": 0.1322, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4033, + "hfopenllm_v2/MMLU-PRO": 0.3518 + } + }, + { + "id": "natong19/Qwen2-7B-Instruct-abliterated", + "name": "Qwen2-7B-Instruct-abliterated", + "developer": "natong19", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5837, + "hfopenllm_v2/BBH": 0.5553, + "hfopenllm_v2/MATH Level 5": 0.2764, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4034, + "hfopenllm_v2/MMLU-PRO": 0.3842 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/nazimali.json b/data/developers/nazimali.json new file mode 100644 index 0000000000000000000000000000000000000000..34d47c9647d462b17a0fe6015c5c4f9fc00264e7 --- /dev/null +++ b/data/developers/nazimali.json @@ -0,0 +1,33 @@ +{ + "developer": "nazimali", + "models": [ + { + "id": "nazimali/Mistral-Nemo-Kurdish", + "name": "Mistral-Nemo-Kurdish", + "developer": "nazimali", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3401, + "hfopenllm_v2/BBH": 0.5133, + "hfopenllm_v2/MATH Level 5": 0.0959, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4116, + "hfopenllm_v2/MMLU-PRO": 0.3235 + } + }, + { + "id": "nazimali/Mistral-Nemo-Kurdish-Instruct", + "name": "Mistral-Nemo-Kurdish-Instruct", + "developer": "nazimali", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.486, + "hfopenllm_v2/BBH": 0.4721, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4006, + "hfopenllm_v2/MMLU-PRO": 0.3087 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/nbeerbower.json b/data/developers/nbeerbower.json new file mode 100644 index 0000000000000000000000000000000000000000..2dd18221ea70d73c59a52f62543c0258c0648479 --- /dev/null +++ b/data/developers/nbeerbower.json @@ -0,0 +1,719 @@ +{ + "developer": "nbeerbower", + "models": [ + { + "id": "nbeerbower/BigKartoffel-mistral-nemo-20B", + "name": "BigKartoffel-mistral-nemo-20B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5857, + "hfopenllm_v2/BBH": 0.5515, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.428, + "hfopenllm_v2/MMLU-PRO": 0.353 + } + }, + { + "id": "nbeerbower/DoppelKartoffel-Mistral-Nemo-23B", + "name": "DoppelKartoffel-Mistral-Nemo-23B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5191, + "hfopenllm_v2/BBH": 0.5218, + "hfopenllm_v2/MATH Level 5": 0.031, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3795, + "hfopenllm_v2/MMLU-PRO": 0.308 + } + }, + { + "id": "nbeerbower/DoublePotato-Mistral-Nemo-13B", + "name": "DoublePotato-Mistral-Nemo-13B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6796, + "hfopenllm_v2/BBH": 0.5438, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.46, + "hfopenllm_v2/MMLU-PRO": 0.3596 + } + }, + { + "id": "nbeerbower/Dumpling-Qwen2.5-1.5B", + "name": "Dumpling-Qwen2.5-1.5B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3699, + "hfopenllm_v2/BBH": 0.416, + "hfopenllm_v2/MATH Level 5": 0.1171, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3728, + "hfopenllm_v2/MMLU-PRO": 0.2772 + } + }, + { + "id": "nbeerbower/Dumpling-Qwen2.5-14B", + "name": "Dumpling-Qwen2.5-14B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6064, + "hfopenllm_v2/BBH": 0.6451, + "hfopenllm_v2/MATH Level 5": 0.3097, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4354, + "hfopenllm_v2/MMLU-PRO": 0.517 + } + }, + { + "id": "nbeerbower/Dumpling-Qwen2.5-7B-1k-r16", + "name": "Dumpling-Qwen2.5-7B-1k-r16", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.486, + "hfopenllm_v2/BBH": 0.5214, + "hfopenllm_v2/MATH Level 5": 0.2364, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.423, + "hfopenllm_v2/MMLU-PRO": 0.3959 + } + }, + { + "id": "nbeerbower/Dumpling-Qwen2.5-7B-1k-r64-2e-5", + "name": "Dumpling-Qwen2.5-7B-1k-r64-2e-5", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4179, + "hfopenllm_v2/BBH": 0.5301, + "hfopenllm_v2/MATH Level 5": 0.2115, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.4486, + "hfopenllm_v2/MMLU-PRO": 0.4122 + } + }, + { + "id": "nbeerbower/EVA-abliterated-TIES-Qwen2.5-1.5B", + "name": "EVA-abliterated-TIES-Qwen2.5-1.5B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4115, + "hfopenllm_v2/BBH": 0.3997, + "hfopenllm_v2/MATH Level 5": 0.1375, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3502, + "hfopenllm_v2/MMLU-PRO": 0.2712 + } + }, + { + "id": "nbeerbower/EVA-abliterated-TIES-Qwen2.5-14B", + "name": "EVA-abliterated-TIES-Qwen2.5-14B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7836, + "hfopenllm_v2/BBH": 0.6372, + "hfopenllm_v2/MATH Level 5": 0.5045, + "hfopenllm_v2/GPQA": 0.3549, + "hfopenllm_v2/MUSR": 0.4407, + "hfopenllm_v2/MMLU-PRO": 0.5211 + } + }, + { + "id": "nbeerbower/Flammades-Mistral-Nemo-12B", + "name": "Flammades-Mistral-Nemo-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3842, + "hfopenllm_v2/BBH": 0.53, + "hfopenllm_v2/MATH Level 5": 0.0755, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4806, + "hfopenllm_v2/MMLU-PRO": 0.3661 + } + }, + { + "id": "nbeerbower/Gemma2-Gutenberg-Doppel-9B", + "name": "Gemma2-Gutenberg-Doppel-9B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7171, + "hfopenllm_v2/BBH": 0.587, + "hfopenllm_v2/MATH Level 5": 0.1979, + "hfopenllm_v2/GPQA": 0.3297, + "hfopenllm_v2/MUSR": 0.4608, + "hfopenllm_v2/MMLU-PRO": 0.4127 + } + }, + { + "id": "nbeerbower/Gutensuppe-mistral-nemo-12B", + "name": "Gutensuppe-mistral-nemo-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2916, + "hfopenllm_v2/BBH": 0.5487, + "hfopenllm_v2/MATH Level 5": 0.1329, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.429, + "hfopenllm_v2/MMLU-PRO": 0.368 + } + }, + { + "id": "nbeerbower/Hermes2-Gutenberg2-Mistral-7B", + "name": "Hermes2-Gutenberg2-Mistral-7B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3721, + "hfopenllm_v2/BBH": 0.4981, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4623, + "hfopenllm_v2/MMLU-PRO": 0.2993 + } + }, + { + "id": "nbeerbower/Kartoffel-Deepfry-12B", + "name": "Kartoffel-Deepfry-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5022, + "hfopenllm_v2/BBH": 0.5365, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4792, + "hfopenllm_v2/MMLU-PRO": 0.3582 + } + }, + { + "id": "nbeerbower/Llama-3.1-Nemotron-lorablated-70B", + "name": "Llama-3.1-Nemotron-lorablated-70B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7229, + "hfopenllm_v2/BBH": 0.6825, + "hfopenllm_v2/MATH Level 5": 0.3338, + "hfopenllm_v2/GPQA": 0.3909, + "hfopenllm_v2/MUSR": 0.4682, + "hfopenllm_v2/MMLU-PRO": 0.5343 + } + }, + { + "id": "nbeerbower/Llama3.1-Gutenberg-Doppel-70B", + "name": "Llama3.1-Gutenberg-Doppel-70B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7092, + "hfopenllm_v2/BBH": 0.6661, + "hfopenllm_v2/MATH Level 5": 0.2122, + "hfopenllm_v2/GPQA": 0.3448, + "hfopenllm_v2/MUSR": 0.4897, + "hfopenllm_v2/MMLU-PRO": 0.4737 + } + }, + { + "id": "nbeerbower/Lyra-Gutenberg-mistral-nemo-12B", + "name": "Lyra-Gutenberg-mistral-nemo-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3495, + "hfopenllm_v2/BBH": 0.5586, + "hfopenllm_v2/MATH Level 5": 0.1012, + "hfopenllm_v2/GPQA": 0.3339, + "hfopenllm_v2/MUSR": 0.4357, + "hfopenllm_v2/MMLU-PRO": 0.3628 + } + }, + { + "id": "nbeerbower/Lyra4-Gutenberg-12B", + "name": "Lyra4-Gutenberg-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2212, + "hfopenllm_v2/BBH": 0.5387, + "hfopenllm_v2/MATH Level 5": 0.1299, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4038, + "hfopenllm_v2/MMLU-PRO": 0.3571 + } + }, + { + "id": "nbeerbower/Lyra4-Gutenberg2-12B", + "name": "Lyra4-Gutenberg2-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2585, + "hfopenllm_v2/BBH": 0.5345, + "hfopenllm_v2/MATH Level 5": 0.1171, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.3972, + "hfopenllm_v2/MMLU-PRO": 0.3565 + } + }, + { + "id": "nbeerbower/Mahou-1.5-mistral-nemo-12B-lorablated", + "name": "Mahou-1.5-mistral-nemo-12B-lorablated", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6825, + "hfopenllm_v2/BBH": 0.5496, + "hfopenllm_v2/MATH Level 5": 0.0891, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.4522, + "hfopenllm_v2/MMLU-PRO": 0.3574 + } + }, + { + "id": "nbeerbower/Mistral-Gutenberg-Doppel-7B-FFT", + "name": "Mistral-Gutenberg-Doppel-7B-FFT", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5717, + "hfopenllm_v2/BBH": 0.4076, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4059, + "hfopenllm_v2/MMLU-PRO": 0.2729 + } + }, + { + "id": "nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B", + "name": "Mistral-Nemo-Gutenberg-Doppel-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3567, + "hfopenllm_v2/BBH": 0.5275, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4132, + "hfopenllm_v2/MMLU-PRO": 0.3579 + } + }, + { + "id": "nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B-v2", + "name": "Mistral-Nemo-Gutenberg-Doppel-12B-v2", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6536, + "hfopenllm_v2/BBH": 0.5374, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.4233, + "hfopenllm_v2/MMLU-PRO": 0.3546 + } + }, + { + "id": "nbeerbower/Mistral-Nemo-Moderne-12B-FFT-experimental", + "name": "Mistral-Nemo-Moderne-12B-FFT-experimental", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3352, + "hfopenllm_v2/BBH": 0.5234, + "hfopenllm_v2/MATH Level 5": 0.077, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3715, + "hfopenllm_v2/MMLU-PRO": 0.3455 + } + }, + { + "id": "nbeerbower/Mistral-Nemo-Prism-12B", + "name": "Mistral-Nemo-Prism-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6858, + "hfopenllm_v2/BBH": 0.5475, + "hfopenllm_v2/MATH Level 5": 0.0869, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4626, + "hfopenllm_v2/MMLU-PRO": 0.3581 + } + }, + { + "id": "nbeerbower/Mistral-Nemo-Prism-12B-v2", + "name": "Mistral-Nemo-Prism-12B-v2", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6974, + "hfopenllm_v2/BBH": 0.5492, + "hfopenllm_v2/MATH Level 5": 0.0891, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.46, + "hfopenllm_v2/MMLU-PRO": 0.3567 + } + }, + { + "id": "nbeerbower/Mistral-Nemo-Prism-12B-v7", + "name": "Mistral-Nemo-Prism-12B-v7", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6962, + "hfopenllm_v2/BBH": 0.5521, + "hfopenllm_v2/MATH Level 5": 0.0869, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4639, + "hfopenllm_v2/MMLU-PRO": 0.359 + } + }, + { + "id": "nbeerbower/Mistral-Small-Drummer-22B", + "name": "Mistral-Small-Drummer-22B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6331, + "hfopenllm_v2/BBH": 0.5793, + "hfopenllm_v2/MATH Level 5": 0.1888, + "hfopenllm_v2/GPQA": 0.3431, + "hfopenllm_v2/MUSR": 0.4064, + "hfopenllm_v2/MMLU-PRO": 0.4095 + } + }, + { + "id": "nbeerbower/Mistral-Small-Gutenberg-Doppel-22B", + "name": "Mistral-Small-Gutenberg-Doppel-22B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4893, + "hfopenllm_v2/BBH": 0.5859, + "hfopenllm_v2/MATH Level 5": 0.2183, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.3971, + "hfopenllm_v2/MMLU-PRO": 0.4124 + } + }, + { + "id": "nbeerbower/Nemo-Loony-12B-experimental", + "name": "Nemo-Loony-12B-experimental", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3734, + "hfopenllm_v2/BBH": 0.3822, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3341, + "hfopenllm_v2/MMLU-PRO": 0.1589 + } + }, + { + "id": "nbeerbower/Nemoties-ChatML-12B", + "name": "Nemoties-ChatML-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6382, + "hfopenllm_v2/BBH": 0.547, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4509, + "hfopenllm_v2/MMLU-PRO": 0.3551 + } + }, + { + "id": "nbeerbower/Qwen2.5-Gutenberg-Doppel-14B", + "name": "Qwen2.5-Gutenberg-Doppel-14B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8091, + "hfopenllm_v2/BBH": 0.6382, + "hfopenllm_v2/MATH Level 5": 0.5415, + "hfopenllm_v2/GPQA": 0.3331, + "hfopenllm_v2/MUSR": 0.4101, + "hfopenllm_v2/MMLU-PRO": 0.4921 + } + }, + { + "id": "nbeerbower/SmolNemo-12B-FFT-experimental", + "name": "SmolNemo-12B-FFT-experimental", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3348, + "hfopenllm_v2/BBH": 0.3336, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3847, + "hfopenllm_v2/MMLU-PRO": 0.1217 + } + }, + { + "id": "nbeerbower/Stella-mistral-nemo-12B-v2", + "name": "Stella-mistral-nemo-12B-v2", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3274, + "hfopenllm_v2/BBH": 0.5484, + "hfopenllm_v2/MATH Level 5": 0.1163, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.4304, + "hfopenllm_v2/MMLU-PRO": 0.3684 + } + }, + { + "id": "nbeerbower/gemma2-gutenberg-27B", + "name": "gemma2-gutenberg-27B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2947, + "hfopenllm_v2/BBH": 0.3797, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3727, + "hfopenllm_v2/MMLU-PRO": 0.1982 + } + }, + { + "id": "nbeerbower/gemma2-gutenberg-9B", + "name": "gemma2-gutenberg-9B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2796, + "hfopenllm_v2/BBH": 0.5951, + "hfopenllm_v2/MATH Level 5": 0.0808, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4595, + "hfopenllm_v2/MMLU-PRO": 0.4192 + } + }, + { + "id": "nbeerbower/llama-3-gutenberg-8B", + "name": "llama-3-gutenberg-8B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4372, + "hfopenllm_v2/BBH": 0.4994, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4073, + "hfopenllm_v2/MMLU-PRO": 0.3831 + } + }, + { + "id": "nbeerbower/llama3.1-cc-8B", + "name": "llama3.1-cc-8B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5068, + "hfopenllm_v2/BBH": 0.4871, + "hfopenllm_v2/MATH Level 5": 0.071, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.3885, + "hfopenllm_v2/MMLU-PRO": 0.3347 + } + }, + { + "id": "nbeerbower/llama3.1-kartoffeldes-70B", + "name": "llama3.1-kartoffeldes-70B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.823, + "hfopenllm_v2/BBH": 0.6894, + "hfopenllm_v2/MATH Level 5": 0.3218, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4646, + "hfopenllm_v2/MMLU-PRO": 0.4988 + } + }, + { + "id": "nbeerbower/mistral-nemo-bophades-12B", + "name": "mistral-nemo-bophades-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6794, + "hfopenllm_v2/BBH": 0.4988, + "hfopenllm_v2/MATH Level 5": 0.1231, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4178, + "hfopenllm_v2/MMLU-PRO": 0.3501 + } + }, + { + "id": "nbeerbower/mistral-nemo-bophades3-12B", + "name": "mistral-nemo-bophades3-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6578, + "hfopenllm_v2/BBH": 0.5449, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4604, + "hfopenllm_v2/MMLU-PRO": 0.3371 + } + }, + { + "id": "nbeerbower/mistral-nemo-cc-12B", + "name": "mistral-nemo-cc-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1435, + "hfopenllm_v2/BBH": 0.5399, + "hfopenllm_v2/MATH Level 5": 0.0257, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.4424, + "hfopenllm_v2/MMLU-PRO": 0.3598 + } + }, + { + "id": "nbeerbower/mistral-nemo-gutades-12B", + "name": "mistral-nemo-gutades-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3425, + "hfopenllm_v2/BBH": 0.5407, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.404, + "hfopenllm_v2/MMLU-PRO": 0.3561 + } + }, + { + "id": "nbeerbower/mistral-nemo-gutenberg-12B", + "name": "mistral-nemo-gutenberg-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3504, + "hfopenllm_v2/BBH": 0.5281, + "hfopenllm_v2/MATH Level 5": 0.1163, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4171, + "hfopenllm_v2/MMLU-PRO": 0.3562 + } + }, + { + "id": "nbeerbower/mistral-nemo-gutenberg-12B-v2", + "name": "mistral-nemo-gutenberg-12B-v2", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6203, + "hfopenllm_v2/BBH": 0.5397, + "hfopenllm_v2/MATH Level 5": 0.1088, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4287, + "hfopenllm_v2/MMLU-PRO": 0.3499 + } + }, + { + "id": "nbeerbower/mistral-nemo-gutenberg-12B-v3", + "name": "mistral-nemo-gutenberg-12B-v3", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2183, + "hfopenllm_v2/BBH": 0.5441, + "hfopenllm_v2/MATH Level 5": 0.0597, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.445, + "hfopenllm_v2/MMLU-PRO": 0.3644 + } + }, + { + "id": "nbeerbower/mistral-nemo-gutenberg-12B-v4", + "name": "mistral-nemo-gutenberg-12B-v4", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2379, + "hfopenllm_v2/BBH": 0.5269, + "hfopenllm_v2/MATH Level 5": 0.1261, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4104, + "hfopenllm_v2/MMLU-PRO": 0.3575 + } + }, + { + "id": "nbeerbower/mistral-nemo-gutenberg2-12B-test", + "name": "mistral-nemo-gutenberg2-12B-test", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3385, + "hfopenllm_v2/BBH": 0.5255, + "hfopenllm_v2/MATH Level 5": 0.1163, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4157, + "hfopenllm_v2/MMLU-PRO": 0.3555 + } + }, + { + "id": "nbeerbower/mistral-nemo-kartoffel-12B", + "name": "mistral-nemo-kartoffel-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7032, + "hfopenllm_v2/BBH": 0.5484, + "hfopenllm_v2/MATH Level 5": 0.0853, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4653, + "hfopenllm_v2/MMLU-PRO": 0.3585 + } + }, + { + "id": "nbeerbower/mistral-nemo-narwhal-12B", + "name": "mistral-nemo-narwhal-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5549, + "hfopenllm_v2/BBH": 0.5057, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3847, + "hfopenllm_v2/MMLU-PRO": 0.3483 + } + }, + { + "id": "nbeerbower/mistral-nemo-wissenschaft-12B", + "name": "mistral-nemo-wissenschaft-12B", + "developer": "nbeerbower", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.652, + "hfopenllm_v2/BBH": 0.504, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4178, + "hfopenllm_v2/MMLU-PRO": 0.3532 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/nbrahme.json b/data/developers/nbrahme.json new file mode 100644 index 0000000000000000000000000000000000000000..5b66226217bb36661e02e5e8f01691e3aae8a17f --- /dev/null +++ b/data/developers/nbrahme.json @@ -0,0 +1,19 @@ +{ + "developer": "nbrahme", + "models": [ + { + "id": "nbrahme/IndusQ", + "name": "IndusQ", + "developer": "nbrahme", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.244, + "hfopenllm_v2/BBH": 0.3062, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3366, + "hfopenllm_v2/MMLU-PRO": 0.112 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/necva.json b/data/developers/necva.json new file mode 100644 index 0000000000000000000000000000000000000000..edc80018436dc75b2113f6a9422f4bd9c8eb566b --- /dev/null +++ b/data/developers/necva.json @@ -0,0 +1,33 @@ +{ + "developer": "necva", + "models": [ + { + "id": "necva/IE-cont-Llama3.1-8B", + "name": "IE-cont-Llama3.1-8B", + "developer": "necva", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2049, + "hfopenllm_v2/BBH": 0.2912, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3575, + "hfopenllm_v2/MMLU-PRO": 0.1167 + } + }, + { + "id": "necva/replica-IEPile", + "name": "replica-IEPile", + "developer": "necva", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4678, + "hfopenllm_v2/BBH": 0.4779, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.3998, + "hfopenllm_v2/MMLU-PRO": 0.3561 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/neopolita.json b/data/developers/neopolita.json new file mode 100644 index 0000000000000000000000000000000000000000..f2d08b45e69605ff9b5860f25d070f5c01699763 --- /dev/null +++ b/data/developers/neopolita.json @@ -0,0 +1,159 @@ +{ + "developer": "neopolita", + "models": [ + { + "id": "neopolita/jessi-v0.1-bf16-falcon3-7b-instruct", + "name": "jessi-v0.1-bf16-falcon3-7b-instruct", + "developer": "neopolita", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7527, + "hfopenllm_v2/BBH": 0.5516, + "hfopenllm_v2/MATH Level 5": 0.3807, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4825, + "hfopenllm_v2/MMLU-PRO": 0.3924 + } + }, + { + "id": "neopolita/jessi-v0.1-falcon3-10b-instruct", + "name": "jessi-v0.1-falcon3-10b-instruct", + "developer": "neopolita", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7552, + "hfopenllm_v2/BBH": 0.5953, + "hfopenllm_v2/MATH Level 5": 0.2002, + "hfopenllm_v2/GPQA": 0.3188, + "hfopenllm_v2/MUSR": 0.4279, + "hfopenllm_v2/MMLU-PRO": 0.4188 + } + }, + { + "id": "neopolita/jessi-v0.1-qwen2.5-7b-instruct", + "name": "jessi-v0.1-qwen2.5-7b-instruct", + "developer": "neopolita", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7327, + "hfopenllm_v2/BBH": 0.5292, + "hfopenllm_v2/MATH Level 5": 0.4086, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.3914, + "hfopenllm_v2/MMLU-PRO": 0.4228 + } + }, + { + "id": "neopolita/jessi-v0.1-virtuoso-small", + "name": "jessi-v0.1-virtuoso-small", + "developer": "neopolita", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7959, + "hfopenllm_v2/BBH": 0.6443, + "hfopenllm_v2/MATH Level 5": 0.3399, + "hfopenllm_v2/GPQA": 0.3305, + "hfopenllm_v2/MUSR": 0.4362, + "hfopenllm_v2/MMLU-PRO": 0.513 + } + }, + { + "id": "neopolita/jessi-v0.2-falcon3-10b-instruct", + "name": "jessi-v0.2-falcon3-10b-instruct", + "developer": "neopolita", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7768, + "hfopenllm_v2/BBH": 0.6205, + "hfopenllm_v2/MATH Level 5": 0.2122, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4281, + "hfopenllm_v2/MMLU-PRO": 0.4354 + } + }, + { + "id": "neopolita/jessi-v0.2-falcon3-7b-instruct", + "name": "jessi-v0.2-falcon3-7b-instruct", + "developer": "neopolita", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5771, + "hfopenllm_v2/BBH": 0.5363, + "hfopenllm_v2/MATH Level 5": 0.2538, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4479, + "hfopenllm_v2/MMLU-PRO": 0.3905 + } + }, + { + "id": "neopolita/jessi-v0.3-falcon3-7b-instruct", + "name": "jessi-v0.3-falcon3-7b-instruct", + "developer": "neopolita", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7509, + "hfopenllm_v2/BBH": 0.5388, + "hfopenllm_v2/MATH Level 5": 0.1888, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4692, + "hfopenllm_v2/MMLU-PRO": 0.397 + } + }, + { + "id": "neopolita/jessi-v0.4-falcon3-7b-instruct", + "name": "jessi-v0.4-falcon3-7b-instruct", + "developer": "neopolita", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7604, + "hfopenllm_v2/BBH": 0.5522, + "hfopenllm_v2/MATH Level 5": 0.3769, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4971, + "hfopenllm_v2/MMLU-PRO": 0.4004 + } + }, + { + "id": "neopolita/jessi-v0.5-falcon3-7b-instruct", + "name": "jessi-v0.5-falcon3-7b-instruct", + "developer": "neopolita", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7412, + "hfopenllm_v2/BBH": 0.559, + "hfopenllm_v2/MATH Level 5": 0.3739, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4865, + "hfopenllm_v2/MMLU-PRO": 0.3966 + } + }, + { + "id": "neopolita/jessi-v0.6-falcon3-7b-instruct", + "name": "jessi-v0.6-falcon3-7b-instruct", + "developer": "neopolita", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7402, + "hfopenllm_v2/BBH": 0.5509, + "hfopenllm_v2/MATH Level 5": 0.3565, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4904, + "hfopenllm_v2/MMLU-PRO": 0.3957 + } + }, + { + "id": "neopolita/loki-v0.1-virtuoso", + "name": "loki-v0.1-virtuoso", + "developer": "neopolita", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7819, + "hfopenllm_v2/BBH": 0.6467, + "hfopenllm_v2/MATH Level 5": 0.3391, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4375, + "hfopenllm_v2/MMLU-PRO": 0.5129 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/netcat420.json b/data/developers/netcat420.json new file mode 100644 index 0000000000000000000000000000000000000000..0f76c4d0b164b25ad7b3d09e6f14a102bc37c823 --- /dev/null +++ b/data/developers/netcat420.json @@ -0,0 +1,677 @@ +{ + "developer": "netcat420", + "models": [ + { + "id": "netcat420/DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b", + "name": "DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.115, + "hfopenllm_v2/BBH": 0.2877, + "hfopenllm_v2/MATH Level 5": 0.0015, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3724, + "hfopenllm_v2/MMLU-PRO": 0.109 + } + }, + { + "id": "netcat420/DeepSeek-R1-MFANN-TIES-unretrained-7b", + "name": "DeepSeek-R1-MFANN-TIES-unretrained-7b", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2587, + "hfopenllm_v2/BBH": 0.3086, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.255, + "hfopenllm_v2/MUSR": 0.3527, + "hfopenllm_v2/MMLU-PRO": 0.1145 + } + }, + { + "id": "netcat420/Llama3.1-MFANN-8b", + "name": "Llama3.1-MFANN-8b", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.297, + "hfopenllm_v2/BBH": 0.4281, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3379, + "hfopenllm_v2/MMLU-PRO": 0.2725 + } + }, + { + "id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V2", + "name": "MFANN-Llama3.1-Abliterated-SLERP-TIES-V2", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.421, + "hfopenllm_v2/BBH": 0.4924, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.3728, + "hfopenllm_v2/MMLU-PRO": 0.3522 + } + }, + { + "id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V3", + "name": "MFANN-Llama3.1-Abliterated-SLERP-TIES-V3", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4238, + "hfopenllm_v2/BBH": 0.4914, + "hfopenllm_v2/MATH Level 5": 0.0755, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.3741, + "hfopenllm_v2/MMLU-PRO": 0.349 + } + }, + { + "id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-V4", + "name": "MFANN-Llama3.1-Abliterated-SLERP-V4", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4169, + "hfopenllm_v2/BBH": 0.4909, + "hfopenllm_v2/MATH Level 5": 0.068, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.3821, + "hfopenllm_v2/MMLU-PRO": 0.3516 + } + }, + { + "id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-V5", + "name": "MFANN-Llama3.1-Abliterated-SLERP-V5", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4329, + "hfopenllm_v2/BBH": 0.4952, + "hfopenllm_v2/MATH Level 5": 0.0816, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.3781, + "hfopenllm_v2/MMLU-PRO": 0.3445 + } + }, + { + "id": "netcat420/MFANN-Llama3.1-Abliterated-Slerp-TIES", + "name": "MFANN-Llama3.1-Abliterated-Slerp-TIES", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4293, + "hfopenllm_v2/BBH": 0.4968, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3687, + "hfopenllm_v2/MMLU-PRO": 0.3531 + } + }, + { + "id": "netcat420/MFANN-Llama3.1-Abliterated-Slerp-V3.2", + "name": "MFANN-Llama3.1-Abliterated-Slerp-V3.2", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4128, + "hfopenllm_v2/BBH": 0.4978, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3754, + "hfopenllm_v2/MMLU-PRO": 0.3527 + } + }, + { + "id": "netcat420/MFANN-SFT", + "name": "MFANN-SFT", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3682, + "hfopenllm_v2/BBH": 0.4852, + "hfopenllm_v2/MATH Level 5": 0.0597, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.3725, + "hfopenllm_v2/MMLU-PRO": 0.3336 + } + }, + { + "id": "netcat420/MFANN-abliterated-phi2-merge-unretrained", + "name": "MFANN-abliterated-phi2-merge-unretrained", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3005, + "hfopenllm_v2/BBH": 0.4104, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3183, + "hfopenllm_v2/MMLU-PRO": 0.1478 + } + }, + { + "id": "netcat420/MFANN-llama3.1-Abliterated-SLERP", + "name": "MFANN-llama3.1-Abliterated-SLERP", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2591, + "hfopenllm_v2/BBH": 0.4574, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3809, + "hfopenllm_v2/MMLU-PRO": 0.2928 + } + }, + { + "id": "netcat420/MFANN-llama3.1-abliterated-SLERP-v3", + "name": "MFANN-llama3.1-abliterated-SLERP-v3", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3799, + "hfopenllm_v2/BBH": 0.4931, + "hfopenllm_v2/MATH Level 5": 0.0642, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.366, + "hfopenllm_v2/MMLU-PRO": 0.3531 + } + }, + { + "id": "netcat420/MFANN-llama3.1-abliterated-SLERP-v3.1", + "name": "MFANN-llama3.1-abliterated-SLERP-v3.1", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4202, + "hfopenllm_v2/BBH": 0.4921, + "hfopenllm_v2/MATH Level 5": 0.0695, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3686, + "hfopenllm_v2/MMLU-PRO": 0.3543 + } + }, + { + "id": "netcat420/MFANN-llama3.1-abliterated-v2", + "name": "MFANN-llama3.1-abliterated-v2", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4429, + "hfopenllm_v2/BBH": 0.4941, + "hfopenllm_v2/MATH Level 5": 0.074, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3845, + "hfopenllm_v2/MMLU-PRO": 0.3491 + } + }, + { + "id": "netcat420/MFANN-phigments-slerp-V2", + "name": "MFANN-phigments-slerp-V2", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3232, + "hfopenllm_v2/BBH": 0.4827, + "hfopenllm_v2/MATH Level 5": 0.0317, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.4037, + "hfopenllm_v2/MMLU-PRO": 0.2717 + } + }, + { + "id": "netcat420/MFANN-phigments-slerp-V3.2", + "name": "MFANN-phigments-slerp-V3.2", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3524, + "hfopenllm_v2/BBH": 0.4809, + "hfopenllm_v2/MATH Level 5": 0.0332, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3708, + "hfopenllm_v2/MMLU-PRO": 0.2705 + } + }, + { + "id": "netcat420/MFANN-phigments-slerp-V3.3", + "name": "MFANN-phigments-slerp-V3.3", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3691, + "hfopenllm_v2/BBH": 0.4895, + "hfopenllm_v2/MATH Level 5": 0.0332, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3892, + "hfopenllm_v2/MMLU-PRO": 0.2803 + } + }, + { + "id": "netcat420/MFANN3b", + "name": "MFANN3b", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2524, + "hfopenllm_v2/BBH": 0.4433, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3606, + "hfopenllm_v2/MMLU-PRO": 0.2306 + } + }, + { + "id": "netcat420/MFANN3bv0.15", + "name": "MFANN3bv0.15", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2012, + "hfopenllm_v2/BBH": 0.4539, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3958, + "hfopenllm_v2/MMLU-PRO": 0.2468 + } + }, + { + "id": "netcat420/MFANN3bv0.18", + "name": "MFANN3bv0.18", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2206, + "hfopenllm_v2/BBH": 0.4514, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.4024, + "hfopenllm_v2/MMLU-PRO": 0.25 + } + }, + { + "id": "netcat420/MFANN3bv0.19", + "name": "MFANN3bv0.19", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2258, + "hfopenllm_v2/BBH": 0.4516, + "hfopenllm_v2/MATH Level 5": 0.0227, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.4024, + "hfopenllm_v2/MMLU-PRO": 0.252 + } + }, + { + "id": "netcat420/MFANN3bv0.20", + "name": "MFANN3bv0.20", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2193, + "hfopenllm_v2/BBH": 0.4493, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.4077, + "hfopenllm_v2/MMLU-PRO": 0.25 + } + }, + { + "id": "netcat420/MFANN3bv0.21", + "name": "MFANN3bv0.21", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1909, + "hfopenllm_v2/BBH": 0.447, + "hfopenllm_v2/MATH Level 5": 0.0317, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3759, + "hfopenllm_v2/MMLU-PRO": 0.2393 + } + }, + { + "id": "netcat420/MFANN3bv0.22", + "name": "MFANN3bv0.22", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1979, + "hfopenllm_v2/BBH": 0.4485, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3521, + "hfopenllm_v2/MMLU-PRO": 0.2517 + } + }, + { + "id": "netcat420/MFANN3bv0.23", + "name": "MFANN3bv0.23", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2048, + "hfopenllm_v2/BBH": 0.4495, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3427, + "hfopenllm_v2/MMLU-PRO": 0.2418 + } + }, + { + "id": "netcat420/MFANN3bv0.24", + "name": "MFANN3bv0.24", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.22, + "hfopenllm_v2/BBH": 0.4407, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3521, + "hfopenllm_v2/MMLU-PRO": 0.2352 + } + }, + { + "id": "netcat420/MFANN3bv1.1", + "name": "MFANN3bv1.1", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2507, + "hfopenllm_v2/BBH": 0.3397, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3223, + "hfopenllm_v2/MMLU-PRO": 0.1159 + } + }, + { + "id": "netcat420/MFANN3bv1.2", + "name": "MFANN3bv1.2", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2686, + "hfopenllm_v2/BBH": 0.366, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3156, + "hfopenllm_v2/MMLU-PRO": 0.145 + } + }, + { + "id": "netcat420/MFANN3bv1.3", + "name": "MFANN3bv1.3", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2547, + "hfopenllm_v2/BBH": 0.4456, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3299, + "hfopenllm_v2/MMLU-PRO": 0.2276 + } + }, + { + "id": "netcat420/MFANN3bv1.4", + "name": "MFANN3bv1.4", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3524, + "hfopenllm_v2/BBH": 0.4809, + "hfopenllm_v2/MATH Level 5": 0.037, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3708, + "hfopenllm_v2/MMLU-PRO": 0.2705 + } + }, + { + "id": "netcat420/MFANNv0.19", + "name": "MFANNv0.19", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3057, + "hfopenllm_v2/BBH": 0.4731, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.3527, + "hfopenllm_v2/MMLU-PRO": 0.2473 + } + }, + { + "id": "netcat420/MFANNv0.20", + "name": "MFANNv0.20", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3479, + "hfopenllm_v2/BBH": 0.4574, + "hfopenllm_v2/MATH Level 5": 0.0498, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3874, + "hfopenllm_v2/MMLU-PRO": 0.3202 + } + }, + { + "id": "netcat420/MFANNv0.21", + "name": "MFANNv0.21", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3233, + "hfopenllm_v2/BBH": 0.4576, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.3993, + "hfopenllm_v2/MMLU-PRO": 0.3031 + } + }, + { + "id": "netcat420/MFANNv0.22.1", + "name": "MFANNv0.22.1", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3089, + "hfopenllm_v2/BBH": 0.4661, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3753, + "hfopenllm_v2/MMLU-PRO": 0.3343 + } + }, + { + "id": "netcat420/MFANNv0.23", + "name": "MFANNv0.23", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3127, + "hfopenllm_v2/BBH": 0.4898, + "hfopenllm_v2/MATH Level 5": 0.0498, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.3768, + "hfopenllm_v2/MMLU-PRO": 0.3388 + } + }, + { + "id": "netcat420/MFANNv0.24", + "name": "MFANNv0.24", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3162, + "hfopenllm_v2/BBH": 0.479, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.3754, + "hfopenllm_v2/MMLU-PRO": 0.3348 + } + }, + { + "id": "netcat420/MFANNv0.25", + "name": "MFANNv0.25", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3467, + "hfopenllm_v2/BBH": 0.4794, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3688, + "hfopenllm_v2/MMLU-PRO": 0.3343 + } + }, + { + "id": "netcat420/Qwen2.5-7B-nerd-uncensored-v0.9-MFANN", + "name": "Qwen2.5-7B-nerd-uncensored-v0.9-MFANN", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5878, + "hfopenllm_v2/BBH": 0.5237, + "hfopenllm_v2/MATH Level 5": 0.3376, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3926, + "hfopenllm_v2/MMLU-PRO": 0.3904 + } + }, + { + "id": "netcat420/Qwen2.5-7b-MFANN-slerp", + "name": "Qwen2.5-7b-MFANN-slerp", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6532, + "hfopenllm_v2/BBH": 0.5089, + "hfopenllm_v2/MATH Level 5": 0.287, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4073, + "hfopenllm_v2/MMLU-PRO": 0.3417 + } + }, + { + "id": "netcat420/Qwen2.5-7b-nerd-uncensored-MFANN-slerp", + "name": "Qwen2.5-7b-nerd-uncensored-MFANN-slerp", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1564, + "hfopenllm_v2/BBH": 0.292, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3792, + "hfopenllm_v2/MMLU-PRO": 0.11 + } + }, + { + "id": "netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN", + "name": "Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5742, + "hfopenllm_v2/BBH": 0.5071, + "hfopenllm_v2/MATH Level 5": 0.2568, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4058, + "hfopenllm_v2/MMLU-PRO": 0.3157 + } + }, + { + "id": "netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained", + "name": "Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6486, + "hfopenllm_v2/BBH": 0.5066, + "hfopenllm_v2/MATH Level 5": 0.2991, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4152, + "hfopenllm_v2/MMLU-PRO": 0.3432 + } + }, + { + "id": "netcat420/Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b", + "name": "Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2676, + "hfopenllm_v2/BBH": 0.3789, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2324, + "hfopenllm_v2/MUSR": 0.3528, + "hfopenllm_v2/MMLU-PRO": 0.1677 + } + }, + { + "id": "netcat420/Qwen2.5-MFANN-7b", + "name": "Qwen2.5-MFANN-7b", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6097, + "hfopenllm_v2/BBH": 0.5054, + "hfopenllm_v2/MATH Level 5": 0.2787, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4021, + "hfopenllm_v2/MMLU-PRO": 0.3233 + } + }, + { + "id": "netcat420/qwen2.5-MFANN-7b-SLERP-V1.2", + "name": "qwen2.5-MFANN-7b-SLERP-V1.2", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6606, + "hfopenllm_v2/BBH": 0.5111, + "hfopenllm_v2/MATH Level 5": 0.287, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4259, + "hfopenllm_v2/MMLU-PRO": 0.3438 + } + }, + { + "id": "netcat420/qwen2.5-MFANN-7b-SLERPv1.1", + "name": "qwen2.5-MFANN-7b-SLERPv1.1", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6555, + "hfopenllm_v2/BBH": 0.5075, + "hfopenllm_v2/MATH Level 5": 0.2968, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4126, + "hfopenllm_v2/MMLU-PRO": 0.3448 + } + }, + { + "id": "netcat420/qwen2.5-MFANN-7b-v1.1", + "name": "qwen2.5-MFANN-7b-v1.1", + "developer": "netcat420", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6088, + "hfopenllm_v2/BBH": 0.4967, + "hfopenllm_v2/MATH Level 5": 0.2825, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.4114, + "hfopenllm_v2/MMLU-PRO": 0.3248 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/netease-youdao.json b/data/developers/netease-youdao.json new file mode 100644 index 0000000000000000000000000000000000000000..bdfbc64e891f992717eb6aa262681ba8949f553d --- /dev/null +++ b/data/developers/netease-youdao.json @@ -0,0 +1,19 @@ +{ + "developer": "netease-youdao", + "models": [ + { + "id": "netease-youdao/Confucius-o1-14B", + "name": "Confucius-o1-14B", + "developer": "netease-youdao", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6378, + "hfopenllm_v2/BBH": 0.63, + "hfopenllm_v2/MATH Level 5": 0.4313, + "hfopenllm_v2/GPQA": 0.3649, + "hfopenllm_v2/MUSR": 0.4338, + "hfopenllm_v2/MMLU-PRO": 0.5265 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/newsbang.json b/data/developers/newsbang.json new file mode 100644 index 0000000000000000000000000000000000000000..98443dd516eb0045c4fd3eb3c8012c7292fa86dd --- /dev/null +++ b/data/developers/newsbang.json @@ -0,0 +1,103 @@ +{ + "developer": "newsbang", + "models": [ + { + "id": "newsbang/Homer-7B-v0.1", + "name": "Homer-7B-v0.1", + "developer": "newsbang", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6109, + "hfopenllm_v2/BBH": 0.5601, + "hfopenllm_v2/MATH Level 5": 0.386, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4357, + "hfopenllm_v2/MMLU-PRO": 0.4475 + } + }, + { + "id": "newsbang/Homer-7B-v0.2", + "name": "Homer-7B-v0.2", + "developer": "newsbang", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7494, + "hfopenllm_v2/BBH": 0.5517, + "hfopenllm_v2/MATH Level 5": 0.2477, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.4298, + "hfopenllm_v2/MMLU-PRO": 0.441 + } + }, + { + "id": "newsbang/Homer-v0.3-Qwen2.5-7B", + "name": "Homer-v0.3-Qwen2.5-7B", + "developer": "newsbang", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5154, + "hfopenllm_v2/BBH": 0.5481, + "hfopenllm_v2/MATH Level 5": 0.3089, + "hfopenllm_v2/GPQA": 0.3339, + "hfopenllm_v2/MUSR": 0.4744, + "hfopenllm_v2/MMLU-PRO": 0.4456 + } + }, + { + "id": "newsbang/Homer-v0.4-Qwen2.5-7B", + "name": "Homer-v0.4-Qwen2.5-7B", + "developer": "newsbang", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7999, + "hfopenllm_v2/BBH": 0.5533, + "hfopenllm_v2/MATH Level 5": 0.2779, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.4311, + "hfopenllm_v2/MMLU-PRO": 0.4363 + } + }, + { + "id": "newsbang/Homer-v0.5-Qwen2.5-7B", + "name": "Homer-v0.5-Qwen2.5-7B", + "developer": "newsbang", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7881, + "hfopenllm_v2/BBH": 0.554, + "hfopenllm_v2/MATH Level 5": 0.3724, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4193, + "hfopenllm_v2/MMLU-PRO": 0.4369 + } + }, + { + "id": "newsbang/Homer-v1.0-Qwen2.5-72B", + "name": "Homer-v1.0-Qwen2.5-72B", + "developer": "newsbang", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7628, + "hfopenllm_v2/BBH": 0.731, + "hfopenllm_v2/MATH Level 5": 0.4902, + "hfopenllm_v2/GPQA": 0.4161, + "hfopenllm_v2/MUSR": 0.4677, + "hfopenllm_v2/MMLU-PRO": 0.6145 + } + }, + { + "id": "newsbang/Homer-v1.0-Qwen2.5-7B", + "name": "Homer-v1.0-Qwen2.5-7B", + "developer": "newsbang", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6393, + "hfopenllm_v2/BBH": 0.5655, + "hfopenllm_v2/MATH Level 5": 0.3323, + "hfopenllm_v2/GPQA": 0.3221, + "hfopenllm_v2/MUSR": 0.4278, + "hfopenllm_v2/MMLU-PRO": 0.4535 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/nguyentd.json b/data/developers/nguyentd.json new file mode 100644 index 0000000000000000000000000000000000000000..c3f968c4e67d3da8e4b0f3979310f7e550f561b6 --- /dev/null +++ b/data/developers/nguyentd.json @@ -0,0 +1,19 @@ +{ + "developer": "nguyentd", + "models": [ + { + "id": "nguyentd/FinancialAdvice-Qwen2.5-7B", + "name": "FinancialAdvice-Qwen2.5-7B", + "developer": "nguyentd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4496, + "hfopenllm_v2/BBH": 0.4731, + "hfopenllm_v2/MATH Level 5": 0.1148, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4025, + "hfopenllm_v2/MMLU-PRO": 0.3752 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ngxson.json b/data/developers/ngxson.json new file mode 100644 index 0000000000000000000000000000000000000000..0eb48a0ae55dd8730e6e1ebbdc813d2849fd5a70 --- /dev/null +++ b/data/developers/ngxson.json @@ -0,0 +1,33 @@ +{ + "developer": "ngxson", + "models": [ + { + "id": "ngxson/MiniThinky-1B-Llama-3.2", + "name": "MiniThinky-1B-Llama-3.2", + "developer": "ngxson", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2771, + "hfopenllm_v2/BBH": 0.3142, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.2391, + "hfopenllm_v2/MUSR": 0.3434, + "hfopenllm_v2/MMLU-PRO": 0.1147 + } + }, + { + "id": "ngxson/MiniThinky-v2-1B-Llama-3.2", + "name": "MiniThinky-v2-1B-Llama-3.2", + "developer": "ngxson", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2963, + "hfopenllm_v2/BBH": 0.3205, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2399, + "hfopenllm_v2/MUSR": 0.3356, + "hfopenllm_v2/MMLU-PRO": 0.1116 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/nhyha.json b/data/developers/nhyha.json new file mode 100644 index 0000000000000000000000000000000000000000..63edcc83f0fcde264203184b33fcd20a95023c0a --- /dev/null +++ b/data/developers/nhyha.json @@ -0,0 +1,75 @@ +{ + "developer": "nhyha", + "models": [ + { + "id": "nhyha/N3N_Delirium-v1_1030_0227", + "name": "N3N_Delirium-v1_1030_0227", + "developer": "nhyha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8023, + "hfopenllm_v2/BBH": 0.5891, + "hfopenllm_v2/MATH Level 5": 0.2107, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.4098, + "hfopenllm_v2/MMLU-PRO": 0.415 + } + }, + { + "id": "nhyha/N3N_Llama-3.1-8B-Instruct_1028_0216", + "name": "N3N_Llama-3.1-8B-Instruct_1028_0216", + "developer": "nhyha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4796, + "hfopenllm_v2/BBH": 0.5054, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.405, + "hfopenllm_v2/MMLU-PRO": 0.3638 + } + }, + { + "id": "nhyha/N3N_gemma-2-9b-it_20241029_1532", + "name": "N3N_gemma-2-9b-it_20241029_1532", + "developer": "nhyha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6752, + "hfopenllm_v2/BBH": 0.5863, + "hfopenllm_v2/MATH Level 5": 0.2122, + "hfopenllm_v2/GPQA": 0.3406, + "hfopenllm_v2/MUSR": 0.4594, + "hfopenllm_v2/MMLU-PRO": 0.4122 + } + }, + { + "id": "nhyha/N3N_gemma-2-9b-it_20241110_2026", + "name": "N3N_gemma-2-9b-it_20241110_2026", + "developer": "nhyha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6283, + "hfopenllm_v2/BBH": 0.5867, + "hfopenllm_v2/MATH Level 5": 0.1609, + "hfopenllm_v2/GPQA": 0.3364, + "hfopenllm_v2/MUSR": 0.4073, + "hfopenllm_v2/MMLU-PRO": 0.402 + } + }, + { + "id": "nhyha/merge_Qwen2.5-7B-Instruct_20241023_0314", + "name": "merge_Qwen2.5-7B-Instruct_20241023_0314", + "developer": "nhyha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5695, + "hfopenllm_v2/BBH": 0.5559, + "hfopenllm_v2/MATH Level 5": 0.3542, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4251, + "hfopenllm_v2/MMLU-PRO": 0.4542 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/nicolinho.json b/data/developers/nicolinho.json new file mode 100644 index 0000000000000000000000000000000000000000..551d4a5de698babd0e830b509f51bb11f4dd2ac7 --- /dev/null +++ b/data/developers/nicolinho.json @@ -0,0 +1,67 @@ +{ + "developer": "nicolinho", + "models": [ + { + "id": "nicolinho/QRM-Gemma-2-27B", + "name": "nicolinho/QRM-Gemma-2-27B", + "developer": "nicolinho", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9444, + "reward-bench/Factuality": 0.7853, + "reward-bench/Precise IF": 0.3719, + "reward-bench/Math": 0.6995, + "reward-bench/Safety": 0.927, + "reward-bench/Focus": 0.9535, + "reward-bench/Ties": 0.8321, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.9013, + "reward-bench/Reasoning": 0.9826 + } + }, + { + "id": "nicolinho/QRM-Llama3-8B", + "name": "nicolinho/QRM-Llama3-8B", + "developer": "nicolinho", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.911, + "reward-bench/Chat": 0.9581, + "reward-bench/Chat Hard": 0.8114, + "reward-bench/Safety": 0.8986, + "reward-bench/Reasoning": 0.9758 + } + }, + { + "id": "nicolinho/QRM-Llama3.1-8B", + "name": "nicolinho/QRM-Llama3.1-8B", + "developer": "nicolinho", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9306, + "reward-bench/Chat": 0.9441, + "reward-bench/Chat Hard": 0.8969, + "reward-bench/Safety": 0.923, + "reward-bench/Reasoning": 0.9583 + } + }, + { + "id": "nicolinho/QRM-Llama3.1-8B-v2", + "name": "nicolinho/QRM-Llama3.1-8B-v2", + "developer": "nicolinho", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9314, + "reward-bench/Factuality": 0.6653, + "reward-bench/Precise IF": 0.4062, + "reward-bench/Math": 0.612, + "reward-bench/Safety": 0.9257, + "reward-bench/Focus": 0.8909, + "reward-bench/Ties": 0.7234, + "reward-bench/Chat": 0.9637, + "reward-bench/Chat Hard": 0.8684, + "reward-bench/Reasoning": 0.9677 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/nidum.json b/data/developers/nidum.json new file mode 100644 index 0000000000000000000000000000000000000000..af8e93bb5f8a01abfc8ee7d10496e70d2cff68b5 --- /dev/null +++ b/data/developers/nidum.json @@ -0,0 +1,19 @@ +{ + "developer": "nidum", + "models": [ + { + "id": "nidum/Nidum-Limitless-Gemma-2B", + "name": "Nidum-Limitless-Gemma-2B", + "developer": "nidum", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2424, + "hfopenllm_v2/BBH": 0.3079, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.374, + "hfopenllm_v2/MMLU-PRO": 0.1174 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/nisten.json b/data/developers/nisten.json new file mode 100644 index 0000000000000000000000000000000000000000..7b275a3c64fa267662b1c7ec09c2c6db9c0fbfc6 --- /dev/null +++ b/data/developers/nisten.json @@ -0,0 +1,33 @@ +{ + "developer": "nisten", + "models": [ + { + "id": "nisten/franqwenstein-35b", + "name": "franqwenstein-35b", + "developer": "nisten", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3914, + "hfopenllm_v2/BBH": 0.6591, + "hfopenllm_v2/MATH Level 5": 0.3044, + "hfopenllm_v2/GPQA": 0.3591, + "hfopenllm_v2/MUSR": 0.4681, + "hfopenllm_v2/MMLU-PRO": 0.5611 + } + }, + { + "id": "nisten/tqwendo-36b", + "name": "tqwendo-36b", + "developer": "nisten", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6778, + "hfopenllm_v2/BBH": 0.6432, + "hfopenllm_v2/MATH Level 5": 0.4154, + "hfopenllm_v2/GPQA": 0.3314, + "hfopenllm_v2/MUSR": 0.443, + "hfopenllm_v2/MMLU-PRO": 0.4381 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/nlpguy.json b/data/developers/nlpguy.json new file mode 100644 index 0000000000000000000000000000000000000000..53da7ebbcfa5579e01d9df387a8a0c56ca3eec5d --- /dev/null +++ b/data/developers/nlpguy.json @@ -0,0 +1,131 @@ +{ + "developer": "nlpguy", + "models": [ + { + "id": "nlpguy/Lion-Lamarck-v.1.0.8", + "name": "Lion-Lamarck-v.1.0.8", + "developer": "nlpguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4509, + "hfopenllm_v2/BBH": 0.5869, + "hfopenllm_v2/MATH Level 5": 0.5544, + "hfopenllm_v2/GPQA": 0.3582, + "hfopenllm_v2/MUSR": 0.4673, + "hfopenllm_v2/MMLU-PRO": 0.4643 + } + }, + { + "id": "nlpguy/Lion-Lamarck-v.1.0.9", + "name": "Lion-Lamarck-v.1.0.9", + "developer": "nlpguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3409, + "hfopenllm_v2/BBH": 0.5918, + "hfopenllm_v2/MATH Level 5": 0.5642, + "hfopenllm_v2/GPQA": 0.3901, + "hfopenllm_v2/MUSR": 0.53, + "hfopenllm_v2/MMLU-PRO": 0.4704 + } + }, + { + "id": "nlpguy/Lion-Lamarck-v.1.1.0", + "name": "Lion-Lamarck-v.1.1.0", + "developer": "nlpguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3658, + "hfopenllm_v2/BBH": 0.5962, + "hfopenllm_v2/MATH Level 5": 0.5755, + "hfopenllm_v2/GPQA": 0.3926, + "hfopenllm_v2/MUSR": 0.5325, + "hfopenllm_v2/MMLU-PRO": 0.4631 + } + }, + { + "id": "nlpguy/Miisce-one", + "name": "Miisce-one", + "developer": "nlpguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6066, + "hfopenllm_v2/BBH": 0.6505, + "hfopenllm_v2/MATH Level 5": 0.4169, + "hfopenllm_v2/GPQA": 0.3859, + "hfopenllm_v2/MUSR": 0.482, + "hfopenllm_v2/MMLU-PRO": 0.5412 + } + }, + { + "id": "nlpguy/Mistral-NeMo-Minitron-Upscale-v1", + "name": "Mistral-NeMo-Minitron-Upscale-v1", + "developer": "nlpguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1648, + "hfopenllm_v2/BBH": 0.4468, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3804, + "hfopenllm_v2/MMLU-PRO": 0.2537 + } + }, + { + "id": "nlpguy/Mistral-NeMo-Minitron-Upscale-v2", + "name": "Mistral-NeMo-Minitron-Upscale-v2", + "developer": "nlpguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1573, + "hfopenllm_v2/BBH": 0.395, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3791, + "hfopenllm_v2/MMLU-PRO": 0.1927 + } + }, + { + "id": "nlpguy/Mistral-NeMo-Minitron-Upscale-v3", + "name": "Mistral-NeMo-Minitron-Upscale-v3", + "developer": "nlpguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1412, + "hfopenllm_v2/BBH": 0.3052, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.4098, + "hfopenllm_v2/MMLU-PRO": 0.1171 + } + }, + { + "id": "nlpguy/StableProse", + "name": "StableProse", + "developer": "nlpguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1972, + "hfopenllm_v2/BBH": 0.5117, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4067, + "hfopenllm_v2/MMLU-PRO": 0.3468 + } + }, + { + "id": "nlpguy/StarFusion-alpha1", + "name": "StarFusion-alpha1", + "developer": "nlpguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.566, + "hfopenllm_v2/BBH": 0.4429, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4081, + "hfopenllm_v2/MMLU-PRO": 0.3191 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/noname0202.json b/data/developers/noname0202.json new file mode 100644 index 0000000000000000000000000000000000000000..fd20ef1a079ab350467e00901ddc19e0f0a28101 --- /dev/null +++ b/data/developers/noname0202.json @@ -0,0 +1,117 @@ +{ + "developer": "noname0202", + "models": [ + { + "id": "noname0202/Llama-3.2-4x3B-Instruct", + "name": "Llama-3.2-4x3B-Instruct", + "developer": "noname0202", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7067, + "hfopenllm_v2/BBH": 0.4647, + "hfopenllm_v2/MATH Level 5": 0.1586, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3674, + "hfopenllm_v2/MMLU-PRO": 0.3285 + } + }, + { + "id": "noname0202/gemma-2-2b-it-ties", + "name": "gemma-2-2b-it-ties", + "developer": "noname0202", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1266, + "hfopenllm_v2/BBH": 0.4206, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3929, + "hfopenllm_v2/MMLU-PRO": 0.2561 + } + }, + { + "id": "noname0202/gemma-2-9b-sft-jp-en-zh-v1", + "name": "gemma-2-9b-sft-jp-en-zh-v1", + "developer": "noname0202", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2988, + "hfopenllm_v2/BBH": 0.4519, + "hfopenllm_v2/MATH Level 5": 0.0891, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.408, + "hfopenllm_v2/MMLU-PRO": 0.3125 + } + }, + { + "id": "noname0202/gemma-2-9b-sft-jp-en-zh-v2", + "name": "gemma-2-9b-sft-jp-en-zh-v2", + "developer": "noname0202", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3993, + "hfopenllm_v2/BBH": 0.4515, + "hfopenllm_v2/MATH Level 5": 0.1042, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3612, + "hfopenllm_v2/MMLU-PRO": 0.3675 + } + }, + { + "id": "noname0202/llama-math-1b-r16-0to512tokens-test", + "name": "llama-math-1b-r16-0to512tokens-test", + "developer": "noname0202", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.547, + "hfopenllm_v2/BBH": 0.3488, + "hfopenllm_v2/MATH Level 5": 0.0816, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3143, + "hfopenllm_v2/MMLU-PRO": 0.1728 + } + }, + { + "id": "noname0202/llama-math-1b-r32-0to512tokens-test", + "name": "llama-math-1b-r32-0to512tokens-test", + "developer": "noname0202", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5683, + "hfopenllm_v2/BBH": 0.3495, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3209, + "hfopenllm_v2/MMLU-PRO": 0.176 + } + }, + { + "id": "noname0202/llama-math-1b-r32-test", + "name": "llama-math-1b-r32-test", + "developer": "noname0202", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5819, + "hfopenllm_v2/BBH": 0.3486, + "hfopenllm_v2/MATH Level 5": 0.0725, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3156, + "hfopenllm_v2/MMLU-PRO": 0.1781 + } + }, + { + "id": "noname0202/llama-math-1b-r8-512tokens-test", + "name": "llama-math-1b-r8-512tokens-test", + "developer": "noname0202", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5792, + "hfopenllm_v2/BBH": 0.3496, + "hfopenllm_v2/MATH Level 5": 0.0816, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3169, + "hfopenllm_v2/MMLU-PRO": 0.1753 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/notbdq.json b/data/developers/notbdq.json new file mode 100644 index 0000000000000000000000000000000000000000..de7f9a0a6578459c2cee836077e08bc53f540af8 --- /dev/null +++ b/data/developers/notbdq.json @@ -0,0 +1,19 @@ +{ + "developer": "notbdq", + "models": [ + { + "id": "notbdq/Qwen2.5-14B-Instruct-1M-GRPO-Reasoning", + "name": "Qwen2.5-14B-Instruct-1M-GRPO-Reasoning", + "developer": "notbdq", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8414, + "hfopenllm_v2/BBH": 0.6198, + "hfopenllm_v2/MATH Level 5": 0.5302, + "hfopenllm_v2/GPQA": 0.3431, + "hfopenllm_v2/MUSR": 0.418, + "hfopenllm_v2/MMLU-PRO": 0.485 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/nothingiisreal.json b/data/developers/nothingiisreal.json new file mode 100644 index 0000000000000000000000000000000000000000..92ab56bfd7ff6fc5c35e22c47a69c430dbab00d9 --- /dev/null +++ b/data/developers/nothingiisreal.json @@ -0,0 +1,47 @@ +{ + "developer": "nothingiisreal", + "models": [ + { + "id": "nothingiisreal/L3.1-8B-Celeste-V1.5", + "name": "L3.1-8B-Celeste-V1.5", + "developer": "nothingiisreal", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7327, + "hfopenllm_v2/BBH": 0.5012, + "hfopenllm_v2/MATH Level 5": 0.1465, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.3749, + "hfopenllm_v2/MMLU-PRO": 0.3704 + } + }, + { + "id": "nothingiisreal/MN-12B-Starcannon-v2", + "name": "MN-12B-Starcannon-v2", + "developer": "nothingiisreal", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3925, + "hfopenllm_v2/BBH": 0.5004, + "hfopenllm_v2/MATH Level 5": 0.0597, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.3978, + "hfopenllm_v2/MMLU-PRO": 0.3128 + } + }, + { + "id": "nothingiisreal/MN-12B-Starcannon-v3", + "name": "MN-12B-Starcannon-v3", + "developer": "nothingiisreal", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3807, + "hfopenllm_v2/BBH": 0.5171, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.4046, + "hfopenllm_v2/MMLU-PRO": 0.3265 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/nvidia.json b/data/developers/nvidia.json new file mode 100644 index 0000000000000000000000000000000000000000..ada6763b040c2b056dfed4d5f3cce1038be7c6e8 --- /dev/null +++ b/data/developers/nvidia.json @@ -0,0 +1,296 @@ +{ + "developer": "nvidia", + "models": [ + { + "id": "nvidia/AceInstruct-1.5B", + "name": "AceInstruct-1.5B", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3948, + "hfopenllm_v2/BBH": 0.3932, + "hfopenllm_v2/MATH Level 5": 0.3127, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.346, + "hfopenllm_v2/MMLU-PRO": 0.2574 + } + }, + { + "id": "nvidia/AceInstruct-72B", + "name": "AceInstruct-72B", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7119, + "hfopenllm_v2/BBH": 0.6139, + "hfopenllm_v2/MATH Level 5": 0.6261, + "hfopenllm_v2/GPQA": 0.3213, + "hfopenllm_v2/MUSR": 0.4206, + "hfopenllm_v2/MMLU-PRO": 0.4874 + } + }, + { + "id": "nvidia/AceInstruct-7B", + "name": "AceInstruct-7B", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5422, + "hfopenllm_v2/BBH": 0.5501, + "hfopenllm_v2/MATH Level 5": 0.5295, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4255, + "hfopenllm_v2/MMLU-PRO": 0.4177 + } + }, + { + "id": "nvidia/AceMath-1.5B-Instruct", + "name": "AceMath-1.5B-Instruct", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3212, + "hfopenllm_v2/BBH": 0.4024, + "hfopenllm_v2/MATH Level 5": 0.5287, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3607, + "hfopenllm_v2/MMLU-PRO": 0.2064 + } + }, + { + "id": "nvidia/AceMath-72B-Instruct", + "name": "AceMath-72B-Instruct", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.495, + "hfopenllm_v2/BBH": 0.6402, + "hfopenllm_v2/MATH Level 5": 0.7145, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.4062, + "hfopenllm_v2/MMLU-PRO": 0.4411 + } + }, + { + "id": "nvidia/AceMath-72B-RM", + "name": "AceMath-72B-RM", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1413, + "hfopenllm_v2/BBH": 0.2717, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2341, + "hfopenllm_v2/MUSR": 0.3351, + "hfopenllm_v2/MMLU-PRO": 0.1179 + } + }, + { + "id": "nvidia/AceMath-7B-Instruct", + "name": "AceMath-7B-Instruct", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4532, + "hfopenllm_v2/BBH": 0.4994, + "hfopenllm_v2/MATH Level 5": 0.6337, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4193, + "hfopenllm_v2/MMLU-PRO": 0.3383 + } + }, + { + "id": "nvidia/AceMath-7B-RM", + "name": "AceMath-7B-RM", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1494, + "hfopenllm_v2/BBH": 0.2423, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.358, + "hfopenllm_v2/MMLU-PRO": 0.1139 + } + }, + { + "id": "nvidia/Hymba-1.5B-Base", + "name": "Hymba-1.5B-Base", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2295, + "hfopenllm_v2/BBH": 0.3256, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3566, + "hfopenllm_v2/MMLU-PRO": 0.1922 + } + }, + { + "id": "nvidia/Hymba-1.5B-Instruct", + "name": "Hymba-1.5B-Instruct", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6009, + "hfopenllm_v2/BBH": 0.3067, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3316, + "hfopenllm_v2/MMLU-PRO": 0.204 + } + }, + { + "id": "nvidia/Llama-3.1-Minitron-4B-Depth-Base", + "name": "Llama-3.1-Minitron-4B-Depth-Base", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1607, + "hfopenllm_v2/BBH": 0.4171, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.4011, + "hfopenllm_v2/MMLU-PRO": 0.2798 + } + }, + { + "id": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", + "name": "Llama-3.1-Nemotron-70B-Instruct-HF", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7381, + "hfopenllm_v2/BBH": 0.6316, + "hfopenllm_v2/MATH Level 5": 0.4267, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.4328, + "hfopenllm_v2/MMLU-PRO": 0.4919 + } + }, + { + "id": "nvidia/Llama-3.1-Nemotron-70B-Reward", + "name": "nvidia/Llama-3.1-Nemotron-70B-Reward", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.9411, + "reward-bench/Chat": 0.9749, + "reward-bench/Chat Hard": 0.8575, + "reward-bench/Safety": 0.9514, + "reward-bench/Reasoning": 0.9807 + } + }, + { + "id": "nvidia/Llama3-70B-SteerLM-RM", + "name": "nvidia/Llama3-70B-SteerLM-RM", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8877, + "reward-bench/Chat": 0.9134, + "reward-bench/Chat Hard": 0.8026, + "reward-bench/Safety": 0.9284, + "reward-bench/Reasoning": 0.9064 + } + }, + { + "id": "nvidia/Minitron-4B-Base", + "name": "Minitron-4B-Base", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2218, + "hfopenllm_v2/BBH": 0.4084, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.4134, + "hfopenllm_v2/MMLU-PRO": 0.262 + } + }, + { + "id": "nvidia/Minitron-8B-Base", + "name": "Minitron-8B-Base", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2424, + "hfopenllm_v2/BBH": 0.4395, + "hfopenllm_v2/MATH Level 5": 0.0257, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.4026, + "hfopenllm_v2/MMLU-PRO": 0.3181 + } + }, + { + "id": "nvidia/Mistral-NeMo-Minitron-8B-Base", + "name": "Mistral-NeMo-Minitron-8B-Base", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1946, + "hfopenllm_v2/BBH": 0.5219, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.3255, + "hfopenllm_v2/MUSR": 0.4092, + "hfopenllm_v2/MMLU-PRO": 0.3796 + } + }, + { + "id": "nvidia/Mistral-NeMo-Minitron-8B-Instruct", + "name": "Mistral-NeMo-Minitron-8B-Instruct", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5004, + "hfopenllm_v2/BBH": 0.5321, + "hfopenllm_v2/MATH Level 5": 0.1163, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3886, + "hfopenllm_v2/MMLU-PRO": 0.3991 + } + }, + { + "id": "nvidia/Nemotron-4-340B-Reward", + "name": "nvidia/Nemotron-4-340B-Reward", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.92, + "reward-bench/Chat": 0.9581, + "reward-bench/Chat Hard": 0.8706, + "reward-bench/Safety": 0.9149, + "reward-bench/Reasoning": 0.9363 + } + }, + { + "id": "nvidia/Nemotron-Mini-4B-Instruct", + "name": "Nemotron-Mini-4B-Instruct", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6669, + "hfopenllm_v2/BBH": 0.3865, + "hfopenllm_v2/MATH Level 5": 0.0257, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3767, + "hfopenllm_v2/MMLU-PRO": 0.2626 + } + }, + { + "id": "nvidia/OpenMath2-Llama3.1-8B", + "name": "OpenMath2-Llama3.1-8B", + "developer": "nvidia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2331, + "hfopenllm_v2/BBH": 0.4096, + "hfopenllm_v2/MATH Level 5": 0.2674, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3436, + "hfopenllm_v2/MMLU-PRO": 0.1553 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/nxmwxm.json b/data/developers/nxmwxm.json new file mode 100644 index 0000000000000000000000000000000000000000..4bfd5d6679d4e288a25c6fc080fcde8e2ac4ee73 --- /dev/null +++ b/data/developers/nxmwxm.json @@ -0,0 +1,19 @@ +{ + "developer": "nxmwxm", + "models": [ + { + "id": "nxmwxm/Beast-Soul-new", + "name": "Beast-Soul-new", + "developer": "nxmwxm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4869, + "hfopenllm_v2/BBH": 0.5227, + "hfopenllm_v2/MATH Level 5": 0.074, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4459, + "hfopenllm_v2/MMLU-PRO": 0.3102 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/occiglot.json b/data/developers/occiglot.json new file mode 100644 index 0000000000000000000000000000000000000000..504d6f906b1432668570bd83990235a541e2e8b3 --- /dev/null +++ b/data/developers/occiglot.json @@ -0,0 +1,19 @@ +{ + "developer": "occiglot", + "models": [ + { + "id": "occiglot/occiglot-7b-es-en-instruct", + "name": "occiglot-7b-es-en-instruct", + "developer": "occiglot", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3485, + "hfopenllm_v2/BBH": 0.4111, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3738, + "hfopenllm_v2/MMLU-PRO": 0.2311 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/odyssey-labs.json b/data/developers/odyssey-labs.json new file mode 100644 index 0000000000000000000000000000000000000000..1e0ac40fc81dd082be186a296fb3d45f66972844 --- /dev/null +++ b/data/developers/odyssey-labs.json @@ -0,0 +1,19 @@ +{ + "developer": "odyssey-labs", + "models": [ + { + "id": "odyssey-labs/Astral-1-10B", + "name": "Astral-1-10B", + "developer": "odyssey-labs", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3878, + "hfopenllm_v2/BBH": 0.4873, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.428, + "hfopenllm_v2/MMLU-PRO": 0.2985 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/olabs-ai.json b/data/developers/olabs-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..9c8f5e5288b93827b623178356c89affb926a845 --- /dev/null +++ b/data/developers/olabs-ai.json @@ -0,0 +1,19 @@ +{ + "developer": "olabs-ai", + "models": [ + { + "id": "olabs-ai/reflection_model", + "name": "reflection_model", + "developer": "olabs-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1599, + "hfopenllm_v2/BBH": 0.4713, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.3508, + "hfopenllm_v2/MMLU-PRO": 0.3311 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ontocord.json b/data/developers/ontocord.json new file mode 100644 index 0000000000000000000000000000000000000000..26bf0b3ef88f6bd021717109acdee1567c9357e3 --- /dev/null +++ b/data/developers/ontocord.json @@ -0,0 +1,453 @@ +{ + "developer": "ontocord", + "models": [ + { + "id": "ontocord/Llama_3.2_1b-autoredteam_helpfulness-train", + "name": "Llama_3.2_1b-autoredteam_helpfulness-train", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2765, + "hfopenllm_v2/BBH": 0.3115, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3459, + "hfopenllm_v2/MMLU-PRO": 0.1132 + } + }, + { + "id": "ontocord/RedPajama-3B-v1-AutoRedteam", + "name": "RedPajama-3B-v1-AutoRedteam", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1343, + "hfopenllm_v2/BBH": 0.3026, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2424, + "hfopenllm_v2/MUSR": 0.3661, + "hfopenllm_v2/MMLU-PRO": 0.1108 + } + }, + { + "id": "ontocord/RedPajama-3B-v1-AutoRedteam-Harmless-only", + "name": "RedPajama-3B-v1-AutoRedteam-Harmless-only", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1525, + "hfopenllm_v2/BBH": 0.3124, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.2315, + "hfopenllm_v2/MUSR": 0.3661, + "hfopenllm_v2/MMLU-PRO": 0.11 + } + }, + { + "id": "ontocord/RedPajama3b_v1-autoredteam_helpfulness-train", + "name": "RedPajama3b_v1-autoredteam_helpfulness-train", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2848, + "hfopenllm_v2/BBH": 0.3093, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.358, + "hfopenllm_v2/MMLU-PRO": 0.1107 + } + }, + { + "id": "ontocord/merged_0.2_expert_0.8", + "name": "merged_0.2_expert_0.8", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1743, + "hfopenllm_v2/BBH": 0.3046, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3621, + "hfopenllm_v2/MMLU-PRO": 0.1111 + } + }, + { + "id": "ontocord/merged_0.2_expert_0.8-stack_2x", + "name": "merged_0.2_expert_0.8-stack_2x", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1796, + "hfopenllm_v2/BBH": 0.3006, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3541, + "hfopenllm_v2/MMLU-PRO": 0.1103 + } + }, + { + "id": "ontocord/merged_0.5_expert_0.5", + "name": "merged_0.5_expert_0.5", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1787, + "hfopenllm_v2/BBH": 0.3017, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3542, + "hfopenllm_v2/MMLU-PRO": 0.1108 + } + }, + { + "id": "ontocord/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful", + "name": "ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1318, + "hfopenllm_v2/BBH": 0.3004, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3631, + "hfopenllm_v2/MMLU-PRO": 0.1142 + } + }, + { + "id": "ontocord/ontocord_wide_7b-stacked-stage1", + "name": "ontocord_wide_7b-stacked-stage1", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1485, + "hfopenllm_v2/BBH": 0.2897, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3604, + "hfopenllm_v2/MMLU-PRO": 0.1105 + } + }, + { + "id": "ontocord/ontocord_wide_7b-stacked-stage1-instruct", + "name": "ontocord_wide_7b-stacked-stage1-instruct", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.153, + "hfopenllm_v2/BBH": 0.2854, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2466, + "hfopenllm_v2/MUSR": 0.3538, + "hfopenllm_v2/MMLU-PRO": 0.1117 + } + }, + { + "id": "ontocord/starcoder2-29b-ls", + "name": "starcoder2-29b-ls", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2149, + "hfopenllm_v2/BBH": 0.3735, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.37, + "hfopenllm_v2/MMLU-PRO": 0.1869 + } + }, + { + "id": "ontocord/starcoder2_3b-AutoRedteam", + "name": "starcoder2_3b-AutoRedteam", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1574, + "hfopenllm_v2/BBH": 0.3498, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.3646, + "hfopenllm_v2/MMLU-PRO": 0.1336 + } + }, + { + "id": "ontocord/wide_3b-merge_test", + "name": "wide_3b-merge_test", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1763, + "hfopenllm_v2/BBH": 0.3011, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2399, + "hfopenllm_v2/MUSR": 0.342, + "hfopenllm_v2/MMLU-PRO": 0.1066 + } + }, + { + "id": "ontocord/wide_3b-stage1_shuf_sample1_jsonl-pretrained", + "name": "wide_3b-stage1_shuf_sample1_jsonl-pretrained", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1395, + "hfopenllm_v2/BBH": 0.3004, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3632, + "hfopenllm_v2/MMLU-PRO": 0.114 + } + }, + { + "id": "ontocord/wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge", + "name": "wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1664, + "hfopenllm_v2/BBH": 0.3031, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3845, + "hfopenllm_v2/MMLU-PRO": 0.1111 + } + }, + { + "id": "ontocord/wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge", + "name": "wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1697, + "hfopenllm_v2/BBH": 0.2975, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3778, + "hfopenllm_v2/MMLU-PRO": 0.1125 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.148, + "hfopenllm_v2/BBH": 0.3095, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3579, + "hfopenllm_v2/MMLU-PRO": 0.1108 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1237, + "hfopenllm_v2/BBH": 0.306, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3673, + "hfopenllm_v2/MMLU-PRO": 0.1111 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1192, + "hfopenllm_v2/BBH": 0.2956, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3553, + "hfopenllm_v2/MMLU-PRO": 0.1183 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1162, + "hfopenllm_v2/BBH": 0.3184, + "hfopenllm_v2/MATH Level 5": 0.0076, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3447, + "hfopenllm_v2/MMLU-PRO": 0.1124 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1317, + "hfopenllm_v2/BBH": 0.3064, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3446, + "hfopenllm_v2/MMLU-PRO": 0.1144 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1182, + "hfopenllm_v2/BBH": 0.3037, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3567, + "hfopenllm_v2/MMLU-PRO": 0.1162 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.124, + "hfopenllm_v2/BBH": 0.3032, + "hfopenllm_v2/MATH Level 5": 0.0076, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3487, + "hfopenllm_v2/MMLU-PRO": 0.1128 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_math.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_math.no_issue", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1298, + "hfopenllm_v2/BBH": 0.3052, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3928, + "hfopenllm_v2/MMLU-PRO": 0.1147 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue", + "name": "wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2049, + "hfopenllm_v2/BBH": 0.2912, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3575, + "hfopenllm_v2/MMLU-PRO": 0.1167 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical", + "name": "wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1461, + "hfopenllm_v2/BBH": 0.2998, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3926, + "hfopenllm_v2/MMLU-PRO": 0.1141 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_formatted_text", + "name": "wide_3b_sft_stage1.2-ss1-expert_formatted_text", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1487, + "hfopenllm_v2/BBH": 0.3069, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3474, + "hfopenllm_v2/MMLU-PRO": 0.1146 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_how-to", + "name": "wide_3b_sft_stage1.2-ss1-expert_how-to", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1245, + "hfopenllm_v2/BBH": 0.3047, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3658, + "hfopenllm_v2/MMLU-PRO": 0.1153 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_math", + "name": "wide_3b_sft_stage1.2-ss1-expert_math", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1915, + "hfopenllm_v2/BBH": 0.306, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.37, + "hfopenllm_v2/MMLU-PRO": 0.1092 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_news", + "name": "wide_3b_sft_stage1.2-ss1-expert_news", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1658, + "hfopenllm_v2/BBH": 0.2926, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3621, + "hfopenllm_v2/MMLU-PRO": 0.1111 + } + }, + { + "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_software", + "name": "wide_3b_sft_stage1.2-ss1-expert_software", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1734, + "hfopenllm_v2/BBH": 0.298, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3569, + "hfopenllm_v2/MMLU-PRO": 0.114 + } + }, + { + "id": "ontocord/wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked", + "name": "wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked", + "developer": "ontocord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1244, + "hfopenllm_v2/BBH": 0.3026, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3686, + "hfopenllm_v2/MMLU-PRO": 0.1115 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/oobabooga.json b/data/developers/oobabooga.json new file mode 100644 index 0000000000000000000000000000000000000000..d6cce322b760eafcdd9d8d0a8f78e5eee9704c01 --- /dev/null +++ b/data/developers/oobabooga.json @@ -0,0 +1,19 @@ +{ + "developer": "oobabooga", + "models": [ + { + "id": "oobabooga/CodeBooga-34B-v0.1", + "name": "CodeBooga-34B-v0.1", + "developer": "oobabooga", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.525, + "hfopenllm_v2/BBH": 0.3427, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.431, + "hfopenllm_v2/MMLU-PRO": 0.236 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/oopere.json b/data/developers/oopere.json new file mode 100644 index 0000000000000000000000000000000000000000..71d39c6c35063e6c97b2a688c804f115030e73dd --- /dev/null +++ b/data/developers/oopere.json @@ -0,0 +1,131 @@ +{ + "developer": "oopere", + "models": [ + { + "id": "oopere/Llama-FinSent-S", + "name": "Llama-FinSent-S", + "developer": "oopere", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2164, + "hfopenllm_v2/BBH": 0.3169, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3832, + "hfopenllm_v2/MMLU-PRO": 0.1134 + } + }, + { + "id": "oopere/pruned10-llama-3.2-3B", + "name": "pruned10-llama-3.2-3B", + "developer": "oopere", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1776, + "hfopenllm_v2/BBH": 0.334, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3722, + "hfopenllm_v2/MMLU-PRO": 0.164 + } + }, + { + "id": "oopere/pruned20-llama-1b", + "name": "pruned20-llama-1b", + "developer": "oopere", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1994, + "hfopenllm_v2/BBH": 0.3031, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3631, + "hfopenllm_v2/MMLU-PRO": 0.1123 + } + }, + { + "id": "oopere/pruned20-llama-3.2-3b", + "name": "pruned20-llama-3.2-3b", + "developer": "oopere", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1789, + "hfopenllm_v2/BBH": 0.3248, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3418, + "hfopenllm_v2/MMLU-PRO": 0.128 + } + }, + { + "id": "oopere/pruned40-llama-1b", + "name": "pruned40-llama-1b", + "developer": "oopere", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2284, + "hfopenllm_v2/BBH": 0.2969, + "hfopenllm_v2/MATH Level 5": 0.0076, + "hfopenllm_v2/GPQA": 0.2433, + "hfopenllm_v2/MUSR": 0.4287, + "hfopenllm_v2/MMLU-PRO": 0.1082 + } + }, + { + "id": "oopere/pruned40-llama-3.2-1B", + "name": "pruned40-llama-3.2-1B", + "developer": "oopere", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2266, + "hfopenllm_v2/BBH": 0.2982, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.4352, + "hfopenllm_v2/MMLU-PRO": 0.1115 + } + }, + { + "id": "oopere/pruned40-llama-3.2-3b", + "name": "pruned40-llama-3.2-3b", + "developer": "oopere", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2183, + "hfopenllm_v2/BBH": 0.3167, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2299, + "hfopenllm_v2/MUSR": 0.3539, + "hfopenllm_v2/MMLU-PRO": 0.1177 + } + }, + { + "id": "oopere/pruned60-llama-1b", + "name": "pruned60-llama-1b", + "developer": "oopere", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1829, + "hfopenllm_v2/BBH": 0.3016, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.4088, + "hfopenllm_v2/MMLU-PRO": 0.1173 + } + }, + { + "id": "oopere/pruned60-llama-3.2-3b", + "name": "pruned60-llama-3.2-3b", + "developer": "oopere", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1825, + "hfopenllm_v2/BBH": 0.3166, + "hfopenllm_v2/MATH Level 5": 0.0038, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3633, + "hfopenllm_v2/MMLU-PRO": 0.1131 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/open-atlas.json b/data/developers/open-atlas.json new file mode 100644 index 0000000000000000000000000000000000000000..2b4ab3cf42ca20f6291a93804bff62a5bc01d90b --- /dev/null +++ b/data/developers/open-atlas.json @@ -0,0 +1,33 @@ +{ + "developer": "open-atlas", + "models": [ + { + "id": "open-atlas/Atlas-Flash-1.5B-Preview", + "name": "Atlas-Flash-1.5B-Preview", + "developer": "open-atlas", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.327, + "hfopenllm_v2/BBH": 0.3215, + "hfopenllm_v2/MATH Level 5": 0.2213, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3488, + "hfopenllm_v2/MMLU-PRO": 0.1374 + } + }, + { + "id": "open-atlas/Atlas-Flash-7B-Preview", + "name": "Atlas-Flash-7B-Preview", + "developer": "open-atlas", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3908, + "hfopenllm_v2/BBH": 0.3542, + "hfopenllm_v2/MATH Level 5": 0.2576, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3836, + "hfopenllm_v2/MMLU-PRO": 0.2784 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/open-neo.json b/data/developers/open-neo.json new file mode 100644 index 0000000000000000000000000000000000000000..d3e72f4e3df9575c45fa609e3634c93986b615ee --- /dev/null +++ b/data/developers/open-neo.json @@ -0,0 +1,33 @@ +{ + "developer": "open-neo", + "models": [ + { + "id": "open-neo/Kyro-n1-3B", + "name": "Kyro-n1-3B", + "developer": "open-neo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4595, + "hfopenllm_v2/BBH": 0.4685, + "hfopenllm_v2/MATH Level 5": 0.2855, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4088, + "hfopenllm_v2/MMLU-PRO": 0.3423 + } + }, + { + "id": "open-neo/Kyro-n1-7B", + "name": "Kyro-n1-7B", + "developer": "open-neo", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5573, + "hfopenllm_v2/BBH": 0.5387, + "hfopenllm_v2/MATH Level 5": 0.3897, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3884, + "hfopenllm_v2/MMLU-PRO": 0.4333 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/open-thoughts.json b/data/developers/open-thoughts.json new file mode 100644 index 0000000000000000000000000000000000000000..1770bb79eb3160de246e6849b9e6cbf81aced3e6 --- /dev/null +++ b/data/developers/open-thoughts.json @@ -0,0 +1,19 @@ +{ + "developer": "open-thoughts", + "models": [ + { + "id": "open-thoughts/OpenThinker-7B", + "name": "OpenThinker-7B", + "developer": "open-thoughts", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4089, + "hfopenllm_v2/BBH": 0.5343, + "hfopenllm_v2/MATH Level 5": 0.426, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.382, + "hfopenllm_v2/MMLU-PRO": 0.4165 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/openai-community.json b/data/developers/openai-community.json new file mode 100644 index 0000000000000000000000000000000000000000..b65e0a6d2ad9fcf571ab1f95656f2b9e939df55e --- /dev/null +++ b/data/developers/openai-community.json @@ -0,0 +1,61 @@ +{ + "developer": "openai-community", + "models": [ + { + "id": "openai-community/gpt2", + "name": "gpt2", + "developer": "openai-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.178, + "hfopenllm_v2/BBH": 0.3017, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.439, + "hfopenllm_v2/MMLU-PRO": 0.1165 + } + }, + { + "id": "openai-community/gpt2-large", + "name": "gpt2-large", + "developer": "openai-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2048, + "hfopenllm_v2/BBH": 0.3069, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3789, + "hfopenllm_v2/MMLU-PRO": 0.1142 + } + }, + { + "id": "openai-community/gpt2-medium", + "name": "gpt2-medium", + "developer": "openai-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2208, + "hfopenllm_v2/BBH": 0.305, + "hfopenllm_v2/MATH Level 5": 0.0076, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3884, + "hfopenllm_v2/MMLU-PRO": 0.1182 + } + }, + { + "id": "openai-community/gpt2-xl", + "name": "gpt2-xl", + "developer": "openai-community", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2039, + "hfopenllm_v2/BBH": 0.3009, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.371, + "hfopenllm_v2/MMLU-PRO": 0.1131 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/openai.json b/data/developers/openai.json new file mode 100644 index 0000000000000000000000000000000000000000..b8a74d2a5687da3273bb20273947cddb36eec00c --- /dev/null +++ b/data/developers/openai.json @@ -0,0 +1,1250 @@ +{ + "developer": "openai", + "models": [ + { + "id": "openai/GPT 4o", + "name": "GPT 4o", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-v1/Overall Score": 0.359 + } + }, + { + "id": "openai/GPT 5", + "name": "GPT 5", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "ace/Overall Score": 0.561, + "ace/DIY Score": 0.55, + "ace/Food Score": 0.7, + "ace/Gaming Score": 0.575, + "apex-agents/Overall Pass@1": 0.183, + "apex-agents/Overall Pass@8": 0.31, + "apex-agents/Overall Mean Score": 0.329, + "apex-agents/Investment Banking Pass@1": 0.273, + "apex-agents/Management Consulting Pass@1": 0.123, + "apex-agents/Corporate Law Pass@1": 0.153, + "apex-agents/Corporate Lawyer Mean Score": 0.382, + "apex-v1/Overall Score": 0.67, + "apex-v1/Big Law Score": 0.78, + "apex-v1/Medicine (MD) Score": 0.66, + "apex-v1/Investment Banking Score": 0.61 + } + }, + { + "id": "openai/GPT 5 Codex", + "name": "GPT 5 Codex", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Corporate Lawyer Mean Score": 0.362 + } + }, + { + "id": "openai/GPT 5.1", + "name": "GPT 5.1", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "ace/Overall Score": 0.551, + "ace/DIY Score": 0.56, + "ace/Gaming Score": 0.61, + "ace/Shopping Score": 0.45, + "apex-agents/Corporate Lawyer Mean Score": 0.376, + "apex-v1/Big Law Score": 0.77 + } + }, + { + "id": "openai/GPT 5.1 Codex", + "name": "GPT 5.1 Codex", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Corporate Lawyer Mean Score": 0.366 + } + }, + { + "id": "openai/GPT 5.2", + "name": "GPT 5.2", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "ace/Overall Score": 0.515, + "ace/Food Score": 0.65, + "ace/Gaming Score": 0.578, + "apex-agents/Overall Pass@1": 0.23, + "apex-agents/Overall Pass@8": 0.4, + "apex-agents/Overall Mean Score": 0.387, + "apex-agents/Investment Banking Pass@1": 0.273, + "apex-agents/Management Consulting Pass@1": 0.227, + "apex-agents/Corporate Law Pass@1": 0.189, + "apex-agents/Corporate Lawyer Mean Score": 0.443 + } + }, + { + "id": "openai/GPT 5.2 Codex", + "name": "GPT 5.2 Codex", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Overall Pass@1": 0.276, + "apex-agents/Corporate Lawyer Mean Score": 0.394 + } + }, + { + "id": "openai/GPT 5.2 Pro", + "name": "GPT 5.2 Pro", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-v1/Overall Score": 0.668, + "apex-v1/Consulting Score": 0.64, + "apex-v1/Medicine (MD) Score": 0.65, + "apex-v1/Investment Banking Score": 0.64 + } + }, + { + "id": "openai/GPT 5.3 Codex", + "name": "GPT 5.3 Codex", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Overall Pass@1": 0.317 + } + }, + { + "id": "openai/GPT OSS 120B", + "name": "GPT OSS 120B", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Overall Pass@1": 0.047, + "apex-agents/Overall Pass@8": 0.115, + "apex-agents/Overall Mean Score": 0.145, + "apex-agents/Investment Banking Pass@1": 0.027, + "apex-agents/Management Consulting Pass@1": 0.035, + "apex-agents/Corporate Law Pass@1": 0.078, + "apex-agents/Corporate Lawyer Mean Score": 0.269 + } + }, + { + "id": "openai/GPT-J-6B", + "name": "GPT-J 6B", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.273, + "helm_classic/MMLU": 0.249, + "helm_classic/BoolQ": 0.649, + "helm_classic/NarrativeQA": 0.545, + "helm_classic/NaturalQuestions (open-book)": 0.559, + "helm_classic/QuAC": 0.33, + "helm_classic/HellaSwag": 0.663, + "helm_classic/OpenbookQA": 0.514, + "helm_classic/TruthfulQA": 0.199, + "helm_classic/MS MARCO (TREC)": 0.345, + "helm_classic/CNN/DailyMail": 0.131, + "helm_classic/XSUM": 0.096, + "helm_classic/IMDB": 0.939, + "helm_classic/CivilComments": 0.52, + "helm_classic/RAFT": 0.619 + } + }, + { + "id": "openai/GPT-NeoX-20B", + "name": "GPT-NeoX 20B", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.351, + "helm_classic/MMLU": 0.276, + "helm_classic/BoolQ": 0.683, + "helm_classic/NarrativeQA": 0.599, + "helm_classic/NaturalQuestions (open-book)": 0.596, + "helm_classic/QuAC": 0.326, + "helm_classic/HellaSwag": 0.718, + "helm_classic/OpenbookQA": 0.524, + "helm_classic/TruthfulQA": 0.216, + "helm_classic/MS MARCO (TREC)": 0.398, + "helm_classic/CNN/DailyMail": 0.123, + "helm_classic/XSUM": 0.102, + "helm_classic/IMDB": 0.948, + "helm_classic/CivilComments": 0.516, + "helm_classic/RAFT": 0.505 + } + }, + { + "id": "openai/ada-350M", + "name": "ada 350M", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.108, + "helm_classic/MMLU": 0.243, + "helm_classic/BoolQ": 0.581, + "helm_classic/NarrativeQA": 0.326, + "helm_classic/NaturalQuestions (open-book)": 0.365, + "helm_classic/QuAC": 0.242, + "helm_classic/HellaSwag": 0.435, + "helm_classic/OpenbookQA": 0.38, + "helm_classic/TruthfulQA": 0.215, + "helm_classic/MS MARCO (TREC)": 0.29, + "helm_classic/CNN/DailyMail": 0.09, + "helm_classic/XSUM": 0.022, + "helm_classic/IMDB": 0.849, + "helm_classic/CivilComments": 0.517, + "helm_classic/RAFT": 0.423 + } + }, + { + "id": "openai/babbage-1.3B", + "name": "babbage 1.3B", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.114, + "helm_classic/MMLU": 0.235, + "helm_classic/BoolQ": 0.574, + "helm_classic/NarrativeQA": 0.491, + "helm_classic/NaturalQuestions (open-book)": 0.451, + "helm_classic/QuAC": 0.273, + "helm_classic/HellaSwag": 0.555, + "helm_classic/OpenbookQA": 0.438, + "helm_classic/TruthfulQA": 0.188, + "helm_classic/MS MARCO (TREC)": 0.317, + "helm_classic/CNN/DailyMail": 0.079, + "helm_classic/XSUM": 0.045, + "helm_classic/IMDB": 0.597, + "helm_classic/CivilComments": 0.519, + "helm_classic/RAFT": 0.455 + } + }, + { + "id": "openai/curie-6.7B", + "name": "curie 6.7B", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.247, + "helm_classic/MMLU": 0.243, + "helm_classic/BoolQ": 0.656, + "helm_classic/NarrativeQA": 0.604, + "helm_classic/NaturalQuestions (open-book)": 0.552, + "helm_classic/QuAC": 0.321, + "helm_classic/HellaSwag": 0.682, + "helm_classic/OpenbookQA": 0.502, + "helm_classic/TruthfulQA": 0.232, + "helm_classic/MS MARCO (TREC)": 0.3, + "helm_classic/CNN/DailyMail": 0.113, + "helm_classic/XSUM": 0.091, + "helm_classic/IMDB": 0.889, + "helm_classic/CivilComments": 0.539, + "helm_classic/RAFT": 0.49 + } + }, + { + "id": "openai/davinci-175B", + "name": "davinci 175B", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.538, + "helm_classic/MMLU": 0.422, + "helm_classic/BoolQ": 0.722, + "helm_classic/NarrativeQA": 0.687, + "helm_classic/NaturalQuestions (open-book)": 0.625, + "helm_classic/QuAC": 0.36, + "helm_classic/HellaSwag": 0.775, + "helm_classic/OpenbookQA": 0.586, + "helm_classic/TruthfulQA": 0.194, + "helm_classic/MS MARCO (TREC)": 0.378, + "helm_classic/CNN/DailyMail": 0.127, + "helm_classic/XSUM": 0.126, + "helm_classic/IMDB": 0.933, + "helm_classic/CivilComments": 0.532, + "helm_classic/RAFT": 0.642 + } + }, + { + "id": "openai/gpt-3.5-turbo-0125", + "name": "GPT-3.5 Turbo 0125", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_mmlu/MMLU All Subjects": 0.673, + "helm_mmlu/Abstract Algebra": 0.31, + "helm_mmlu/Anatomy": 0.696, + "helm_mmlu/College Physics": 0.471, + "helm_mmlu/Computer Security": 0.78, + "helm_mmlu/Econometrics": 0.474, + "helm_mmlu/Global Facts": 0.39, + "helm_mmlu/Jurisprudence": 0.806, + "helm_mmlu/Philosophy": 0.746, + "helm_mmlu/Professional Psychology": 0.722, + "helm_mmlu/Us Foreign Policy": 0.89, + "helm_mmlu/Astronomy": 0.75, + "helm_mmlu/Business Ethics": 0.75, + "helm_mmlu/Clinical Knowledge": 0.755, + "helm_mmlu/Conceptual Physics": 0.634, + "helm_mmlu/Electrical Engineering": 0.669, + "helm_mmlu/Elementary Mathematics": 0.534, + "helm_mmlu/Formal Logic": 0.444, + "helm_mmlu/High School World History": 0.819, + "helm_mmlu/Human Sexuality": 0.779, + "helm_mmlu/International Law": 0.81, + "helm_mmlu/Logical Fallacies": 0.779, + "helm_mmlu/Machine Learning": 0.455, + "helm_mmlu/Management": 0.835, + "helm_mmlu/Marketing": 0.91, + "helm_mmlu/Medical Genetics": 0.73, + "helm_mmlu/Miscellaneous": 0.89, + "helm_mmlu/Moral Scenarios": 0.355, + "helm_mmlu/Nutrition": 0.748, + "helm_mmlu/Prehistory": 0.735, + "helm_mmlu/Public Relations": 0.727, + "helm_mmlu/Security Studies": 0.751, + "helm_mmlu/Sociology": 0.861, + "helm_mmlu/Virology": 0.536, + "helm_mmlu/World Religions": 0.842, + "helm_mmlu/Mean win rate": 0.493, + "reward-bench/Score": 0.6534, + "reward-bench/Chat": 0.9218, + "reward-bench/Chat Hard": 0.4452, + "reward-bench/Safety": 0.6547, + "reward-bench/Reasoning": 0.5912, + "reward-bench/Prior Sets (0.5 weight)": 0.6548 + } + }, + { + "id": "openai/gpt-3.5-turbo-0301", + "name": "gpt-3.5-turbo-0301", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.76, + "helm_classic/MMLU": 0.59, + "helm_classic/BoolQ": 0.74, + "helm_classic/NarrativeQA": 0.663, + "helm_classic/NaturalQuestions (open-book)": 0.624, + "helm_classic/QuAC": 0.512, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.609, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.899, + "helm_classic/CivilComments": 0.674, + "helm_classic/RAFT": 0.768 + } + }, + { + "id": "openai/gpt-3.5-turbo-0613", + "name": "gpt-3.5-turbo-0613", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.783, + "helm_classic/MMLU": 0.391, + "helm_classic/BoolQ": 0.87, + "helm_classic/NarrativeQA": 0.625, + "helm_classic/NaturalQuestions (open-book)": 0.675, + "helm_classic/QuAC": 0.485, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.339, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.943, + "helm_classic/CivilComments": 0.696, + "helm_classic/RAFT": 0.748, + "helm_instruct/Mean win rate": 0.689, + "helm_instruct/Anthropic RLHF dataset": 4.964, + "helm_instruct/Best ChatGPT Prompts": 4.986, + "helm_instruct/Koala test dataset": 4.987, + "helm_instruct/Open Assistant": 4.987, + "helm_instruct/Self Instruct": 4.99, + "helm_instruct/Vicuna": 4.992, + "helm_lite/Mean win rate": 0.358, + "helm_lite/NarrativeQA": 0.655, + "helm_lite/NaturalQuestions (closed-book)": 0.335, + "helm_lite/OpenbookQA": 0.838, + "helm_lite/MMLU": 0.614, + "helm_lite/MATH": 0.667, + "helm_lite/GSM8K": 0.501, + "helm_lite/LegalBench": 0.528, + "helm_lite/MedQA": 0.622, + "helm_lite/WMT 2014": 0.187, + "helm_mmlu/MMLU All Subjects": 0.689, + "helm_mmlu/Abstract Algebra": 0.38, + "helm_mmlu/Anatomy": 0.659, + "helm_mmlu/College Physics": 0.461, + "helm_mmlu/Computer Security": 0.81, + "helm_mmlu/Econometrics": 0.5, + "helm_mmlu/Global Facts": 0.37, + "helm_mmlu/Jurisprudence": 0.806, + "helm_mmlu/Philosophy": 0.759, + "helm_mmlu/Professional Psychology": 0.732, + "helm_mmlu/Us Foreign Policy": 0.88, + "helm_mmlu/Astronomy": 0.763, + "helm_mmlu/Business Ethics": 0.75, + "helm_mmlu/Clinical Knowledge": 0.777, + "helm_mmlu/Conceptual Physics": 0.613, + "helm_mmlu/Electrical Engineering": 0.648, + "helm_mmlu/Elementary Mathematics": 0.5, + "helm_mmlu/Formal Logic": 0.397, + "helm_mmlu/High School World History": 0.857, + "helm_mmlu/Human Sexuality": 0.786, + "helm_mmlu/International Law": 0.843, + "helm_mmlu/Logical Fallacies": 0.791, + "helm_mmlu/Machine Learning": 0.455, + "helm_mmlu/Management": 0.845, + "helm_mmlu/Marketing": 0.91, + "helm_mmlu/Medical Genetics": 0.8, + "helm_mmlu/Miscellaneous": 0.893, + "helm_mmlu/Moral Scenarios": 0.404, + "helm_mmlu/Nutrition": 0.758, + "helm_mmlu/Prehistory": 0.787, + "helm_mmlu/Public Relations": 0.745, + "helm_mmlu/Security Studies": 0.8, + "helm_mmlu/Sociology": 0.871, + "helm_mmlu/Virology": 0.542, + "helm_mmlu/World Religions": 0.836, + "helm_mmlu/Mean win rate": 0.589 + } + }, + { + "id": "openai/gpt-4-0125-preview", + "name": "openai/gpt-4-0125-preview", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8434, + "reward-bench/Chat": 0.9525, + "reward-bench/Chat Hard": 0.7434, + "reward-bench/Safety": 0.8757, + "reward-bench/Reasoning": 0.8692, + "reward-bench/Prior Sets (0.5 weight)": 0.7085 + } + }, + { + "id": "openai/gpt-4-0314", + "name": "GPT-4 0314", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_instruct/Mean win rate": 0.611, + "helm_instruct/Anthropic RLHF dataset": 4.934, + "helm_instruct/Best ChatGPT Prompts": 4.973, + "helm_instruct/Koala test dataset": 4.966, + "helm_instruct/Open Assistant": 4.986, + "helm_instruct/Self Instruct": 4.976, + "helm_instruct/Vicuna": 4.995 + } + }, + { + "id": "openai/gpt-4-0613", + "name": "GPT-4 0613", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.867, + "helm_lite/NarrativeQA": 0.768, + "helm_lite/NaturalQuestions (closed-book)": 0.457, + "helm_lite/OpenbookQA": 0.96, + "helm_lite/MMLU": 0.735, + "helm_lite/MATH": 0.802, + "helm_lite/GSM8K": 0.932, + "helm_lite/LegalBench": 0.713, + "helm_lite/MedQA": 0.815, + "helm_lite/WMT 2014": 0.211, + "helm_mmlu/MMLU All Subjects": 0.824, + "helm_mmlu/Abstract Algebra": 0.63, + "helm_mmlu/Anatomy": 0.8, + "helm_mmlu/College Physics": 0.627, + "helm_mmlu/Computer Security": 0.86, + "helm_mmlu/Econometrics": 0.684, + "helm_mmlu/Global Facts": 0.62, + "helm_mmlu/Jurisprudence": 0.889, + "helm_mmlu/Philosophy": 0.859, + "helm_mmlu/Professional Psychology": 0.891, + "helm_mmlu/Us Foreign Policy": 0.95, + "helm_mmlu/Astronomy": 0.934, + "helm_mmlu/Business Ethics": 0.79, + "helm_mmlu/Clinical Knowledge": 0.845, + "helm_mmlu/Conceptual Physics": 0.868, + "helm_mmlu/Electrical Engineering": 0.786, + "helm_mmlu/Elementary Mathematics": 0.807, + "helm_mmlu/Formal Logic": 0.643, + "helm_mmlu/High School World History": 0.945, + "helm_mmlu/Human Sexuality": 0.908, + "helm_mmlu/International Law": 0.917, + "helm_mmlu/Logical Fallacies": 0.871, + "helm_mmlu/Machine Learning": 0.759, + "helm_mmlu/Management": 0.932, + "helm_mmlu/Marketing": 0.962, + "helm_mmlu/Medical Genetics": 0.94, + "helm_mmlu/Miscellaneous": 0.949, + "helm_mmlu/Moral Scenarios": 0.902, + "helm_mmlu/Nutrition": 0.892, + "helm_mmlu/Prehistory": 0.926, + "helm_mmlu/Public Relations": 0.745, + "helm_mmlu/Security Studies": 0.861, + "helm_mmlu/Sociology": 0.93, + "helm_mmlu/Virology": 0.596, + "helm_mmlu/World Religions": 0.877, + "helm_mmlu/Mean win rate": 0.517 + } + }, + { + "id": "openai/gpt-4-1106-preview", + "name": "GPT-4 Turbo 1106 preview", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.698, + "helm_lite/NarrativeQA": 0.727, + "helm_lite/NaturalQuestions (closed-book)": 0.435, + "helm_lite/OpenbookQA": 0.95, + "helm_lite/MMLU": 0.699, + "helm_lite/MATH": 0.857, + "helm_lite/GSM8K": 0.668, + "helm_lite/LegalBench": 0.626, + "helm_lite/MedQA": 0.817, + "helm_lite/WMT 2014": 0.205, + "helm_mmlu/MMLU All Subjects": 0.796, + "helm_mmlu/Abstract Algebra": 0.53, + "helm_mmlu/Anatomy": 0.807, + "helm_mmlu/College Physics": 0.402, + "helm_mmlu/Computer Security": 0.86, + "helm_mmlu/Econometrics": 0.675, + "helm_mmlu/Global Facts": 0.58, + "helm_mmlu/Jurisprudence": 0.889, + "helm_mmlu/Philosophy": 0.852, + "helm_mmlu/Professional Psychology": 0.887, + "helm_mmlu/Us Foreign Policy": 0.96, + "helm_mmlu/Astronomy": 0.941, + "helm_mmlu/Business Ethics": 0.78, + "helm_mmlu/Clinical Knowledge": 0.864, + "helm_mmlu/Conceptual Physics": 0.894, + "helm_mmlu/Electrical Engineering": 0.772, + "helm_mmlu/Elementary Mathematics": 0.638, + "helm_mmlu/Formal Logic": 0.651, + "helm_mmlu/High School World History": 0.958, + "helm_mmlu/Human Sexuality": 0.908, + "helm_mmlu/International Law": 0.926, + "helm_mmlu/Logical Fallacies": 0.865, + "helm_mmlu/Machine Learning": 0.723, + "helm_mmlu/Management": 0.913, + "helm_mmlu/Marketing": 0.932, + "helm_mmlu/Medical Genetics": 0.93, + "helm_mmlu/Miscellaneous": 0.946, + "helm_mmlu/Moral Scenarios": 0.816, + "helm_mmlu/Nutrition": 0.879, + "helm_mmlu/Prehistory": 0.917, + "helm_mmlu/Public Relations": 0.782, + "helm_mmlu/Security Studies": 0.841, + "helm_mmlu/Sociology": 0.925, + "helm_mmlu/Virology": 0.59, + "helm_mmlu/World Religions": 0.854, + "helm_mmlu/Mean win rate": 0.416 + } + }, + { + "id": "openai/gpt-4-turbo-2024-04-09", + "name": "GPT-4 Turbo 2024-04-09", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.864, + "helm_lite/NarrativeQA": 0.761, + "helm_lite/NaturalQuestions (closed-book)": 0.482, + "helm_lite/OpenbookQA": 0.97, + "helm_lite/MMLU": 0.711, + "helm_lite/MATH": 0.833, + "helm_lite/GSM8K": 0.824, + "helm_lite/LegalBench": 0.727, + "helm_lite/MedQA": 0.783, + "helm_lite/WMT 2014": 0.218, + "helm_mmlu/MMLU All Subjects": 0.813, + "helm_mmlu/Abstract Algebra": 0.56, + "helm_mmlu/Anatomy": 0.822, + "helm_mmlu/College Physics": 0.539, + "helm_mmlu/Computer Security": 0.83, + "helm_mmlu/Econometrics": 0.675, + "helm_mmlu/Global Facts": 0.58, + "helm_mmlu/Jurisprudence": 0.88, + "helm_mmlu/Philosophy": 0.868, + "helm_mmlu/Professional Psychology": 0.873, + "helm_mmlu/Us Foreign Policy": 0.96, + "helm_mmlu/Astronomy": 0.941, + "helm_mmlu/Business Ethics": 0.82, + "helm_mmlu/Clinical Knowledge": 0.83, + "helm_mmlu/Conceptual Physics": 0.894, + "helm_mmlu/Electrical Engineering": 0.752, + "helm_mmlu/Elementary Mathematics": 0.72, + "helm_mmlu/Formal Logic": 0.706, + "helm_mmlu/High School World History": 0.941, + "helm_mmlu/Human Sexuality": 0.901, + "helm_mmlu/International Law": 0.942, + "helm_mmlu/Logical Fallacies": 0.871, + "helm_mmlu/Machine Learning": 0.741, + "helm_mmlu/Management": 0.883, + "helm_mmlu/Marketing": 0.949, + "helm_mmlu/Medical Genetics": 0.92, + "helm_mmlu/Miscellaneous": 0.945, + "helm_mmlu/Moral Scenarios": 0.803, + "helm_mmlu/Nutrition": 0.892, + "helm_mmlu/Prehistory": 0.92, + "helm_mmlu/Public Relations": 0.755, + "helm_mmlu/Security Studies": 0.8, + "helm_mmlu/Sociology": 0.915, + "helm_mmlu/Virology": 0.602, + "helm_mmlu/World Religions": 0.848, + "helm_mmlu/Mean win rate": 0.351, + "reward-bench/Score": 0.8395, + "reward-bench/Chat": 0.9525, + "reward-bench/Chat Hard": 0.7544, + "reward-bench/Safety": 0.8757, + "reward-bench/Reasoning": 0.827, + "reward-bench/Prior Sets (0.5 weight)": 0.7363 + } + }, + { + "id": "openai/gpt-4.1-2025-04-14", + "name": "gpt-4.1-2025-04-14", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.8755, + "global-mmlu-lite/Culturally Sensitive": 0.8541, + "global-mmlu-lite/Culturally Agnostic": 0.8969, + "global-mmlu-lite/Arabic": 0.88, + "global-mmlu-lite/English": 0.8825, + "global-mmlu-lite/Bengali": 0.8625, + "global-mmlu-lite/German": 0.875, + "global-mmlu-lite/French": 0.8875, + "global-mmlu-lite/Hindi": 0.8775, + "global-mmlu-lite/Indonesian": 0.885, + "global-mmlu-lite/Italian": 0.88, + "global-mmlu-lite/Japanese": 0.8725, + "global-mmlu-lite/Korean": 0.87, + "global-mmlu-lite/Portuguese": 0.875, + "global-mmlu-lite/Spanish": 0.885, + "global-mmlu-lite/Swahili": 0.8725, + "global-mmlu-lite/Yoruba": 0.875, + "global-mmlu-lite/Chinese": 0.87, + "global-mmlu-lite/Burmese": 0.8575, + "helm_capabilities/Mean score": 0.727, + "helm_capabilities/MMLU-Pro": 0.811, + "helm_capabilities/GPQA": 0.659, + "helm_capabilities/IFEval": 0.838, + "helm_capabilities/WildBench": 0.854, + "helm_capabilities/Omni-MATH": 0.471, + "reward-bench/Score": 0.7232, + "reward-bench/Factuality": 0.8289, + "reward-bench/Precise IF": 0.3974, + "reward-bench/Math": 0.6521, + "reward-bench/Safety": 0.8726, + "reward-bench/Focus": 0.7338, + "reward-bench/Ties": 0.8542 + } + }, + { + "id": "openai/gpt-4.1-mini-2025-04-14", + "name": "GPT-4.1 mini 2025-04-14", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.726, + "helm_capabilities/MMLU-Pro": 0.783, + "helm_capabilities/GPQA": 0.614, + "helm_capabilities/IFEval": 0.904, + "helm_capabilities/WildBench": 0.838, + "helm_capabilities/Omni-MATH": 0.491, + "reward-bench/Score": 0.6573, + "reward-bench/Factuality": 0.6084, + "reward-bench/Precise IF": 0.4125, + "reward-bench/Math": 0.7213, + "reward-bench/Safety": 0.7265, + "reward-bench/Focus": 0.7354, + "reward-bench/Ties": 0.74 + } + }, + { + "id": "openai/gpt-4.1-nano-2025-04-14", + "name": "GPT-4.1 nano 2025-04-14", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.616, + "helm_capabilities/MMLU-Pro": 0.55, + "helm_capabilities/GPQA": 0.507, + "helm_capabilities/IFEval": 0.843, + "helm_capabilities/WildBench": 0.811, + "helm_capabilities/Omni-MATH": 0.367, + "reward-bench/Score": 0.4849, + "reward-bench/Factuality": 0.4646, + "reward-bench/Precise IF": 0.2578, + "reward-bench/Math": 0.5041, + "reward-bench/Safety": 0.7156, + "reward-bench/Focus": 0.466, + "reward-bench/Ties": 0.5015 + } + }, + { + "id": "openai/gpt-4o-2024-05-13", + "name": "GPT-4o 2024-05-13", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.938, + "helm_lite/NarrativeQA": 0.804, + "helm_lite/NaturalQuestions (closed-book)": 0.501, + "helm_lite/OpenbookQA": 0.966, + "helm_lite/MMLU": 0.748, + "helm_lite/MATH": 0.829, + "helm_lite/GSM8K": 0.905, + "helm_lite/LegalBench": 0.733, + "helm_lite/MedQA": 0.857, + "helm_lite/WMT 2014": 0.231, + "helm_mmlu/MMLU All Subjects": 0.842, + "helm_mmlu/Abstract Algebra": 0.66, + "helm_mmlu/Anatomy": 0.911, + "helm_mmlu/College Physics": 0.686, + "helm_mmlu/Computer Security": 0.85, + "helm_mmlu/Econometrics": 0.693, + "helm_mmlu/Global Facts": 0.64, + "helm_mmlu/Jurisprudence": 0.898, + "helm_mmlu/Philosophy": 0.9, + "helm_mmlu/Professional Psychology": 0.905, + "helm_mmlu/Us Foreign Policy": 0.96, + "helm_mmlu/Astronomy": 0.941, + "helm_mmlu/Business Ethics": 0.85, + "helm_mmlu/Clinical Knowledge": 0.894, + "helm_mmlu/Conceptual Physics": 0.911, + "helm_mmlu/Electrical Engineering": 0.807, + "helm_mmlu/Elementary Mathematics": 0.741, + "helm_mmlu/Formal Logic": 0.683, + "helm_mmlu/High School World History": 0.945, + "helm_mmlu/Human Sexuality": 0.908, + "helm_mmlu/International Law": 0.934, + "helm_mmlu/Logical Fallacies": 0.883, + "helm_mmlu/Machine Learning": 0.768, + "helm_mmlu/Management": 0.942, + "helm_mmlu/Marketing": 0.936, + "helm_mmlu/Medical Genetics": 0.96, + "helm_mmlu/Miscellaneous": 0.954, + "helm_mmlu/Moral Scenarios": 0.841, + "helm_mmlu/Nutrition": 0.899, + "helm_mmlu/Prehistory": 0.938, + "helm_mmlu/Public Relations": 0.809, + "helm_mmlu/Security Studies": 0.837, + "helm_mmlu/Sociology": 0.94, + "helm_mmlu/Virology": 0.596, + "helm_mmlu/World Religions": 0.889, + "helm_mmlu/Mean win rate": 0.671, + "reward-bench/Score": 0.8327, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.7039, + "reward-bench/Safety": 0.8649, + "reward-bench/Reasoning": 0.8487, + "reward-bench/Prior Sets (0.5 weight)": 0.7262 + } + }, + { + "id": "openai/gpt-4o-2024-08-06", + "name": "GPT-4o 2024-08-06", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.928, + "helm_lite/NarrativeQA": 0.795, + "helm_lite/NaturalQuestions (closed-book)": 0.496, + "helm_lite/OpenbookQA": 0.968, + "helm_lite/MMLU": 0.738, + "helm_lite/MATH": 0.853, + "helm_lite/GSM8K": 0.909, + "helm_lite/LegalBench": 0.721, + "helm_lite/MedQA": 0.863, + "helm_lite/WMT 2014": 0.225, + "helm_mmlu/MMLU All Subjects": 0.843, + "helm_mmlu/Abstract Algebra": 0.58, + "helm_mmlu/Anatomy": 0.911, + "helm_mmlu/College Physics": 0.686, + "helm_mmlu/Computer Security": 0.85, + "helm_mmlu/Econometrics": 0.711, + "helm_mmlu/Global Facts": 0.69, + "helm_mmlu/Jurisprudence": 0.907, + "helm_mmlu/Philosophy": 0.894, + "helm_mmlu/Professional Psychology": 0.899, + "helm_mmlu/Us Foreign Policy": 0.95, + "helm_mmlu/Astronomy": 0.947, + "helm_mmlu/Business Ethics": 0.89, + "helm_mmlu/Clinical Knowledge": 0.894, + "helm_mmlu/Conceptual Physics": 0.923, + "helm_mmlu/Electrical Engineering": 0.793, + "helm_mmlu/Elementary Mathematics": 0.775, + "helm_mmlu/Formal Logic": 0.675, + "helm_mmlu/High School World History": 0.941, + "helm_mmlu/Human Sexuality": 0.901, + "helm_mmlu/International Law": 0.942, + "helm_mmlu/Logical Fallacies": 0.902, + "helm_mmlu/Machine Learning": 0.777, + "helm_mmlu/Management": 0.913, + "helm_mmlu/Marketing": 0.94, + "helm_mmlu/Medical Genetics": 0.98, + "helm_mmlu/Miscellaneous": 0.958, + "helm_mmlu/Moral Scenarios": 0.802, + "helm_mmlu/Nutrition": 0.905, + "helm_mmlu/Prehistory": 0.935, + "helm_mmlu/Public Relations": 0.782, + "helm_mmlu/Security Studies": 0.833, + "helm_mmlu/Sociology": 0.945, + "helm_mmlu/Virology": 0.578, + "helm_mmlu/World Religions": 0.883, + "helm_mmlu/Mean win rate": 0.52, + "reward-bench/Score": 0.8673, + "reward-bench/Factuality": 0.5684, + "reward-bench/Precise IF": 0.3312, + "reward-bench/Math": 0.623, + "reward-bench/Safety": 0.8811, + "reward-bench/Focus": 0.7293, + "reward-bench/Ties": 0.7819, + "reward-bench/Chat": 0.9609, + "reward-bench/Chat Hard": 0.761, + "reward-bench/Reasoning": 0.8661 + } + }, + { + "id": "openai/gpt-4o-2024-11-20", + "name": "GPT-4o 2024-11-20", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.634, + "helm_capabilities/MMLU-Pro": 0.713, + "helm_capabilities/GPQA": 0.52, + "helm_capabilities/IFEval": 0.817, + "helm_capabilities/WildBench": 0.828, + "helm_capabilities/Omni-MATH": 0.293, + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.0, + "livecodebenchpro/Easy Problems": 0.07042253521126761 + } + }, + { + "id": "openai/gpt-4o-mini-2024-07-18", + "name": "GPT-4o mini 2024-07-18", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.565, + "helm_capabilities/MMLU-Pro": 0.603, + "helm_capabilities/GPQA": 0.368, + "helm_capabilities/IFEval": 0.782, + "helm_capabilities/WildBench": 0.791, + "helm_capabilities/Omni-MATH": 0.28, + "helm_lite/Mean win rate": 0.701, + "helm_lite/NarrativeQA": 0.768, + "helm_lite/NaturalQuestions (closed-book)": 0.386, + "helm_lite/OpenbookQA": 0.92, + "helm_lite/MMLU": 0.668, + "helm_lite/MATH": 0.802, + "helm_lite/GSM8K": 0.843, + "helm_lite/LegalBench": 0.653, + "helm_lite/MedQA": 0.748, + "helm_lite/WMT 2014": 0.206, + "helm_mmlu/MMLU All Subjects": 0.767, + "helm_mmlu/Abstract Algebra": 0.42, + "helm_mmlu/Anatomy": 0.77, + "helm_mmlu/College Physics": 0.559, + "helm_mmlu/Computer Security": 0.85, + "helm_mmlu/Econometrics": 0.649, + "helm_mmlu/Global Facts": 0.45, + "helm_mmlu/Jurisprudence": 0.87, + "helm_mmlu/Philosophy": 0.772, + "helm_mmlu/Professional Psychology": 0.833, + "helm_mmlu/Us Foreign Policy": 0.91, + "helm_mmlu/Astronomy": 0.849, + "helm_mmlu/Business Ethics": 0.79, + "helm_mmlu/Clinical Knowledge": 0.845, + "helm_mmlu/Conceptual Physics": 0.791, + "helm_mmlu/Electrical Engineering": 0.731, + "helm_mmlu/Elementary Mathematics": 0.651, + "helm_mmlu/Formal Logic": 0.556, + "helm_mmlu/High School World History": 0.903, + "helm_mmlu/Human Sexuality": 0.863, + "helm_mmlu/International Law": 0.926, + "helm_mmlu/Logical Fallacies": 0.871, + "helm_mmlu/Machine Learning": 0.616, + "helm_mmlu/Management": 0.845, + "helm_mmlu/Marketing": 0.927, + "helm_mmlu/Medical Genetics": 0.89, + "helm_mmlu/Miscellaneous": 0.913, + "helm_mmlu/Moral Scenarios": 0.485, + "helm_mmlu/Nutrition": 0.827, + "helm_mmlu/Prehistory": 0.833, + "helm_mmlu/Public Relations": 0.791, + "helm_mmlu/Security Studies": 0.788, + "helm_mmlu/Sociology": 0.9, + "helm_mmlu/Virology": 0.536, + "helm_mmlu/World Religions": 0.86, + "helm_mmlu/Mean win rate": 0.774, + "reward-bench/Score": 0.8007, + "reward-bench/Factuality": 0.4105, + "reward-bench/Precise IF": 0.3438, + "reward-bench/Math": 0.5191, + "reward-bench/Safety": 0.8081, + "reward-bench/Focus": 0.7414, + "reward-bench/Ties": 0.6962, + "reward-bench/Chat": 0.9497, + "reward-bench/Chat Hard": 0.6075, + "reward-bench/Reasoning": 0.8374 + } + }, + { + "id": "openai/gpt-5-2025-08-07", + "name": "gpt-5-2025-08-07", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.8895, + "global-mmlu-lite/Culturally Sensitive": 0.8913, + "global-mmlu-lite/Culturally Agnostic": 0.8878, + "global-mmlu-lite/Arabic": 0.8925, + "global-mmlu-lite/English": 0.8725, + "global-mmlu-lite/Bengali": 0.9, + "global-mmlu-lite/German": 0.91, + "global-mmlu-lite/French": 0.9075, + "global-mmlu-lite/Hindi": 0.865, + "global-mmlu-lite/Indonesian": 0.795, + "global-mmlu-lite/Italian": 0.9075, + "global-mmlu-lite/Japanese": 0.8875, + "global-mmlu-lite/Korean": 0.915, + "global-mmlu-lite/Portuguese": 0.8875, + "global-mmlu-lite/Spanish": 0.905, + "global-mmlu-lite/Swahili": 0.865, + "global-mmlu-lite/Yoruba": 0.9125, + "global-mmlu-lite/Chinese": 0.895, + "global-mmlu-lite/Burmese": 0.915, + "helm_capabilities/Mean score": 0.807, + "helm_capabilities/MMLU-Pro": 0.863, + "helm_capabilities/GPQA": 0.791, + "helm_capabilities/IFEval": 0.875, + "helm_capabilities/WildBench": 0.857, + "helm_capabilities/Omni-MATH": 0.647, + "livecodebenchpro/Hard Problems": 0.04225352112676056, + "livecodebenchpro/Medium Problems": 0.4084507042253521, + "livecodebenchpro/Easy Problems": 0.8873239436619719 + } + }, + { + "id": "openai/gpt-5-mini-2025-08-07", + "name": "GPT-5 mini 2025-08-07", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.819, + "helm_capabilities/MMLU-Pro": 0.835, + "helm_capabilities/GPQA": 0.756, + "helm_capabilities/IFEval": 0.927, + "helm_capabilities/WildBench": 0.855, + "helm_capabilities/Omni-MATH": 0.722 + } + }, + { + "id": "openai/gpt-5-nano-2025-08-07", + "name": "GPT-5 nano 2025-08-07", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.748, + "helm_capabilities/MMLU-Pro": 0.778, + "helm_capabilities/GPQA": 0.679, + "helm_capabilities/IFEval": 0.932, + "helm_capabilities/WildBench": 0.806, + "helm_capabilities/Omni-MATH": 0.547 + } + }, + { + "id": "openai/gpt-oss-120b", + "name": "gpt-oss-120b", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.77, + "helm_capabilities/MMLU-Pro": 0.795, + "helm_capabilities/GPQA": 0.684, + "helm_capabilities/IFEval": 0.836, + "helm_capabilities/WildBench": 0.845, + "helm_capabilities/Omni-MATH": 0.688, + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.11267605633802817, + "livecodebenchpro/Easy Problems": 0.6619718309859155, + "terminal-bench-2.0/terminal-bench-2.0": 18.7 + } + }, + { + "id": "openai/gpt-oss-20b", + "name": "gpt-oss-20b", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.674, + "helm_capabilities/MMLU-Pro": 0.74, + "helm_capabilities/GPQA": 0.594, + "helm_capabilities/IFEval": 0.732, + "helm_capabilities/WildBench": 0.737, + "helm_capabilities/Omni-MATH": 0.565, + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.056338028169014086, + "livecodebenchpro/Easy Problems": 0.5070422535211268, + "terminal-bench-2.0/terminal-bench-2.0": 3.4 + } + }, + { + "id": "openai/o3", + "name": "o3", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "ace/Overall Score": 0.529, + "ace/Gaming Score": 0.585, + "ace/Shopping Score": 0.45, + "apex-v1/Big Law Score": 0.76 + } + }, + { + "id": "openai/o3 Pro", + "name": "o3 Pro", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "ace/Overall Score": 0.552, + "ace/DIY Score": 0.54, + "ace/Food Score": 0.6, + "ace/Gaming Score": 0.613, + "ace/Shopping Score": 0.45 + } + }, + { + "id": "openai/o3-2025-04-16", + "name": "o3 2025-04-16", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.811, + "helm_capabilities/MMLU-Pro": 0.859, + "helm_capabilities/GPQA": 0.753, + "helm_capabilities/IFEval": 0.869, + "helm_capabilities/WildBench": 0.861, + "helm_capabilities/Omni-MATH": 0.714, + "livecodebenchpro/Hard Problems": 0.0, + "livecodebenchpro/Medium Problems": 0.22535211267605634, + "livecodebenchpro/Easy Problems": 0.7183098591549296 + } + }, + { + "id": "openai/o3-mini-2025-01-31", + "name": "o3-mini-2025-01-31", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.78, + "global-mmlu-lite/Culturally Sensitive": 0.765, + "global-mmlu-lite/Culturally Agnostic": 0.795, + "global-mmlu-lite/Arabic": 0.7725, + "global-mmlu-lite/English": 0.8025, + "global-mmlu-lite/Bengali": 0.77, + "global-mmlu-lite/German": 0.7525, + "global-mmlu-lite/French": 0.74, + "global-mmlu-lite/Hindi": 0.7525, + "global-mmlu-lite/Indonesian": 0.7425, + "global-mmlu-lite/Italian": 0.8, + "global-mmlu-lite/Japanese": 0.81, + "global-mmlu-lite/Korean": 0.8075, + "global-mmlu-lite/Portuguese": 0.7975, + "global-mmlu-lite/Spanish": 0.775, + "global-mmlu-lite/Swahili": 0.765, + "global-mmlu-lite/Yoruba": 0.7725, + "global-mmlu-lite/Chinese": 0.8125, + "global-mmlu-lite/Burmese": 0.8075 + } + }, + { + "id": "openai/o4-mini-2025-04-16", + "name": "o4-mini-2025-04-16", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.8705, + "global-mmlu-lite/Culturally Sensitive": 0.8503, + "global-mmlu-lite/Culturally Agnostic": 0.8906, + "global-mmlu-lite/Arabic": 0.865, + "global-mmlu-lite/English": 0.8675, + "global-mmlu-lite/Bengali": 0.8875, + "global-mmlu-lite/German": 0.8775, + "global-mmlu-lite/French": 0.87, + "global-mmlu-lite/Hindi": 0.87, + "global-mmlu-lite/Indonesian": 0.8675, + "global-mmlu-lite/Italian": 0.855, + "global-mmlu-lite/Japanese": 0.885, + "global-mmlu-lite/Korean": 0.88, + "global-mmlu-lite/Portuguese": 0.88, + "global-mmlu-lite/Spanish": 0.855, + "global-mmlu-lite/Swahili": 0.8525, + "global-mmlu-lite/Yoruba": 0.8525, + "global-mmlu-lite/Chinese": 0.89, + "global-mmlu-lite/Burmese": 0.8725, + "helm_capabilities/Mean score": 0.812, + "helm_capabilities/MMLU-Pro": 0.82, + "helm_capabilities/GPQA": 0.735, + "helm_capabilities/IFEval": 0.929, + "helm_capabilities/WildBench": 0.854, + "helm_capabilities/Omni-MATH": 0.72, + "livecodebenchpro/Hard Problems": 0.0143, + "livecodebenchpro/Medium Problems": 0.2923, + "livecodebenchpro/Easy Problems": 0.8571 + } + }, + { + "id": "openai/text-ada-001", + "name": "text-ada-001", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.107, + "helm_classic/MMLU": 0.238, + "helm_classic/BoolQ": 0.464, + "helm_classic/NarrativeQA": 0.238, + "helm_classic/NaturalQuestions (open-book)": 0.149, + "helm_classic/QuAC": 0.176, + "helm_classic/HellaSwag": 0.429, + "helm_classic/OpenbookQA": 0.346, + "helm_classic/TruthfulQA": 0.232, + "helm_classic/MS MARCO (TREC)": 0.302, + "helm_classic/CNN/DailyMail": 0.136, + "helm_classic/XSUM": 0.034, + "helm_classic/IMDB": 0.822, + "helm_classic/CivilComments": 0.503, + "helm_classic/RAFT": 0.406 + } + }, + { + "id": "openai/text-babbage-001", + "name": "text-babbage-001", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.229, + "helm_classic/MMLU": 0.229, + "helm_classic/BoolQ": 0.451, + "helm_classic/NarrativeQA": 0.429, + "helm_classic/NaturalQuestions (open-book)": 0.33, + "helm_classic/QuAC": 0.284, + "helm_classic/HellaSwag": 0.561, + "helm_classic/OpenbookQA": 0.452, + "helm_classic/TruthfulQA": 0.233, + "helm_classic/MS MARCO (TREC)": 0.449, + "helm_classic/CNN/DailyMail": 0.151, + "helm_classic/XSUM": 0.046, + "helm_classic/IMDB": 0.913, + "helm_classic/CivilComments": 0.499, + "helm_classic/RAFT": 0.509 + } + }, + { + "id": "openai/text-curie-001", + "name": "text-curie-001", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.36, + "helm_classic/MMLU": 0.237, + "helm_classic/BoolQ": 0.62, + "helm_classic/NarrativeQA": 0.582, + "helm_classic/NaturalQuestions (open-book)": 0.571, + "helm_classic/QuAC": 0.358, + "helm_classic/HellaSwag": 0.676, + "helm_classic/OpenbookQA": 0.514, + "helm_classic/TruthfulQA": 0.257, + "helm_classic/MS MARCO (TREC)": 0.507, + "helm_classic/CNN/DailyMail": 0.152, + "helm_classic/XSUM": 0.076, + "helm_classic/IMDB": 0.923, + "helm_classic/CivilComments": 0.537, + "helm_classic/RAFT": 0.489 + } + }, + { + "id": "openai/text-davinci-002", + "name": "text-davinci-002", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.905, + "helm_classic/MMLU": 0.568, + "helm_classic/BoolQ": 0.877, + "helm_classic/NarrativeQA": 0.727, + "helm_classic/NaturalQuestions (open-book)": 0.713, + "helm_classic/QuAC": 0.445, + "helm_classic/HellaSwag": 0.815, + "helm_classic/OpenbookQA": 0.594, + "helm_classic/TruthfulQA": 0.61, + "helm_classic/MS MARCO (TREC)": 0.664, + "helm_classic/CNN/DailyMail": 0.153, + "helm_classic/XSUM": 0.144, + "helm_classic/IMDB": 0.948, + "helm_classic/CivilComments": 0.668, + "helm_classic/RAFT": 0.733, + "helm_lite/Mean win rate": 0.336, + "helm_lite/NarrativeQA": 0.719, + "helm_lite/NaturalQuestions (closed-book)": 0.394, + "helm_lite/OpenbookQA": 0.796, + "helm_lite/MMLU": 0.568, + "helm_lite/MATH": 0.428, + "helm_lite/GSM8K": 0.479, + "helm_lite/LegalBench": 0.58, + "helm_lite/MedQA": 0.525, + "helm_lite/WMT 2014": 0.174 + } + }, + { + "id": "openai/text-davinci-003", + "name": "text-davinci-003", + "developer": "openai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.872, + "helm_classic/MMLU": 0.569, + "helm_classic/BoolQ": 0.881, + "helm_classic/NarrativeQA": 0.727, + "helm_classic/NaturalQuestions (open-book)": 0.77, + "helm_classic/QuAC": 0.525, + "helm_classic/HellaSwag": 0.822, + "helm_classic/OpenbookQA": 0.646, + "helm_classic/TruthfulQA": 0.593, + "helm_classic/MS MARCO (TREC)": 0.644, + "helm_classic/CNN/DailyMail": 0.156, + "helm_classic/XSUM": 0.124, + "helm_classic/IMDB": 0.848, + "helm_classic/CivilComments": 0.684, + "helm_classic/RAFT": 0.759, + "helm_lite/Mean win rate": 0.439, + "helm_lite/NarrativeQA": 0.731, + "helm_lite/NaturalQuestions (closed-book)": 0.413, + "helm_lite/OpenbookQA": 0.828, + "helm_lite/MMLU": 0.555, + "helm_lite/MATH": 0.449, + "helm_lite/GSM8K": 0.615, + "helm_lite/LegalBench": 0.622, + "helm_lite/MedQA": 0.531, + "helm_lite/WMT 2014": 0.191 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/openbmb.json b/data/developers/openbmb.json new file mode 100644 index 0000000000000000000000000000000000000000..dd2ac9b2651a2fee615e466e3b3da81ac3cfeef4 --- /dev/null +++ b/data/developers/openbmb.json @@ -0,0 +1,85 @@ +{ + "developer": "openbmb", + "models": [ + { + "id": "openbmb/Eurus-7b-kto", + "name": "openbmb/Eurus-7b-kto", + "developer": "openbmb", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.69, + "reward-bench/Chat": 0.9525, + "reward-bench/Chat Hard": 0.5373, + "reward-bench/Safety": 0.6054, + "reward-bench/Reasoning": 0.7467, + "reward-bench/Prior Sets (0.5 weight)": 0.5261 + } + }, + { + "id": "openbmb/Eurus-RM-7b", + "name": "openbmb/Eurus-RM-7b", + "developer": "openbmb", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5806, + "reward-bench/Chat": 0.9804, + "reward-bench/Chat Hard": 0.6557, + "reward-bench/Safety": 0.6267, + "reward-bench/Reasoning": 0.8633, + "reward-bench/Prior Sets (0.5 weight)": 0.7172, + "reward-bench/Factuality": 0.6, + "reward-bench/Precise IF": 0.3438, + "reward-bench/Math": 0.5683, + "reward-bench/Focus": 0.7475, + "reward-bench/Ties": 0.5972 + } + }, + { + "id": "openbmb/MiniCPM-2B-dpo-fp32", + "name": "openbmb/MiniCPM-2B-dpo-fp32", + "developer": "openbmb", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.673, + "reward-bench/Chat": 0.8911, + "reward-bench/Chat Hard": 0.4934, + "reward-bench/Safety": 0.573, + "reward-bench/Reasoning": 0.8233, + "reward-bench/Prior Sets (0.5 weight)": 0.4958 + } + }, + { + "id": "openbmb/MiniCPM-S-1B-sft-llama-format", + "name": "MiniCPM-S-1B-sft-llama-format", + "developer": "openbmb", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3329, + "hfopenllm_v2/BBH": 0.3049, + "hfopenllm_v2/MATH Level 5": 0.031, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3317, + "hfopenllm_v2/MMLU-PRO": 0.1858 + } + }, + { + "id": "openbmb/UltraRM-13b", + "name": "openbmb/UltraRM-13b", + "developer": "openbmb", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6903, + "reward-bench/Factuality": 0.5063, + "reward-bench/Precise IF": 0.3312, + "reward-bench/Math": 0.5519, + "reward-bench/Safety": 0.5986, + "reward-bench/Focus": 0.6081, + "reward-bench/Ties": 0.3036, + "reward-bench/Chat": 0.9637, + "reward-bench/Chat Hard": 0.5548, + "reward-bench/Reasoning": 0.6244, + "reward-bench/Prior Sets (0.5 weight)": 0.7294 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/openchat.json b/data/developers/openchat.json new file mode 100644 index 0000000000000000000000000000000000000000..a7809e488044248dbd02599ca637f543387e3f9b --- /dev/null +++ b/data/developers/openchat.json @@ -0,0 +1,89 @@ +{ + "developer": "openchat", + "models": [ + { + "id": "openchat/openchat-3.5-0106", + "name": "openchat-3.5-0106", + "developer": "openchat", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5967, + "hfopenllm_v2/BBH": 0.4617, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4254, + "hfopenllm_v2/MMLU-PRO": 0.3291 + } + }, + { + "id": "openchat/openchat-3.5-1210", + "name": "openchat-3.5-1210", + "developer": "openchat", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6037, + "hfopenllm_v2/BBH": 0.4535, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4414, + "hfopenllm_v2/MMLU-PRO": 0.3142 + } + }, + { + "id": "openchat/openchat-3.6-8b-20240522", + "name": "openchat-3.6-8b-20240522", + "developer": "openchat", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5343, + "hfopenllm_v2/BBH": 0.5338, + "hfopenllm_v2/MATH Level 5": 0.0997, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.3999, + "hfopenllm_v2/MMLU-PRO": 0.3229 + } + }, + { + "id": "openchat/openchat_3.5", + "name": "openchat_3.5", + "developer": "openchat", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5931, + "hfopenllm_v2/BBH": 0.4426, + "hfopenllm_v2/MATH Level 5": 0.0725, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4229, + "hfopenllm_v2/MMLU-PRO": 0.3153 + } + }, + { + "id": "openchat/openchat_v3.2", + "name": "openchat_v3.2", + "developer": "openchat", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2981, + "hfopenllm_v2/BBH": 0.4331, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.4336, + "hfopenllm_v2/MMLU-PRO": 0.2422 + } + }, + { + "id": "openchat/openchat_v3.2_super", + "name": "openchat_v3.2_super", + "developer": "openchat", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2862, + "hfopenllm_v2/BBH": 0.4221, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.4161, + "hfopenllm_v2/MMLU-PRO": 0.2425 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/opencompass.json b/data/developers/opencompass.json new file mode 100644 index 0000000000000000000000000000000000000000..94e25e5bb03f1eaf9994b9c840b0dea07de68c21 --- /dev/null +++ b/data/developers/opencompass.json @@ -0,0 +1,57 @@ +{ + "developer": "opencompass", + "models": [ + { + "id": "opencompass/CompassJudger-1-1.5B-Instruct", + "name": "opencompass/CompassJudger-1-1.5B-Instruct", + "developer": "opencompass", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7344, + "reward-bench/Chat": 0.9637, + "reward-bench/Chat Hard": 0.4923, + "reward-bench/Safety": 0.7818, + "reward-bench/Reasoning": 0.6999 + } + }, + { + "id": "opencompass/CompassJudger-1-14B-Instruct", + "name": "opencompass/CompassJudger-1-14B-Instruct", + "developer": "opencompass", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8409, + "reward-bench/Chat": 0.9749, + "reward-bench/Chat Hard": 0.6228, + "reward-bench/Safety": 0.8392, + "reward-bench/Reasoning": 0.9268 + } + }, + { + "id": "opencompass/CompassJudger-1-32B-Instruct", + "name": "opencompass/CompassJudger-1-32B-Instruct", + "developer": "opencompass", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8522, + "reward-bench/Chat": 0.9804, + "reward-bench/Chat Hard": 0.6513, + "reward-bench/Safety": 0.8527, + "reward-bench/Reasoning": 0.9244 + } + }, + { + "id": "opencompass/CompassJudger-1-7B-Instruct", + "name": "opencompass/CompassJudger-1-7B-Instruct", + "developer": "opencompass", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8317, + "reward-bench/Chat": 0.9777, + "reward-bench/Chat Hard": 0.6096, + "reward-bench/Safety": 0.8446, + "reward-bench/Reasoning": 0.8948 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/orai-nlp.json b/data/developers/orai-nlp.json new file mode 100644 index 0000000000000000000000000000000000000000..ae97aed54f747cee9f9d22196fe5b655770f58e7 --- /dev/null +++ b/data/developers/orai-nlp.json @@ -0,0 +1,19 @@ +{ + "developer": "orai-nlp", + "models": [ + { + "id": "orai-nlp/Llama-eus-8B", + "name": "Llama-eus-8B", + "developer": "orai-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2161, + "hfopenllm_v2/BBH": 0.4418, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3919, + "hfopenllm_v2/MMLU-PRO": 0.3058 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/oxyapi.json b/data/developers/oxyapi.json new file mode 100644 index 0000000000000000000000000000000000000000..db5cfa74854c055d13f627682108bebb7241409e --- /dev/null +++ b/data/developers/oxyapi.json @@ -0,0 +1,19 @@ +{ + "developer": "oxyapi", + "models": [ + { + "id": "oxyapi/oxy-1-small", + "name": "oxy-1-small", + "developer": "oxyapi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6245, + "hfopenllm_v2/BBH": 0.5885, + "hfopenllm_v2/MATH Level 5": 0.3603, + "hfopenllm_v2/GPQA": 0.3716, + "hfopenllm_v2/MUSR": 0.4487, + "hfopenllm_v2/MMLU-PRO": 0.5001 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ozone-ai.json b/data/developers/ozone-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..abe1b28ad015e6d4b54f97763783387da31eb70f --- /dev/null +++ b/data/developers/ozone-ai.json @@ -0,0 +1,19 @@ +{ + "developer": "ozone-ai", + "models": [ + { + "id": "ozone-ai/0x-lite", + "name": "0x-lite", + "developer": "ozone-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.774, + "hfopenllm_v2/BBH": 0.6341, + "hfopenllm_v2/MATH Level 5": 0.5045, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.4221, + "hfopenllm_v2/MMLU-PRO": 0.5184 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ozone-research.json b/data/developers/ozone-research.json new file mode 100644 index 0000000000000000000000000000000000000000..fd1aaf7ea4d20823384132dacf4797c8baeb36d3 --- /dev/null +++ b/data/developers/ozone-research.json @@ -0,0 +1,19 @@ +{ + "developer": "ozone-research", + "models": [ + { + "id": "ozone-research/Chirp-01", + "name": "Chirp-01", + "developer": "ozone-research", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6348, + "hfopenllm_v2/BBH": 0.465, + "hfopenllm_v2/MATH Level 5": 0.3467, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.4487, + "hfopenllm_v2/MMLU-PRO": 0.3508 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/paloalma.json b/data/developers/paloalma.json new file mode 100644 index 0000000000000000000000000000000000000000..6598701688bb611b27294d51194869f074d2a252 --- /dev/null +++ b/data/developers/paloalma.json @@ -0,0 +1,75 @@ +{ + "developer": "paloalma", + "models": [ + { + "id": "paloalma/ECE-TW3-JRGL-V1", + "name": "ECE-TW3-JRGL-V1", + "developer": "paloalma", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5535, + "hfopenllm_v2/BBH": 0.6284, + "hfopenllm_v2/MATH Level 5": 0.1314, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4621, + "hfopenllm_v2/MMLU-PRO": 0.4221 + } + }, + { + "id": "paloalma/ECE-TW3-JRGL-V2", + "name": "ECE-TW3-JRGL-V2", + "developer": "paloalma", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2255, + "hfopenllm_v2/BBH": 0.6031, + "hfopenllm_v2/MATH Level 5": 0.185, + "hfopenllm_v2/GPQA": 0.3314, + "hfopenllm_v2/MUSR": 0.4793, + "hfopenllm_v2/MMLU-PRO": 0.4588 + } + }, + { + "id": "paloalma/ECE-TW3-JRGL-V5", + "name": "ECE-TW3-JRGL-V5", + "developer": "paloalma", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4553, + "hfopenllm_v2/BBH": 0.6025, + "hfopenllm_v2/MATH Level 5": 0.1835, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.4621, + "hfopenllm_v2/MMLU-PRO": 0.4648 + } + }, + { + "id": "paloalma/Le_Triomphant-ECE-TW3", + "name": "Le_Triomphant-ECE-TW3", + "developer": "paloalma", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5402, + "hfopenllm_v2/BBH": 0.6112, + "hfopenllm_v2/MATH Level 5": 0.1949, + "hfopenllm_v2/GPQA": 0.349, + "hfopenllm_v2/MUSR": 0.4725, + "hfopenllm_v2/MMLU-PRO": 0.4763 + } + }, + { + "id": "paloalma/TW3-JRGL-v2", + "name": "TW3-JRGL-v2", + "developer": "paloalma", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5316, + "hfopenllm_v2/BBH": 0.6138, + "hfopenllm_v2/MATH Level 5": 0.179, + "hfopenllm_v2/GPQA": 0.3591, + "hfopenllm_v2/MUSR": 0.4858, + "hfopenllm_v2/MMLU-PRO": 0.4858 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/pankajmathur.json b/data/developers/pankajmathur.json new file mode 100644 index 0000000000000000000000000000000000000000..817880875c15da047744ee954089b88c588562e4 --- /dev/null +++ b/data/developers/pankajmathur.json @@ -0,0 +1,411 @@ +{ + "developer": "pankajmathur", + "models": [ + { + "id": "pankajmathur/Al_Dente_v1_8b", + "name": "Al_Dente_v1_8b", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3694, + "hfopenllm_v2/BBH": 0.4835, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.3987, + "hfopenllm_v2/MMLU-PRO": 0.286 + } + }, + { + "id": "pankajmathur/model_007_13b_v2", + "name": "model_007_13b_v2", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3056, + "hfopenllm_v2/BBH": 0.4702, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4611, + "hfopenllm_v2/MMLU-PRO": 0.2461 + } + }, + { + "id": "pankajmathur/orca_mini_3b", + "name": "orca_mini_3b", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0742, + "hfopenllm_v2/BBH": 0.3196, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.3349, + "hfopenllm_v2/MMLU-PRO": 0.1145 + } + }, + { + "id": "pankajmathur/orca_mini_7b", + "name": "orca_mini_7b", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0412, + "hfopenllm_v2/BBH": 0.3332, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.3698, + "hfopenllm_v2/MMLU-PRO": 0.1246 + } + }, + { + "id": "pankajmathur/orca_mini_phi-4", + "name": "orca_mini_phi-4", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7781, + "hfopenllm_v2/BBH": 0.6856, + "hfopenllm_v2/MATH Level 5": 0.2953, + "hfopenllm_v2/GPQA": 0.3742, + "hfopenllm_v2/MUSR": 0.4703, + "hfopenllm_v2/MMLU-PRO": 0.5255 + } + }, + { + "id": "pankajmathur/orca_mini_v2_7b", + "name": "orca_mini_v2_7b", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1358, + "hfopenllm_v2/BBH": 0.3536, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.3593, + "hfopenllm_v2/MMLU-PRO": 0.1542 + } + }, + { + "id": "pankajmathur/orca_mini_v3_13b", + "name": "orca_mini_v3_13b", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2897, + "hfopenllm_v2/BBH": 0.4711, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.4598, + "hfopenllm_v2/MMLU-PRO": 0.2305 + } + }, + { + "id": "pankajmathur/orca_mini_v3_70b", + "name": "orca_mini_v3_70b", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4015, + "hfopenllm_v2/BBH": 0.5949, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.5079, + "hfopenllm_v2/MMLU-PRO": 0.3757 + } + }, + { + "id": "pankajmathur/orca_mini_v3_7b", + "name": "orca_mini_v3_7b", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2821, + "hfopenllm_v2/BBH": 0.4095, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2466, + "hfopenllm_v2/MUSR": 0.4982, + "hfopenllm_v2/MMLU-PRO": 0.2084 + } + }, + { + "id": "pankajmathur/orca_mini_v5_8b", + "name": "orca_mini_v5_8b", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4806, + "hfopenllm_v2/BBH": 0.5064, + "hfopenllm_v2/MATH Level 5": 0.0989, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4, + "hfopenllm_v2/MMLU-PRO": 0.3076 + } + }, + { + "id": "pankajmathur/orca_mini_v5_8b_dpo", + "name": "orca_mini_v5_8b_dpo", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4896, + "hfopenllm_v2/BBH": 0.5075, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3894, + "hfopenllm_v2/MMLU-PRO": 0.3116 + } + }, + { + "id": "pankajmathur/orca_mini_v5_8b_orpo", + "name": "orca_mini_v5_8b_orpo", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0824, + "hfopenllm_v2/BBH": 0.4964, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4131, + "hfopenllm_v2/MMLU-PRO": 0.2947 + } + }, + { + "id": "pankajmathur/orca_mini_v6_8b", + "name": "orca_mini_v6_8b", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0111, + "hfopenllm_v2/BBH": 0.3029, + "hfopenllm_v2/MATH Level 5": 0.0038, + "hfopenllm_v2/GPQA": 0.2383, + "hfopenllm_v2/MUSR": 0.3555, + "hfopenllm_v2/MMLU-PRO": 0.1125 + } + }, + { + "id": "pankajmathur/orca_mini_v6_8b_dpo", + "name": "orca_mini_v6_8b_dpo", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3883, + "hfopenllm_v2/BBH": 0.5203, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.409, + "hfopenllm_v2/MMLU-PRO": 0.3596 + } + }, + { + "id": "pankajmathur/orca_mini_v7_72b", + "name": "orca_mini_v7_72b", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.593, + "hfopenllm_v2/BBH": 0.6842, + "hfopenllm_v2/MATH Level 5": 0.0937, + "hfopenllm_v2/GPQA": 0.3851, + "hfopenllm_v2/MUSR": 0.507, + "hfopenllm_v2/MMLU-PRO": 0.5622 + } + }, + { + "id": "pankajmathur/orca_mini_v7_7b", + "name": "orca_mini_v7_7b", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4388, + "hfopenllm_v2/BBH": 0.5275, + "hfopenllm_v2/MATH Level 5": 0.1208, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.436, + "hfopenllm_v2/MMLU-PRO": 0.4167 + } + }, + { + "id": "pankajmathur/orca_mini_v8_1_70b", + "name": "orca_mini_v8_1_70b", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8571, + "hfopenllm_v2/BBH": 0.6781, + "hfopenllm_v2/MATH Level 5": 0.3527, + "hfopenllm_v2/GPQA": 0.4329, + "hfopenllm_v2/MUSR": 0.4437, + "hfopenllm_v2/MMLU-PRO": 0.4983 + } + }, + { + "id": "pankajmathur/orca_mini_v9_0_3B-Instruct", + "name": "orca_mini_v9_0_3B-Instruct", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5754, + "hfopenllm_v2/BBH": 0.4413, + "hfopenllm_v2/MATH Level 5": 0.1465, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.3659, + "hfopenllm_v2/MMLU-PRO": 0.2603 + } + }, + { + "id": "pankajmathur/orca_mini_v9_1_1B-Instruct", + "name": "orca_mini_v9_1_1B-Instruct", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3629, + "hfopenllm_v2/BBH": 0.3205, + "hfopenllm_v2/MATH Level 5": 0.0461, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3381, + "hfopenllm_v2/MMLU-PRO": 0.1374 + } + }, + { + "id": "pankajmathur/orca_mini_v9_2_14B", + "name": "orca_mini_v9_2_14B", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7781, + "hfopenllm_v2/BBH": 0.6856, + "hfopenllm_v2/MATH Level 5": 0.2953, + "hfopenllm_v2/GPQA": 0.3742, + "hfopenllm_v2/MUSR": 0.4703, + "hfopenllm_v2/MMLU-PRO": 0.5255 + } + }, + { + "id": "pankajmathur/orca_mini_v9_2_70b", + "name": "orca_mini_v9_2_70b", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8383, + "hfopenllm_v2/BBH": 0.6745, + "hfopenllm_v2/MATH Level 5": 0.2938, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.471, + "hfopenllm_v2/MMLU-PRO": 0.4821 + } + }, + { + "id": "pankajmathur/orca_mini_v9_4_70B", + "name": "orca_mini_v9_4_70B", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8015, + "hfopenllm_v2/BBH": 0.6419, + "hfopenllm_v2/MATH Level 5": 0.3263, + "hfopenllm_v2/GPQA": 0.3658, + "hfopenllm_v2/MUSR": 0.4647, + "hfopenllm_v2/MMLU-PRO": 0.4536 + } + }, + { + "id": "pankajmathur/orca_mini_v9_5_1B-Instruct", + "name": "orca_mini_v9_5_1B-Instruct", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4638, + "hfopenllm_v2/BBH": 0.3337, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3182, + "hfopenllm_v2/MMLU-PRO": 0.137 + } + }, + { + "id": "pankajmathur/orca_mini_v9_5_1B-Instruct_preview", + "name": "orca_mini_v9_5_1B-Instruct_preview", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3936, + "hfopenllm_v2/BBH": 0.3277, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3395, + "hfopenllm_v2/MMLU-PRO": 0.1327 + } + }, + { + "id": "pankajmathur/orca_mini_v9_5_3B-Instruct", + "name": "orca_mini_v9_5_3B-Instruct", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7207, + "hfopenllm_v2/BBH": 0.4496, + "hfopenllm_v2/MATH Level 5": 0.1322, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.427, + "hfopenllm_v2/MMLU-PRO": 0.2882 + } + }, + { + "id": "pankajmathur/orca_mini_v9_6_1B-Instruct", + "name": "orca_mini_v9_6_1B-Instruct", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6086, + "hfopenllm_v2/BBH": 0.3561, + "hfopenllm_v2/MATH Level 5": 0.077, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3396, + "hfopenllm_v2/MMLU-PRO": 0.1809 + } + }, + { + "id": "pankajmathur/orca_mini_v9_6_3B-Instruct", + "name": "orca_mini_v9_6_3B-Instruct", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7316, + "hfopenllm_v2/BBH": 0.4568, + "hfopenllm_v2/MATH Level 5": 0.1329, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4068, + "hfopenllm_v2/MMLU-PRO": 0.2851 + } + }, + { + "id": "pankajmathur/orca_mini_v9_7_1B-Instruct", + "name": "orca_mini_v9_7_1B-Instruct", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.561, + "hfopenllm_v2/BBH": 0.3182, + "hfopenllm_v2/MATH Level 5": 0.0446, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3527, + "hfopenllm_v2/MMLU-PRO": 0.1345 + } + }, + { + "id": "pankajmathur/orca_mini_v9_7_3B-Instruct", + "name": "orca_mini_v9_7_3B-Instruct", + "developer": "pankajmathur", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5618, + "hfopenllm_v2/BBH": 0.3297, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3619, + "hfopenllm_v2/MMLU-PRO": 0.1375 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/paulml.json b/data/developers/paulml.json new file mode 100644 index 0000000000000000000000000000000000000000..79ccf1de04171581ead52be392d5d52d76f09fea --- /dev/null +++ b/data/developers/paulml.json @@ -0,0 +1,19 @@ +{ + "developer": "paulml", + "models": [ + { + "id": "paulml/ECE-ILAB-Q1", + "name": "ECE-ILAB-Q1", + "developer": "paulml", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7865, + "hfopenllm_v2/BBH": 0.6718, + "hfopenllm_v2/MATH Level 5": 0.3557, + "hfopenllm_v2/GPQA": 0.3867, + "hfopenllm_v2/MUSR": 0.4614, + "hfopenllm_v2/MMLU-PRO": 0.5505 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/pints-ai.json b/data/developers/pints-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..c84130dcb16187404bc96a785501ec5f09292a97 --- /dev/null +++ b/data/developers/pints-ai.json @@ -0,0 +1,33 @@ +{ + "developer": "pints-ai", + "models": [ + { + "id": "pints-ai/1.5-Pints-16K-v0.1", + "name": "1.5-Pints-16K-v0.1", + "developer": "pints-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1636, + "hfopenllm_v2/BBH": 0.3133, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2357, + "hfopenllm_v2/MUSR": 0.3579, + "hfopenllm_v2/MMLU-PRO": 0.1119 + } + }, + { + "id": "pints-ai/1.5-Pints-2K-v0.1", + "name": "1.5-Pints-2K-v0.1", + "developer": "pints-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1762, + "hfopenllm_v2/BBH": 0.298, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.3502, + "hfopenllm_v2/MMLU-PRO": 0.1104 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/piotr25691.json b/data/developers/piotr25691.json new file mode 100644 index 0000000000000000000000000000000000000000..1e4f9dc41e6d677d63260ee2a5456c180d5ccc40 --- /dev/null +++ b/data/developers/piotr25691.json @@ -0,0 +1,47 @@ +{ + "developer": "piotr25691", + "models": [ + { + "id": "piotr25691/thea-3b-25r", + "name": "thea-3b-25r", + "developer": "piotr25691", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7344, + "hfopenllm_v2/BBH": 0.4484, + "hfopenllm_v2/MATH Level 5": 0.1782, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3315, + "hfopenllm_v2/MMLU-PRO": 0.3182 + } + }, + { + "id": "piotr25691/thea-c-3b-25r", + "name": "thea-c-3b-25r", + "developer": "piotr25691", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7402, + "hfopenllm_v2/BBH": 0.4532, + "hfopenllm_v2/MATH Level 5": 0.1526, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3315, + "hfopenllm_v2/MMLU-PRO": 0.3178 + } + }, + { + "id": "piotr25691/thea-rp-3b-25r", + "name": "thea-rp-3b-25r", + "developer": "piotr25691", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6578, + "hfopenllm_v2/BBH": 0.439, + "hfopenllm_v2/MATH Level 5": 0.1322, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3819, + "hfopenllm_v2/MMLU-PRO": 0.306 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/postbot.json b/data/developers/postbot.json new file mode 100644 index 0000000000000000000000000000000000000000..663a02f652b9c08bc4ad5fb99a26081f82712198 --- /dev/null +++ b/data/developers/postbot.json @@ -0,0 +1,19 @@ +{ + "developer": "postbot", + "models": [ + { + "id": "postbot/gpt2-medium-emailgen", + "name": "gpt2-medium-emailgen", + "developer": "postbot", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1492, + "hfopenllm_v2/BBH": 0.313, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3911, + "hfopenllm_v2/MMLU-PRO": 0.1147 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/prince-canuma.json b/data/developers/prince-canuma.json new file mode 100644 index 0000000000000000000000000000000000000000..1479ff9192e5e7f0ed71e7814a1c507d6d0537c1 --- /dev/null +++ b/data/developers/prince-canuma.json @@ -0,0 +1,19 @@ +{ + "developer": "prince-canuma", + "models": [ + { + "id": "prince-canuma/Ministral-8B-Instruct-2410-HF", + "name": "Ministral-8B-Instruct-2410-HF", + "developer": "prince-canuma", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5912, + "hfopenllm_v2/BBH": 0.4586, + "hfopenllm_v2/MATH Level 5": 0.1918, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4138, + "hfopenllm_v2/MMLU-PRO": 0.3298 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/princeton-nlp.json b/data/developers/princeton-nlp.json new file mode 100644 index 0000000000000000000000000000000000000000..772aeed79272f3649337c3fa020b0172aa1a578a --- /dev/null +++ b/data/developers/princeton-nlp.json @@ -0,0 +1,719 @@ +{ + "developer": "princeton-nlp", + "models": [ + { + "id": "princeton-nlp/Llama-3-8B-ProLong-512k-Base", + "name": "Llama-3-8B-ProLong-512k-Base", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5322, + "hfopenllm_v2/BBH": 0.5033, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.4223, + "hfopenllm_v2/MMLU-PRO": 0.3329 + } + }, + { + "id": "princeton-nlp/Llama-3-8B-ProLong-512k-Instruct", + "name": "Llama-3-8B-ProLong-512k-Instruct", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5508, + "hfopenllm_v2/BBH": 0.5028, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4266, + "hfopenllm_v2/MMLU-PRO": 0.3231 + } + }, + { + "id": "princeton-nlp/Llama-3-8B-ProLong-64k-Base", + "name": "Llama-3-8B-ProLong-64k-Base", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5201, + "hfopenllm_v2/BBH": 0.4927, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.4341, + "hfopenllm_v2/MMLU-PRO": 0.3348 + } + }, + { + "id": "princeton-nlp/Llama-3-8B-ProLong-64k-Instruct", + "name": "Llama-3-8B-ProLong-64k-Instruct", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5563, + "hfopenllm_v2/BBH": 0.5083, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.2953, + "hfopenllm_v2/MUSR": 0.4397, + "hfopenllm_v2/MMLU-PRO": 0.3275 + } + }, + { + "id": "princeton-nlp/Llama-3-Base-8B-SFT", + "name": "Llama-3-Base-8B-SFT", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2796, + "hfopenllm_v2/BBH": 0.4643, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4118, + "hfopenllm_v2/MMLU-PRO": 0.3093 + } + }, + { + "id": "princeton-nlp/Llama-3-Base-8B-SFT-CPO", + "name": "Llama-3-Base-8B-SFT-CPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3703, + "hfopenllm_v2/BBH": 0.4595, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3609, + "hfopenllm_v2/MMLU-PRO": 0.2976 + } + }, + { + "id": "princeton-nlp/Llama-3-Base-8B-SFT-DPO", + "name": "Llama-3-Base-8B-SFT-DPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4111, + "hfopenllm_v2/BBH": 0.4666, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.3867, + "hfopenllm_v2/MMLU-PRO": 0.3078 + } + }, + { + "id": "princeton-nlp/Llama-3-Base-8B-SFT-IPO", + "name": "Llama-3-Base-8B-SFT-IPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4487, + "hfopenllm_v2/BBH": 0.469, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.3919, + "hfopenllm_v2/MMLU-PRO": 0.3115 + } + }, + { + "id": "princeton-nlp/Llama-3-Base-8B-SFT-KTO", + "name": "Llama-3-Base-8B-SFT-KTO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4523, + "hfopenllm_v2/BBH": 0.4693, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.3842, + "hfopenllm_v2/MMLU-PRO": 0.3054 + } + }, + { + "id": "princeton-nlp/Llama-3-Base-8B-SFT-ORPO", + "name": "Llama-3-Base-8B-SFT-ORPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4517, + "hfopenllm_v2/BBH": 0.4734, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.3707, + "hfopenllm_v2/MMLU-PRO": 0.3083 + } + }, + { + "id": "princeton-nlp/Llama-3-Base-8B-SFT-RDPO", + "name": "Llama-3-Base-8B-SFT-RDPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.448, + "hfopenllm_v2/BBH": 0.4662, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4027, + "hfopenllm_v2/MMLU-PRO": 0.3014 + } + }, + { + "id": "princeton-nlp/Llama-3-Base-8B-SFT-RRHF", + "name": "Llama-3-Base-8B-SFT-RRHF", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3357, + "hfopenllm_v2/BBH": 0.452, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.3722, + "hfopenllm_v2/MMLU-PRO": 0.2889 + } + }, + { + "id": "princeton-nlp/Llama-3-Base-8B-SFT-SLiC-HF", + "name": "Llama-3-Base-8B-SFT-SLiC-HF", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.489, + "hfopenllm_v2/BBH": 0.4704, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4091, + "hfopenllm_v2/MMLU-PRO": 0.3063 + } + }, + { + "id": "princeton-nlp/Llama-3-Base-8B-SFT-SimPO", + "name": "Llama-3-Base-8B-SFT-SimPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4685, + "hfopenllm_v2/BBH": 0.4741, + "hfopenllm_v2/MATH Level 5": 0.0551, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4127, + "hfopenllm_v2/MMLU-PRO": 0.3105 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-CPO", + "name": "Llama-3-Instruct-8B-CPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7293, + "hfopenllm_v2/BBH": 0.4999, + "hfopenllm_v2/MATH Level 5": 0.0989, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3514, + "hfopenllm_v2/MMLU-PRO": 0.3652 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-CPO-v0.2", + "name": "Llama-3-Instruct-8B-CPO-v0.2", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7506, + "hfopenllm_v2/BBH": 0.5027, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3619, + "hfopenllm_v2/MMLU-PRO": 0.3706 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-DPO", + "name": "Llama-3-Instruct-8B-DPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6757, + "hfopenllm_v2/BBH": 0.4991, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3738, + "hfopenllm_v2/MMLU-PRO": 0.3665 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-DPO-v0.2", + "name": "Llama-3-Instruct-8B-DPO-v0.2", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7208, + "hfopenllm_v2/BBH": 0.5056, + "hfopenllm_v2/MATH Level 5": 0.0899, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.3844, + "hfopenllm_v2/MMLU-PRO": 0.3769 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-KTO", + "name": "Llama-3-Instruct-8B-KTO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6864, + "hfopenllm_v2/BBH": 0.4982, + "hfopenllm_v2/MATH Level 5": 0.0725, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3698, + "hfopenllm_v2/MMLU-PRO": 0.3599 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-KTO-v0.2", + "name": "Llama-3-Instruct-8B-KTO-v0.2", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.729, + "hfopenllm_v2/BBH": 0.508, + "hfopenllm_v2/MATH Level 5": 0.0997, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3777, + "hfopenllm_v2/MMLU-PRO": 0.3668 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-ORPO", + "name": "Llama-3-Instruct-8B-ORPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7128, + "hfopenllm_v2/BBH": 0.5001, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3502, + "hfopenllm_v2/MMLU-PRO": 0.3646 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-ORPO-v0.2", + "name": "Llama-3-Instruct-8B-ORPO-v0.2", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7633, + "hfopenllm_v2/BBH": 0.5078, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.378, + "hfopenllm_v2/MMLU-PRO": 0.3731 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-RDPO", + "name": "Llama-3-Instruct-8B-RDPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.666, + "hfopenllm_v2/BBH": 0.5034, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.3752, + "hfopenllm_v2/MMLU-PRO": 0.3607 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-RDPO-v0.2", + "name": "Llama-3-Instruct-8B-RDPO-v0.2", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7077, + "hfopenllm_v2/BBH": 0.5049, + "hfopenllm_v2/MATH Level 5": 0.0869, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3804, + "hfopenllm_v2/MMLU-PRO": 0.3774 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-RRHF", + "name": "Llama-3-Instruct-8B-RRHF", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7275, + "hfopenllm_v2/BBH": 0.4911, + "hfopenllm_v2/MATH Level 5": 0.0967, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3476, + "hfopenllm_v2/MMLU-PRO": 0.3644 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-RRHF-v0.2", + "name": "Llama-3-Instruct-8B-RRHF-v0.2", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7125, + "hfopenllm_v2/BBH": 0.4984, + "hfopenllm_v2/MATH Level 5": 0.0876, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3738, + "hfopenllm_v2/MMLU-PRO": 0.3482 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-SLiC-HF", + "name": "Llama-3-Instruct-8B-SLiC-HF", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.74, + "hfopenllm_v2/BBH": 0.5029, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.3723, + "hfopenllm_v2/MMLU-PRO": 0.3585 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-SLiC-HF-v0.2", + "name": "Llama-3-Instruct-8B-SLiC-HF-v0.2", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.711, + "hfopenllm_v2/BBH": 0.4984, + "hfopenllm_v2/MATH Level 5": 0.0876, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3738, + "hfopenllm_v2/MMLU-PRO": 0.3482 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-SimPO", + "name": "Llama-3-Instruct-8B-SimPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6504, + "hfopenllm_v2/BBH": 0.4845, + "hfopenllm_v2/MATH Level 5": 0.0861, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.3948, + "hfopenllm_v2/MMLU-PRO": 0.3489 + } + }, + { + "id": "princeton-nlp/Llama-3-Instruct-8B-SimPO-v0.2", + "name": "Llama-3-Instruct-8B-SimPO-v0.2", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6809, + "hfopenllm_v2/BBH": 0.5038, + "hfopenllm_v2/MATH Level 5": 0.074, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.3988, + "hfopenllm_v2/MMLU-PRO": 0.3622 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Base-SFT-CPO", + "name": "Mistral-7B-Base-SFT-CPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4655, + "hfopenllm_v2/BBH": 0.4382, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4071, + "hfopenllm_v2/MMLU-PRO": 0.2651 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Base-SFT-DPO", + "name": "Mistral-7B-Base-SFT-DPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4403, + "hfopenllm_v2/BBH": 0.435, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.4122, + "hfopenllm_v2/MMLU-PRO": 0.2645 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Base-SFT-IPO", + "name": "Mistral-7B-Base-SFT-IPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.483, + "hfopenllm_v2/BBH": 0.4458, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3776, + "hfopenllm_v2/MMLU-PRO": 0.2792 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Base-SFT-KTO", + "name": "Mistral-7B-Base-SFT-KTO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4785, + "hfopenllm_v2/BBH": 0.4476, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4368, + "hfopenllm_v2/MMLU-PRO": 0.2872 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Base-SFT-RDPO", + "name": "Mistral-7B-Base-SFT-RDPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4606, + "hfopenllm_v2/BBH": 0.444, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3579, + "hfopenllm_v2/MMLU-PRO": 0.2777 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Base-SFT-RRHF", + "name": "Mistral-7B-Base-SFT-RRHF", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4407, + "hfopenllm_v2/BBH": 0.4281, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4187, + "hfopenllm_v2/MMLU-PRO": 0.2398 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Base-SFT-SLiC-HF", + "name": "Mistral-7B-Base-SFT-SLiC-HF", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5127, + "hfopenllm_v2/BBH": 0.4422, + "hfopenllm_v2/MATH Level 5": 0.0355, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4261, + "hfopenllm_v2/MMLU-PRO": 0.2781 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Base-SFT-SimPO", + "name": "Mistral-7B-Base-SFT-SimPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4701, + "hfopenllm_v2/BBH": 0.4398, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3971, + "hfopenllm_v2/MMLU-PRO": 0.2702 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Instruct-CPO", + "name": "Mistral-7B-Instruct-CPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4203, + "hfopenllm_v2/BBH": 0.4069, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.4178, + "hfopenllm_v2/MMLU-PRO": 0.2701 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Instruct-DPO", + "name": "Mistral-7B-Instruct-DPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5176, + "hfopenllm_v2/BBH": 0.406, + "hfopenllm_v2/MATH Level 5": 0.031, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3833, + "hfopenllm_v2/MMLU-PRO": 0.2749 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Instruct-IPO", + "name": "Mistral-7B-Instruct-IPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4929, + "hfopenllm_v2/BBH": 0.4322, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.4324, + "hfopenllm_v2/MMLU-PRO": 0.2708 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Instruct-KTO", + "name": "Mistral-7B-Instruct-KTO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4908, + "hfopenllm_v2/BBH": 0.414, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3953, + "hfopenllm_v2/MMLU-PRO": 0.2812 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Instruct-ORPO", + "name": "Mistral-7B-Instruct-ORPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.472, + "hfopenllm_v2/BBH": 0.4104, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3912, + "hfopenllm_v2/MMLU-PRO": 0.2662 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Instruct-RDPO", + "name": "Mistral-7B-Instruct-RDPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4887, + "hfopenllm_v2/BBH": 0.405, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3873, + "hfopenllm_v2/MMLU-PRO": 0.2777 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Instruct-RRHF", + "name": "Mistral-7B-Instruct-RRHF", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.496, + "hfopenllm_v2/BBH": 0.419, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3979, + "hfopenllm_v2/MMLU-PRO": 0.2651 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Instruct-SLiC-HF", + "name": "Mistral-7B-Instruct-SLiC-HF", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5115, + "hfopenllm_v2/BBH": 0.404, + "hfopenllm_v2/MATH Level 5": 0.0174, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3913, + "hfopenllm_v2/MMLU-PRO": 0.2715 + } + }, + { + "id": "princeton-nlp/Mistral-7B-Instruct-SimPO", + "name": "Mistral-7B-Instruct-SimPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4687, + "hfopenllm_v2/BBH": 0.4507, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.4098, + "hfopenllm_v2/MMLU-PRO": 0.2797 + } + }, + { + "id": "princeton-nlp/Sheared-LLaMA-1.3B", + "name": "Sheared-LLaMA-1.3B", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2198, + "hfopenllm_v2/BBH": 0.3197, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2399, + "hfopenllm_v2/MUSR": 0.3713, + "hfopenllm_v2/MMLU-PRO": 0.1171 + } + }, + { + "id": "princeton-nlp/Sheared-LLaMA-2.7B", + "name": "Sheared-LLaMA-2.7B", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2417, + "hfopenllm_v2/BBH": 0.3259, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3567, + "hfopenllm_v2/MMLU-PRO": 0.1187 + } + }, + { + "id": "princeton-nlp/gemma-2-9b-it-DPO", + "name": "gemma-2-9b-it-DPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2769, + "hfopenllm_v2/BBH": 0.5941, + "hfopenllm_v2/MATH Level 5": 0.0831, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.382, + "hfopenllm_v2/MMLU-PRO": 0.3723 + } + }, + { + "id": "princeton-nlp/gemma-2-9b-it-SimPO", + "name": "gemma-2-9b-it-SimPO", + "developer": "princeton-nlp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3207, + "hfopenllm_v2/BBH": 0.5839, + "hfopenllm_v2/MATH Level 5": 0.071, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.4123, + "hfopenllm_v2/MMLU-PRO": 0.3975 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/prithivMLmods.json b/data/developers/prithivMLmods.json new file mode 100644 index 0000000000000000000000000000000000000000..9955e069086868f5f2f3d0b914ef44e14d31fc3f --- /dev/null +++ b/data/developers/prithivMLmods.json @@ -0,0 +1,1545 @@ +{ + "developer": "prithivMLmods", + "models": [ + { + "id": "prithivMLmods/Bellatrix-1.5B-xElite", + "name": "Bellatrix-1.5B-xElite", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1964, + "hfopenllm_v2/BBH": 0.3501, + "hfopenllm_v2/MATH Level 5": 0.287, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.3619, + "hfopenllm_v2/MMLU-PRO": 0.1657 + } + }, + { + "id": "prithivMLmods/Bellatrix-Tiny-1.5B-R1", + "name": "Bellatrix-Tiny-1.5B-R1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3352, + "hfopenllm_v2/BBH": 0.4022, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.3683, + "hfopenllm_v2/MMLU-PRO": 0.2751 + } + }, + { + "id": "prithivMLmods/Bellatrix-Tiny-1B-v2", + "name": "Bellatrix-Tiny-1B-v2", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.151, + "hfopenllm_v2/BBH": 0.3268, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.343, + "hfopenllm_v2/MMLU-PRO": 0.1493 + } + }, + { + "id": "prithivMLmods/Blaze-14B-xElite", + "name": "Blaze-14B-xElite", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0363, + "hfopenllm_v2/BBH": 0.6628, + "hfopenllm_v2/MATH Level 5": 0.3693, + "hfopenllm_v2/GPQA": 0.3943, + "hfopenllm_v2/MUSR": 0.4625, + "hfopenllm_v2/MMLU-PRO": 0.5111 + } + }, + { + "id": "prithivMLmods/COCO-7B-Instruct-1M", + "name": "COCO-7B-Instruct-1M", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4743, + "hfopenllm_v2/BBH": 0.541, + "hfopenllm_v2/MATH Level 5": 0.3497, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4382, + "hfopenllm_v2/MMLU-PRO": 0.4186 + } + }, + { + "id": "prithivMLmods/Calcium-Opus-14B-Elite", + "name": "Calcium-Opus-14B-Elite", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6064, + "hfopenllm_v2/BBH": 0.6296, + "hfopenllm_v2/MATH Level 5": 0.3708, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.4873, + "hfopenllm_v2/MMLU-PRO": 0.5307 + } + }, + { + "id": "prithivMLmods/Calcium-Opus-14B-Elite-1M", + "name": "Calcium-Opus-14B-Elite-1M", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5613, + "hfopenllm_v2/BBH": 0.6329, + "hfopenllm_v2/MATH Level 5": 0.4456, + "hfopenllm_v2/GPQA": 0.3523, + "hfopenllm_v2/MUSR": 0.4676, + "hfopenllm_v2/MMLU-PRO": 0.5152 + } + }, + { + "id": "prithivMLmods/Calcium-Opus-14B-Elite-Stock", + "name": "Calcium-Opus-14B-Elite-Stock", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6143, + "hfopenllm_v2/BBH": 0.6329, + "hfopenllm_v2/MATH Level 5": 0.4668, + "hfopenllm_v2/GPQA": 0.3683, + "hfopenllm_v2/MUSR": 0.4808, + "hfopenllm_v2/MMLU-PRO": 0.5284 + } + }, + { + "id": "prithivMLmods/Calcium-Opus-14B-Elite2", + "name": "Calcium-Opus-14B-Elite2", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6176, + "hfopenllm_v2/BBH": 0.6318, + "hfopenllm_v2/MATH Level 5": 0.469, + "hfopenllm_v2/GPQA": 0.37, + "hfopenllm_v2/MUSR": 0.494, + "hfopenllm_v2/MMLU-PRO": 0.5301 + } + }, + { + "id": "prithivMLmods/Calcium-Opus-14B-Elite2-R1", + "name": "Calcium-Opus-14B-Elite2-R1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6326, + "hfopenllm_v2/BBH": 0.6362, + "hfopenllm_v2/MATH Level 5": 0.3338, + "hfopenllm_v2/GPQA": 0.3909, + "hfopenllm_v2/MUSR": 0.49, + "hfopenllm_v2/MMLU-PRO": 0.5248 + } + }, + { + "id": "prithivMLmods/Calcium-Opus-14B-Elite3", + "name": "Calcium-Opus-14B-Elite3", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5428, + "hfopenllm_v2/BBH": 0.635, + "hfopenllm_v2/MATH Level 5": 0.4705, + "hfopenllm_v2/GPQA": 0.3708, + "hfopenllm_v2/MUSR": 0.4795, + "hfopenllm_v2/MMLU-PRO": 0.5335 + } + }, + { + "id": "prithivMLmods/Calcium-Opus-14B-Elite4", + "name": "Calcium-Opus-14B-Elite4", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6112, + "hfopenllm_v2/BBH": 0.6195, + "hfopenllm_v2/MATH Level 5": 0.3625, + "hfopenllm_v2/GPQA": 0.3557, + "hfopenllm_v2/MUSR": 0.4687, + "hfopenllm_v2/MMLU-PRO": 0.5149 + } + }, + { + "id": "prithivMLmods/Calcium-Opus-14B-Merge", + "name": "Calcium-Opus-14B-Merge", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4949, + "hfopenllm_v2/BBH": 0.6319, + "hfopenllm_v2/MATH Level 5": 0.4637, + "hfopenllm_v2/GPQA": 0.3708, + "hfopenllm_v2/MUSR": 0.4861, + "hfopenllm_v2/MMLU-PRO": 0.5356 + } + }, + { + "id": "prithivMLmods/Calcium-Opus-20B-v1", + "name": "Calcium-Opus-20B-v1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3093, + "hfopenllm_v2/BBH": 0.599, + "hfopenllm_v2/MATH Level 5": 0.3618, + "hfopenllm_v2/GPQA": 0.3532, + "hfopenllm_v2/MUSR": 0.4943, + "hfopenllm_v2/MMLU-PRO": 0.4734 + } + }, + { + "id": "prithivMLmods/Codepy-Deepthink-3B", + "name": "Codepy-Deepthink-3B", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4327, + "hfopenllm_v2/BBH": 0.4259, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.331, + "hfopenllm_v2/MMLU-PRO": 0.309 + } + }, + { + "id": "prithivMLmods/Coma-II-14B", + "name": "Coma-II-14B", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4168, + "hfopenllm_v2/BBH": 0.6321, + "hfopenllm_v2/MATH Level 5": 0.5514, + "hfopenllm_v2/GPQA": 0.4002, + "hfopenllm_v2/MUSR": 0.5351, + "hfopenllm_v2/MMLU-PRO": 0.504 + } + }, + { + "id": "prithivMLmods/Condor-Opus-14B-Exp", + "name": "Condor-Opus-14B-Exp", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4043, + "hfopenllm_v2/BBH": 0.6154, + "hfopenllm_v2/MATH Level 5": 0.5227, + "hfopenllm_v2/GPQA": 0.3918, + "hfopenllm_v2/MUSR": 0.5194, + "hfopenllm_v2/MMLU-PRO": 0.5014 + } + }, + { + "id": "prithivMLmods/Cygnus-II-14B", + "name": "Cygnus-II-14B", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6184, + "hfopenllm_v2/BBH": 0.6661, + "hfopenllm_v2/MATH Level 5": 0.4396, + "hfopenllm_v2/GPQA": 0.3876, + "hfopenllm_v2/MUSR": 0.4688, + "hfopenllm_v2/MMLU-PRO": 0.5391 + } + }, + { + "id": "prithivMLmods/Deepthink-Llama-3-8B-Preview", + "name": "Deepthink-Llama-3-8B-Preview", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2955, + "hfopenllm_v2/BBH": 0.4665, + "hfopenllm_v2/MATH Level 5": 0.355, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.3707, + "hfopenllm_v2/MMLU-PRO": 0.2739 + } + }, + { + "id": "prithivMLmods/Deepthink-Reasoning-14B", + "name": "Deepthink-Reasoning-14B", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5424, + "hfopenllm_v2/BBH": 0.6334, + "hfopenllm_v2/MATH Level 5": 0.423, + "hfopenllm_v2/GPQA": 0.3666, + "hfopenllm_v2/MUSR": 0.4732, + "hfopenllm_v2/MMLU-PRO": 0.5296 + } + }, + { + "id": "prithivMLmods/Deepthink-Reasoning-7B", + "name": "Deepthink-Reasoning-7B", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.484, + "hfopenllm_v2/BBH": 0.5505, + "hfopenllm_v2/MATH Level 5": 0.3346, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4432, + "hfopenllm_v2/MMLU-PRO": 0.4349 + } + }, + { + "id": "prithivMLmods/Dinobot-Opus-14B-Exp", + "name": "Dinobot-Opus-14B-Exp", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.824, + "hfopenllm_v2/BBH": 0.637, + "hfopenllm_v2/MATH Level 5": 0.5317, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.426, + "hfopenllm_v2/MMLU-PRO": 0.4979 + } + }, + { + "id": "prithivMLmods/Elita-0.1-Distilled-R1-abliterated", + "name": "Elita-0.1-Distilled-R1-abliterated", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3542, + "hfopenllm_v2/BBH": 0.3828, + "hfopenllm_v2/MATH Level 5": 0.3066, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.366, + "hfopenllm_v2/MMLU-PRO": 0.2758 + } + }, + { + "id": "prithivMLmods/Elita-1", + "name": "Elita-1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4906, + "hfopenllm_v2/BBH": 0.652, + "hfopenllm_v2/MATH Level 5": 0.3429, + "hfopenllm_v2/GPQA": 0.3758, + "hfopenllm_v2/MUSR": 0.4834, + "hfopenllm_v2/MMLU-PRO": 0.5381 + } + }, + { + "id": "prithivMLmods/Epimetheus-14B-Axo", + "name": "Epimetheus-14B-Axo", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5546, + "hfopenllm_v2/BBH": 0.6613, + "hfopenllm_v2/MATH Level 5": 0.4101, + "hfopenllm_v2/GPQA": 0.3926, + "hfopenllm_v2/MUSR": 0.482, + "hfopenllm_v2/MMLU-PRO": 0.5304 + } + }, + { + "id": "prithivMLmods/Equuleus-Opus-14B-Exp", + "name": "Equuleus-Opus-14B-Exp", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7001, + "hfopenllm_v2/BBH": 0.6434, + "hfopenllm_v2/MATH Level 5": 0.4585, + "hfopenllm_v2/GPQA": 0.3867, + "hfopenllm_v2/MUSR": 0.4952, + "hfopenllm_v2/MMLU-PRO": 0.5374 + } + }, + { + "id": "prithivMLmods/Eridanus-Opus-14B-r999", + "name": "Eridanus-Opus-14B-r999", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6386, + "hfopenllm_v2/BBH": 0.6584, + "hfopenllm_v2/MATH Level 5": 0.386, + "hfopenllm_v2/GPQA": 0.3943, + "hfopenllm_v2/MUSR": 0.4769, + "hfopenllm_v2/MMLU-PRO": 0.5362 + } + }, + { + "id": "prithivMLmods/Evac-Opus-14B-Exp", + "name": "Evac-Opus-14B-Exp", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5916, + "hfopenllm_v2/BBH": 0.6475, + "hfopenllm_v2/MATH Level 5": 0.4215, + "hfopenllm_v2/GPQA": 0.3884, + "hfopenllm_v2/MUSR": 0.4728, + "hfopenllm_v2/MMLU-PRO": 0.5317 + } + }, + { + "id": "prithivMLmods/FastThink-0.5B-Tiny", + "name": "FastThink-0.5B-Tiny", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.258, + "hfopenllm_v2/BBH": 0.3206, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3566, + "hfopenllm_v2/MMLU-PRO": 0.1649 + } + }, + { + "id": "prithivMLmods/GWQ-9B-Preview", + "name": "GWQ-9B-Preview", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5066, + "hfopenllm_v2/BBH": 0.5806, + "hfopenllm_v2/MATH Level 5": 0.2266, + "hfopenllm_v2/GPQA": 0.3398, + "hfopenllm_v2/MUSR": 0.4951, + "hfopenllm_v2/MMLU-PRO": 0.3984 + } + }, + { + "id": "prithivMLmods/GWQ-9B-Preview2", + "name": "GWQ-9B-Preview2", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5209, + "hfopenllm_v2/BBH": 0.5797, + "hfopenllm_v2/MATH Level 5": 0.2372, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.486, + "hfopenllm_v2/MMLU-PRO": 0.3997 + } + }, + { + "id": "prithivMLmods/GWQ2b", + "name": "GWQ2b", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4115, + "hfopenllm_v2/BBH": 0.4143, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4311, + "hfopenllm_v2/MMLU-PRO": 0.2473 + } + }, + { + "id": "prithivMLmods/Gaea-Opus-14B-Exp", + "name": "Gaea-Opus-14B-Exp", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5956, + "hfopenllm_v2/BBH": 0.656, + "hfopenllm_v2/MATH Level 5": 0.4275, + "hfopenllm_v2/GPQA": 0.3909, + "hfopenllm_v2/MUSR": 0.4859, + "hfopenllm_v2/MMLU-PRO": 0.5401 + } + }, + { + "id": "prithivMLmods/Galactic-Qwen-14B-Exp1", + "name": "Galactic-Qwen-14B-Exp1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5832, + "hfopenllm_v2/BBH": 0.6582, + "hfopenllm_v2/MATH Level 5": 0.4018, + "hfopenllm_v2/GPQA": 0.3935, + "hfopenllm_v2/MUSR": 0.4781, + "hfopenllm_v2/MMLU-PRO": 0.5396 + } + }, + { + "id": "prithivMLmods/Galactic-Qwen-14B-Exp2", + "name": "Galactic-Qwen-14B-Exp2", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.662, + "hfopenllm_v2/BBH": 0.7203, + "hfopenllm_v2/MATH Level 5": 0.3474, + "hfopenllm_v2/GPQA": 0.3993, + "hfopenllm_v2/MUSR": 0.5354, + "hfopenllm_v2/MMLU-PRO": 0.5691 + } + }, + { + "id": "prithivMLmods/Gauss-Opus-14B-R999", + "name": "Gauss-Opus-14B-R999", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3907, + "hfopenllm_v2/BBH": 0.6228, + "hfopenllm_v2/MATH Level 5": 0.5755, + "hfopenllm_v2/GPQA": 0.3918, + "hfopenllm_v2/MUSR": 0.5338, + "hfopenllm_v2/MMLU-PRO": 0.5007 + } + }, + { + "id": "prithivMLmods/Jolt-v0.1", + "name": "Jolt-v0.1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5092, + "hfopenllm_v2/BBH": 0.6521, + "hfopenllm_v2/MATH Level 5": 0.3565, + "hfopenllm_v2/GPQA": 0.38, + "hfopenllm_v2/MUSR": 0.4847, + "hfopenllm_v2/MMLU-PRO": 0.5386 + } + }, + { + "id": "prithivMLmods/Lacerta-Opus-14B-Elite8", + "name": "Lacerta-Opus-14B-Elite8", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6141, + "hfopenllm_v2/BBH": 0.6401, + "hfopenllm_v2/MATH Level 5": 0.3648, + "hfopenllm_v2/GPQA": 0.3784, + "hfopenllm_v2/MUSR": 0.4635, + "hfopenllm_v2/MMLU-PRO": 0.5322 + } + }, + { + "id": "prithivMLmods/Llama-3.1-5B-Instruct", + "name": "Llama-3.1-5B-Instruct", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1407, + "hfopenllm_v2/BBH": 0.3051, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.354, + "hfopenllm_v2/MMLU-PRO": 0.1184 + } + }, + { + "id": "prithivMLmods/Llama-3.1-8B-Open-SFT", + "name": "Llama-3.1-8B-Open-SFT", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4123, + "hfopenllm_v2/BBH": 0.4968, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.3904, + "hfopenllm_v2/MMLU-PRO": 0.3522 + } + }, + { + "id": "prithivMLmods/Llama-3.2-3B-Math-Oct", + "name": "Llama-3.2-3B-Math-Oct", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4585, + "hfopenllm_v2/BBH": 0.4372, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.347, + "hfopenllm_v2/MMLU-PRO": 0.2911 + } + }, + { + "id": "prithivMLmods/Llama-3.2-6B-AlgoCode", + "name": "Llama-3.2-6B-AlgoCode", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2136, + "hfopenllm_v2/BBH": 0.3748, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4013, + "hfopenllm_v2/MMLU-PRO": 0.1798 + } + }, + { + "id": "prithivMLmods/Llama-8B-Distill-CoT", + "name": "Llama-8B-Distill-CoT", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3342, + "hfopenllm_v2/BBH": 0.4298, + "hfopenllm_v2/MATH Level 5": 0.4003, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.372, + "hfopenllm_v2/MMLU-PRO": 0.2732 + } + }, + { + "id": "prithivMLmods/Llama-Deepsync-1B", + "name": "Llama-Deepsync-1B", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.357, + "hfopenllm_v2/BBH": 0.3386, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3565, + "hfopenllm_v2/MMLU-PRO": 0.1738 + } + }, + { + "id": "prithivMLmods/Llama-Deepsync-3B", + "name": "Llama-Deepsync-3B", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4302, + "hfopenllm_v2/BBH": 0.4292, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3324, + "hfopenllm_v2/MMLU-PRO": 0.3031 + } + }, + { + "id": "prithivMLmods/Llama-Express.1-Math", + "name": "Llama-Express.1-Math", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5084, + "hfopenllm_v2/BBH": 0.3364, + "hfopenllm_v2/MATH Level 5": 0.0559, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3143, + "hfopenllm_v2/MMLU-PRO": 0.161 + } + }, + { + "id": "prithivMLmods/LwQ-10B-Instruct", + "name": "LwQ-10B-Instruct", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3935, + "hfopenllm_v2/BBH": 0.5122, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4544, + "hfopenllm_v2/MMLU-PRO": 0.3318 + } + }, + { + "id": "prithivMLmods/LwQ-Reasoner-10B", + "name": "LwQ-Reasoner-10B", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2941, + "hfopenllm_v2/BBH": 0.5866, + "hfopenllm_v2/MATH Level 5": 0.358, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.4079, + "hfopenllm_v2/MMLU-PRO": 0.4147 + } + }, + { + "id": "prithivMLmods/Magellanic-Opus-14B-Exp", + "name": "Magellanic-Opus-14B-Exp", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6866, + "hfopenllm_v2/BBH": 0.6383, + "hfopenllm_v2/MATH Level 5": 0.3799, + "hfopenllm_v2/GPQA": 0.3742, + "hfopenllm_v2/MUSR": 0.4926, + "hfopenllm_v2/MMLU-PRO": 0.5273 + } + }, + { + "id": "prithivMLmods/Magellanic-Qwen-25B-R999", + "name": "Magellanic-Qwen-25B-R999", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1873, + "hfopenllm_v2/BBH": 0.2608, + "hfopenllm_v2/MATH Level 5": 0.0053, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3831, + "hfopenllm_v2/MMLU-PRO": 0.13 + } + }, + { + "id": "prithivMLmods/Megatron-Corpus-14B-Exp", + "name": "Megatron-Corpus-14B-Exp", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4983, + "hfopenllm_v2/BBH": 0.6355, + "hfopenllm_v2/MATH Level 5": 0.3429, + "hfopenllm_v2/GPQA": 0.3633, + "hfopenllm_v2/MUSR": 0.4767, + "hfopenllm_v2/MMLU-PRO": 0.526 + } + }, + { + "id": "prithivMLmods/Megatron-Corpus-14B-Exp.v2", + "name": "Megatron-Corpus-14B-Exp.v2", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.487, + "hfopenllm_v2/BBH": 0.6321, + "hfopenllm_v2/MATH Level 5": 0.2591, + "hfopenllm_v2/GPQA": 0.3423, + "hfopenllm_v2/MUSR": 0.449, + "hfopenllm_v2/MMLU-PRO": 0.481 + } + }, + { + "id": "prithivMLmods/Megatron-Opus-14B-2.0", + "name": "Megatron-Opus-14B-2.0", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6694, + "hfopenllm_v2/BBH": 0.6871, + "hfopenllm_v2/MATH Level 5": 0.2779, + "hfopenllm_v2/GPQA": 0.3591, + "hfopenllm_v2/MUSR": 0.414, + "hfopenllm_v2/MMLU-PRO": 0.517 + } + }, + { + "id": "prithivMLmods/Megatron-Opus-14B-2.1", + "name": "Megatron-Opus-14B-2.1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0246, + "hfopenllm_v2/BBH": 0.6727, + "hfopenllm_v2/MATH Level 5": 0.2998, + "hfopenllm_v2/GPQA": 0.3834, + "hfopenllm_v2/MUSR": 0.4928, + "hfopenllm_v2/MMLU-PRO": 0.5174 + } + }, + { + "id": "prithivMLmods/Megatron-Opus-14B-Exp", + "name": "Megatron-Opus-14B-Exp", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4979, + "hfopenllm_v2/BBH": 0.6516, + "hfopenllm_v2/MATH Level 5": 0.3535, + "hfopenllm_v2/GPQA": 0.375, + "hfopenllm_v2/MUSR": 0.4887, + "hfopenllm_v2/MMLU-PRO": 0.5401 + } + }, + { + "id": "prithivMLmods/Megatron-Opus-14B-Stock", + "name": "Megatron-Opus-14B-Stock", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5174, + "hfopenllm_v2/BBH": 0.6412, + "hfopenllm_v2/MATH Level 5": 0.3346, + "hfopenllm_v2/GPQA": 0.375, + "hfopenllm_v2/MUSR": 0.482, + "hfopenllm_v2/MMLU-PRO": 0.5293 + } + }, + { + "id": "prithivMLmods/Megatron-Opus-7B-Exp", + "name": "Megatron-Opus-7B-Exp", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6017, + "hfopenllm_v2/BBH": 0.5367, + "hfopenllm_v2/MATH Level 5": 0.1971, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.39 + } + }, + { + "id": "prithivMLmods/Messier-Opus-14B-Elite7", + "name": "Messier-Opus-14B-Elite7", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7113, + "hfopenllm_v2/BBH": 0.6499, + "hfopenllm_v2/MATH Level 5": 0.4071, + "hfopenllm_v2/GPQA": 0.3909, + "hfopenllm_v2/MUSR": 0.4886, + "hfopenllm_v2/MMLU-PRO": 0.5404 + } + }, + { + "id": "prithivMLmods/Omni-Reasoner-Merged", + "name": "Omni-Reasoner-Merged", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4599, + "hfopenllm_v2/BBH": 0.5508, + "hfopenllm_v2/MATH Level 5": 0.3331, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4616, + "hfopenllm_v2/MMLU-PRO": 0.4364 + } + }, + { + "id": "prithivMLmods/Omni-Reasoner3-Merged", + "name": "Omni-Reasoner3-Merged", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4935, + "hfopenllm_v2/BBH": 0.4388, + "hfopenllm_v2/MATH Level 5": 0.1088, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3522, + "hfopenllm_v2/MMLU-PRO": 0.295 + } + }, + { + "id": "prithivMLmods/Pegasus-Opus-14B-Exp", + "name": "Pegasus-Opus-14B-Exp", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6982, + "hfopenllm_v2/BBH": 0.6548, + "hfopenllm_v2/MATH Level 5": 0.4086, + "hfopenllm_v2/GPQA": 0.3951, + "hfopenllm_v2/MUSR": 0.486, + "hfopenllm_v2/MMLU-PRO": 0.5412 + } + }, + { + "id": "prithivMLmods/Phi-4-Empathetic", + "name": "Phi-4-Empathetic", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0497, + "hfopenllm_v2/BBH": 0.6727, + "hfopenllm_v2/MATH Level 5": 0.2621, + "hfopenllm_v2/GPQA": 0.38, + "hfopenllm_v2/MUSR": 0.4991, + "hfopenllm_v2/MMLU-PRO": 0.5066 + } + }, + { + "id": "prithivMLmods/Phi-4-Math-IO", + "name": "Phi-4-Math-IO", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.059, + "hfopenllm_v2/BBH": 0.6668, + "hfopenllm_v2/MATH Level 5": 0.4577, + "hfopenllm_v2/GPQA": 0.3985, + "hfopenllm_v2/MUSR": 0.4873, + "hfopenllm_v2/MMLU-PRO": 0.5205 + } + }, + { + "id": "prithivMLmods/Phi-4-QwQ", + "name": "Phi-4-QwQ", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0559, + "hfopenllm_v2/BBH": 0.6696, + "hfopenllm_v2/MATH Level 5": 0.4577, + "hfopenllm_v2/GPQA": 0.3909, + "hfopenllm_v2/MUSR": 0.4651, + "hfopenllm_v2/MMLU-PRO": 0.5275 + } + }, + { + "id": "prithivMLmods/Phi-4-Super", + "name": "Phi-4-Super", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0481, + "hfopenllm_v2/BBH": 0.672, + "hfopenllm_v2/MATH Level 5": 0.3489, + "hfopenllm_v2/GPQA": 0.3943, + "hfopenllm_v2/MUSR": 0.5044, + "hfopenllm_v2/MMLU-PRO": 0.5266 + } + }, + { + "id": "prithivMLmods/Phi-4-Super-1", + "name": "Phi-4-Super-1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0418, + "hfopenllm_v2/BBH": 0.6729, + "hfopenllm_v2/MATH Level 5": 0.352, + "hfopenllm_v2/GPQA": 0.3935, + "hfopenllm_v2/MUSR": 0.5017, + "hfopenllm_v2/MMLU-PRO": 0.5235 + } + }, + { + "id": "prithivMLmods/Phi-4-Super-o1", + "name": "Phi-4-Super-o1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0418, + "hfopenllm_v2/BBH": 0.6729, + "hfopenllm_v2/MATH Level 5": 0.352, + "hfopenllm_v2/GPQA": 0.3935, + "hfopenllm_v2/MUSR": 0.5017, + "hfopenllm_v2/MMLU-PRO": 0.5235 + } + }, + { + "id": "prithivMLmods/Phi-4-o1", + "name": "Phi-4-o1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.029, + "hfopenllm_v2/BBH": 0.6689, + "hfopenllm_v2/MATH Level 5": 0.3995, + "hfopenllm_v2/GPQA": 0.3826, + "hfopenllm_v2/MUSR": 0.4978, + "hfopenllm_v2/MMLU-PRO": 0.5174 + } + }, + { + "id": "prithivMLmods/Phi4-Super", + "name": "Phi4-Super", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0481, + "hfopenllm_v2/BBH": 0.672, + "hfopenllm_v2/MATH Level 5": 0.3489, + "hfopenllm_v2/GPQA": 0.3943, + "hfopenllm_v2/MUSR": 0.5044, + "hfopenllm_v2/MMLU-PRO": 0.5266 + } + }, + { + "id": "prithivMLmods/Porpoise-Opus-14B-Exp", + "name": "Porpoise-Opus-14B-Exp", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7098, + "hfopenllm_v2/BBH": 0.6519, + "hfopenllm_v2/MATH Level 5": 0.4041, + "hfopenllm_v2/GPQA": 0.3935, + "hfopenllm_v2/MUSR": 0.4926, + "hfopenllm_v2/MMLU-PRO": 0.5396 + } + }, + { + "id": "prithivMLmods/Primal-Opus-14B-Optimus-v1", + "name": "Primal-Opus-14B-Optimus-v1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5013, + "hfopenllm_v2/BBH": 0.6419, + "hfopenllm_v2/MATH Level 5": 0.3384, + "hfopenllm_v2/GPQA": 0.3725, + "hfopenllm_v2/MUSR": 0.4847, + "hfopenllm_v2/MMLU-PRO": 0.5259 + } + }, + { + "id": "prithivMLmods/Primal-Opus-14B-Optimus-v2", + "name": "Primal-Opus-14B-Optimus-v2", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6404, + "hfopenllm_v2/BBH": 0.6544, + "hfopenllm_v2/MATH Level 5": 0.4207, + "hfopenllm_v2/GPQA": 0.3918, + "hfopenllm_v2/MUSR": 0.49, + "hfopenllm_v2/MMLU-PRO": 0.5422 + } + }, + { + "id": "prithivMLmods/QwQ-LCoT-14B-Conversational", + "name": "QwQ-LCoT-14B-Conversational", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4047, + "hfopenllm_v2/BBH": 0.624, + "hfopenllm_v2/MATH Level 5": 0.4653, + "hfopenllm_v2/GPQA": 0.3498, + "hfopenllm_v2/MUSR": 0.4847, + "hfopenllm_v2/MMLU-PRO": 0.5278 + } + }, + { + "id": "prithivMLmods/QwQ-LCoT-3B-Instruct", + "name": "QwQ-LCoT-3B-Instruct", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4354, + "hfopenllm_v2/BBH": 0.4763, + "hfopenllm_v2/MATH Level 5": 0.2825, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4358, + "hfopenllm_v2/MMLU-PRO": 0.3582 + } + }, + { + "id": "prithivMLmods/QwQ-LCoT-7B-Instruct", + "name": "QwQ-LCoT-7B-Instruct", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4987, + "hfopenllm_v2/BBH": 0.5466, + "hfopenllm_v2/MATH Level 5": 0.3716, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4802, + "hfopenllm_v2/MMLU-PRO": 0.4334 + } + }, + { + "id": "prithivMLmods/QwQ-LCoT1-Merged", + "name": "QwQ-LCoT1-Merged", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4751, + "hfopenllm_v2/BBH": 0.5481, + "hfopenllm_v2/MATH Level 5": 0.3731, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4696, + "hfopenllm_v2/MMLU-PRO": 0.4358 + } + }, + { + "id": "prithivMLmods/QwQ-LCoT2-7B-Instruct", + "name": "QwQ-LCoT2-7B-Instruct", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5561, + "hfopenllm_v2/BBH": 0.5425, + "hfopenllm_v2/MATH Level 5": 0.327, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4564, + "hfopenllm_v2/MMLU-PRO": 0.4342 + } + }, + { + "id": "prithivMLmods/QwQ-MathOct-7B", + "name": "QwQ-MathOct-7B", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4684, + "hfopenllm_v2/BBH": 0.5486, + "hfopenllm_v2/MATH Level 5": 0.2953, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4601, + "hfopenllm_v2/MMLU-PRO": 0.433 + } + }, + { + "id": "prithivMLmods/QwQ-R1-Distill-1.5B-CoT", + "name": "QwQ-R1-Distill-1.5B-CoT", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2194, + "hfopenllm_v2/BBH": 0.3666, + "hfopenllm_v2/MATH Level 5": 0.3346, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.3434, + "hfopenllm_v2/MMLU-PRO": 0.1913 + } + }, + { + "id": "prithivMLmods/QwQ-R1-Distill-7B-CoT", + "name": "QwQ-R1-Distill-7B-CoT", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.35, + "hfopenllm_v2/BBH": 0.4388, + "hfopenllm_v2/MATH Level 5": 0.4683, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.3779, + "hfopenllm_v2/MMLU-PRO": 0.2804 + } + }, + { + "id": "prithivMLmods/Qwen-7B-Distill-Reasoner", + "name": "Qwen-7B-Distill-Reasoner", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3396, + "hfopenllm_v2/BBH": 0.4409, + "hfopenllm_v2/MATH Level 5": 0.395, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.366, + "hfopenllm_v2/MMLU-PRO": 0.2818 + } + }, + { + "id": "prithivMLmods/Qwen2.5-1.5B-DeepSeek-R1-Instruct", + "name": "Qwen2.5-1.5B-DeepSeek-R1-Instruct", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1397, + "hfopenllm_v2/BBH": 0.2824, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3724, + "hfopenllm_v2/MMLU-PRO": 0.1123 + } + }, + { + "id": "prithivMLmods/Qwen2.5-14B-DeepSeek-R1-1M", + "name": "Qwen2.5-14B-DeepSeek-R1-1M", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4193, + "hfopenllm_v2/BBH": 0.5935, + "hfopenllm_v2/MATH Level 5": 0.5128, + "hfopenllm_v2/GPQA": 0.3322, + "hfopenllm_v2/MUSR": 0.4606, + "hfopenllm_v2/MMLU-PRO": 0.4899 + } + }, + { + "id": "prithivMLmods/Qwen2.5-7B-DeepSeek-R1-1M", + "name": "Qwen2.5-7B-DeepSeek-R1-1M", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1861, + "hfopenllm_v2/BBH": 0.3126, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3417, + "hfopenllm_v2/MMLU-PRO": 0.1201 + } + }, + { + "id": "prithivMLmods/SmolLM2-CoT-360M", + "name": "SmolLM2-CoT-360M", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2216, + "hfopenllm_v2/BBH": 0.3135, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2366, + "hfopenllm_v2/MUSR": 0.3794, + "hfopenllm_v2/MMLU-PRO": 0.1085 + } + }, + { + "id": "prithivMLmods/Sombrero-Opus-14B-Elite5", + "name": "Sombrero-Opus-14B-Elite5", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7881, + "hfopenllm_v2/BBH": 0.6502, + "hfopenllm_v2/MATH Level 5": 0.5355, + "hfopenllm_v2/GPQA": 0.3364, + "hfopenllm_v2/MUSR": 0.4287, + "hfopenllm_v2/MMLU-PRO": 0.52 + } + }, + { + "id": "prithivMLmods/Sombrero-Opus-14B-Elite6", + "name": "Sombrero-Opus-14B-Elite6", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7226, + "hfopenllm_v2/BBH": 0.6488, + "hfopenllm_v2/MATH Level 5": 0.4079, + "hfopenllm_v2/GPQA": 0.3935, + "hfopenllm_v2/MUSR": 0.4886, + "hfopenllm_v2/MMLU-PRO": 0.539 + } + }, + { + "id": "prithivMLmods/Sombrero-Opus-14B-Sm1", + "name": "Sombrero-Opus-14B-Sm1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3813, + "hfopenllm_v2/BBH": 0.6355, + "hfopenllm_v2/MATH Level 5": 0.5665, + "hfopenllm_v2/GPQA": 0.4035, + "hfopenllm_v2/MUSR": 0.5299, + "hfopenllm_v2/MMLU-PRO": 0.5125 + } + }, + { + "id": "prithivMLmods/Sombrero-Opus-14B-Sm2", + "name": "Sombrero-Opus-14B-Sm2", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4272, + "hfopenllm_v2/BBH": 0.6609, + "hfopenllm_v2/MATH Level 5": 0.4864, + "hfopenllm_v2/GPQA": 0.3884, + "hfopenllm_v2/MUSR": 0.5088, + "hfopenllm_v2/MMLU-PRO": 0.5345 + } + }, + { + "id": "prithivMLmods/Sombrero-Opus-14B-Sm4", + "name": "Sombrero-Opus-14B-Sm4", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4347, + "hfopenllm_v2/BBH": 0.6613, + "hfopenllm_v2/MATH Level 5": 0.4879, + "hfopenllm_v2/GPQA": 0.3951, + "hfopenllm_v2/MUSR": 0.5192, + "hfopenllm_v2/MMLU-PRO": 0.53 + } + }, + { + "id": "prithivMLmods/Sombrero-Opus-14B-Sm5", + "name": "Sombrero-Opus-14B-Sm5", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6852, + "hfopenllm_v2/BBH": 0.6564, + "hfopenllm_v2/MATH Level 5": 0.4094, + "hfopenllm_v2/GPQA": 0.3867, + "hfopenllm_v2/MUSR": 0.4806, + "hfopenllm_v2/MMLU-PRO": 0.54 + } + }, + { + "id": "prithivMLmods/Sqweeks-7B-Instruct", + "name": "Sqweeks-7B-Instruct", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2158, + "hfopenllm_v2/BBH": 0.4667, + "hfopenllm_v2/MATH Level 5": 0.5144, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4476, + "hfopenllm_v2/MMLU-PRO": 0.3133 + } + }, + { + "id": "prithivMLmods/Tadpole-Opus-14B-Exp", + "name": "Tadpole-Opus-14B-Exp", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.575, + "hfopenllm_v2/BBH": 0.6369, + "hfopenllm_v2/MATH Level 5": 0.3134, + "hfopenllm_v2/GPQA": 0.3859, + "hfopenllm_v2/MUSR": 0.4728, + "hfopenllm_v2/MMLU-PRO": 0.5322 + } + }, + { + "id": "prithivMLmods/Taurus-Opus-7B", + "name": "Taurus-Opus-7B", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4223, + "hfopenllm_v2/BBH": 0.5367, + "hfopenllm_v2/MATH Level 5": 0.2168, + "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/MUSR": 0.4399, + "hfopenllm_v2/MMLU-PRO": 0.3951 + } + }, + { + "id": "prithivMLmods/Triangulum-10B", + "name": "Triangulum-10B", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3229, + "hfopenllm_v2/BBH": 0.5968, + "hfopenllm_v2/MATH Level 5": 0.355, + "hfopenllm_v2/GPQA": 0.354, + "hfopenllm_v2/MUSR": 0.4172, + "hfopenllm_v2/MMLU-PRO": 0.4178 + } + }, + { + "id": "prithivMLmods/Triangulum-5B", + "name": "Triangulum-5B", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1283, + "hfopenllm_v2/BBH": 0.3124, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.255, + "hfopenllm_v2/MUSR": 0.3445, + "hfopenllm_v2/MMLU-PRO": 0.1223 + } + }, + { + "id": "prithivMLmods/Triangulum-v2-10B", + "name": "Triangulum-v2-10B", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6705, + "hfopenllm_v2/BBH": 0.6065, + "hfopenllm_v2/MATH Level 5": 0.2447, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.4281, + "hfopenllm_v2/MMLU-PRO": 0.4466 + } + }, + { + "id": "prithivMLmods/Tucana-Opus-14B-r999", + "name": "Tucana-Opus-14B-r999", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6067, + "hfopenllm_v2/BBH": 0.6557, + "hfopenllm_v2/MATH Level 5": 0.4063, + "hfopenllm_v2/GPQA": 0.3918, + "hfopenllm_v2/MUSR": 0.473, + "hfopenllm_v2/MMLU-PRO": 0.5384 + } + }, + { + "id": "prithivMLmods/Tulu-MathLingo-8B", + "name": "Tulu-MathLingo-8B", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5589, + "hfopenllm_v2/BBH": 0.4659, + "hfopenllm_v2/MATH Level 5": 0.145, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3864, + "hfopenllm_v2/MMLU-PRO": 0.3044 + } + }, + { + "id": "prithivMLmods/Viper-Coder-7B-Elite14", + "name": "Viper-Coder-7B-Elite14", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1488, + "hfopenllm_v2/BBH": 0.2829, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.255, + "hfopenllm_v2/MUSR": 0.3422, + "hfopenllm_v2/MMLU-PRO": 0.1089 + } + }, + { + "id": "prithivMLmods/Viper-Coder-Hybrid-v1.2", + "name": "Viper-Coder-Hybrid-v1.2", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6736, + "hfopenllm_v2/BBH": 0.6391, + "hfopenllm_v2/MATH Level 5": 0.3331, + "hfopenllm_v2/GPQA": 0.3742, + "hfopenllm_v2/MUSR": 0.4822, + "hfopenllm_v2/MMLU-PRO": 0.5243 + } + }, + { + "id": "prithivMLmods/Viper-Coder-Hybrid-v1.3", + "name": "Viper-Coder-Hybrid-v1.3", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7555, + "hfopenllm_v2/BBH": 0.6471, + "hfopenllm_v2/MATH Level 5": 0.4517, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4403, + "hfopenllm_v2/MMLU-PRO": 0.5097 + } + }, + { + "id": "prithivMLmods/Viper-Coder-HybridMini-v1.3", + "name": "Viper-Coder-HybridMini-v1.3", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6104, + "hfopenllm_v2/BBH": 0.5365, + "hfopenllm_v2/MATH Level 5": 0.463, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4505, + "hfopenllm_v2/MMLU-PRO": 0.4352 + } + }, + { + "id": "prithivMLmods/Viper-Coder-v0.1", + "name": "Viper-Coder-v0.1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5521, + "hfopenllm_v2/BBH": 0.6143, + "hfopenllm_v2/MATH Level 5": 0.327, + "hfopenllm_v2/GPQA": 0.354, + "hfopenllm_v2/MUSR": 0.4394, + "hfopenllm_v2/MMLU-PRO": 0.3928 + } + }, + { + "id": "prithivMLmods/Viper-Coder-v1.1", + "name": "Viper-Coder-v1.1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4432, + "hfopenllm_v2/BBH": 0.6492, + "hfopenllm_v2/MATH Level 5": 0.5461, + "hfopenllm_v2/GPQA": 0.401, + "hfopenllm_v2/MUSR": 0.5219, + "hfopenllm_v2/MMLU-PRO": 0.5232 + } + }, + { + "id": "prithivMLmods/Viper-Coder-v1.6-r999", + "name": "Viper-Coder-v1.6-r999", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4433, + "hfopenllm_v2/BBH": 0.6492, + "hfopenllm_v2/MATH Level 5": 0.5657, + "hfopenllm_v2/GPQA": 0.401, + "hfopenllm_v2/MUSR": 0.5219, + "hfopenllm_v2/MMLU-PRO": 0.5232 + } + }, + { + "id": "prithivMLmods/Viper-Coder-v1.7-Vsm6", + "name": "Viper-Coder-v1.7-Vsm6", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5004, + "hfopenllm_v2/BBH": 0.6502, + "hfopenllm_v2/MATH Level 5": 0.4645, + "hfopenllm_v2/GPQA": 0.3968, + "hfopenllm_v2/MUSR": 0.4768, + "hfopenllm_v2/MMLU-PRO": 0.5288 + } + }, + { + "id": "prithivMLmods/Viper-OneCoder-UIGEN", + "name": "Viper-OneCoder-UIGEN", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4692, + "hfopenllm_v2/BBH": 0.6047, + "hfopenllm_v2/MATH Level 5": 0.3867, + "hfopenllm_v2/GPQA": 0.3423, + "hfopenllm_v2/MUSR": 0.4514, + "hfopenllm_v2/MMLU-PRO": 0.3904 + } + }, + { + "id": "prithivMLmods/Volans-Opus-14B-Exp", + "name": "Volans-Opus-14B-Exp", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5868, + "hfopenllm_v2/BBH": 0.6521, + "hfopenllm_v2/MATH Level 5": 0.4252, + "hfopenllm_v2/GPQA": 0.3851, + "hfopenllm_v2/MUSR": 0.4872, + "hfopenllm_v2/MMLU-PRO": 0.5385 + } + }, + { + "id": "prithivMLmods/WebMind-7B-v0.1", + "name": "WebMind-7B-v0.1", + "developer": "prithivMLmods", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5278, + "hfopenllm_v2/BBH": 0.5434, + "hfopenllm_v2/MATH Level 5": 0.3648, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4537, + "hfopenllm_v2/MMLU-PRO": 0.4279 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/prometheus-eval.json b/data/developers/prometheus-eval.json new file mode 100644 index 0000000000000000000000000000000000000000..308d3c0f9b08b56dc84d18e21f526fb6b5763312 --- /dev/null +++ b/data/developers/prometheus-eval.json @@ -0,0 +1,31 @@ +{ + "developer": "prometheus-eval", + "models": [ + { + "id": "prometheus-eval/prometheus-7b-v2.0", + "name": "prometheus-eval/prometheus-7b-v2.0", + "developer": "prometheus-eval", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7204, + "reward-bench/Chat": 0.8547, + "reward-bench/Chat Hard": 0.4912, + "reward-bench/Safety": 0.7709, + "reward-bench/Reasoning": 0.7648 + } + }, + { + "id": "prometheus-eval/prometheus-8x7b-v2.0", + "name": "prometheus-eval/prometheus-8x7b-v2.0", + "developer": "prometheus-eval", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7451, + "reward-bench/Chat": 0.9302, + "reward-bench/Chat Hard": 0.4715, + "reward-bench/Safety": 0.8047, + "reward-bench/Reasoning": 0.774 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/pszemraj.json b/data/developers/pszemraj.json new file mode 100644 index 0000000000000000000000000000000000000000..c0e0a36a5aea3f99dc9da3c6220482c82197c2cf --- /dev/null +++ b/data/developers/pszemraj.json @@ -0,0 +1,33 @@ +{ + "developer": "pszemraj", + "models": [ + { + "id": "pszemraj/Llama-3-6.3b-v0.1", + "name": "Llama-3-6.3b-v0.1", + "developer": "pszemraj", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1044, + "hfopenllm_v2/BBH": 0.4197, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3908, + "hfopenllm_v2/MMLU-PRO": 0.284 + } + }, + { + "id": "pszemraj/Mistral-v0.3-6B", + "name": "Mistral-v0.3-6B", + "developer": "pszemraj", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2454, + "hfopenllm_v2/BBH": 0.3774, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3908, + "hfopenllm_v2/MMLU-PRO": 0.2143 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/qingy2019.json b/data/developers/qingy2019.json new file mode 100644 index 0000000000000000000000000000000000000000..9e607d380bbf6837078a684f50c178b1c83bab9f --- /dev/null +++ b/data/developers/qingy2019.json @@ -0,0 +1,103 @@ +{ + "developer": "qingy2019", + "models": [ + { + "id": "qingy2019/LLaMa_3.2_3B_Catalysts", + "name": "LLaMa_3.2_3B_Catalysts", + "developer": "qingy2019", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4992, + "hfopenllm_v2/BBH": 0.4468, + "hfopenllm_v2/MATH Level 5": 0.1292, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3788, + "hfopenllm_v2/MMLU-PRO": 0.3008 + } + }, + { + "id": "qingy2019/OpenMath2-Llama3.1-8B", + "name": "OpenMath2-Llama3.1-8B", + "developer": "qingy2019", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2331, + "hfopenllm_v2/BBH": 0.4096, + "hfopenllm_v2/MATH Level 5": 0.2674, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3436, + "hfopenllm_v2/MMLU-PRO": 0.1553 + } + }, + { + "id": "qingy2019/Oracle-14B", + "name": "Oracle-14B", + "developer": "qingy2019", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2401, + "hfopenllm_v2/BBH": 0.4622, + "hfopenllm_v2/MATH Level 5": 0.0725, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3703, + "hfopenllm_v2/MMLU-PRO": 0.2379 + } + }, + { + "id": "qingy2019/Qwen2.5-Math-14B-Instruct", + "name": "Qwen2.5-Math-14B-Instruct", + "developer": "qingy2019", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6005, + "hfopenllm_v2/BBH": 0.6356, + "hfopenllm_v2/MATH Level 5": 0.2764, + "hfopenllm_v2/GPQA": 0.3691, + "hfopenllm_v2/MUSR": 0.4757, + "hfopenllm_v2/MMLU-PRO": 0.5339 + } + }, + { + "id": "qingy2019/Qwen2.5-Math-14B-Instruct-Alpha", + "name": "Qwen2.5-Math-14B-Instruct-Alpha", + "developer": "qingy2019", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5981, + "hfopenllm_v2/BBH": 0.6375, + "hfopenllm_v2/MATH Level 5": 0.3142, + "hfopenllm_v2/GPQA": 0.37, + "hfopenllm_v2/MUSR": 0.4649, + "hfopenllm_v2/MMLU-PRO": 0.5331 + } + }, + { + "id": "qingy2019/Qwen2.5-Math-14B-Instruct-Pro", + "name": "Qwen2.5-Math-14B-Instruct-Pro", + "developer": "qingy2019", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1922, + "hfopenllm_v2/BBH": 0.5319, + "hfopenllm_v2/MATH Level 5": 0.284, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.374, + "hfopenllm_v2/MMLU-PRO": 0.3558 + } + }, + { + "id": "qingy2019/Qwen2.5-Ultimate-14B-Instruct", + "name": "Qwen2.5-Ultimate-14B-Instruct", + "developer": "qingy2019", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3938, + "hfopenllm_v2/BBH": 0.5842, + "hfopenllm_v2/MATH Level 5": 0.2893, + "hfopenllm_v2/GPQA": 0.3565, + "hfopenllm_v2/MUSR": 0.4135, + "hfopenllm_v2/MMLU-PRO": 0.4929 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/qingy2024.json b/data/developers/qingy2024.json new file mode 100644 index 0000000000000000000000000000000000000000..abc4192df3d92a1e0f50d1083a6efa755ea494d4 --- /dev/null +++ b/data/developers/qingy2024.json @@ -0,0 +1,243 @@ +{ + "developer": "qingy2024", + "models": [ + { + "id": "qingy2024/Benchmaxx-Llama-3.2-1B-Instruct", + "name": "Benchmaxx-Llama-3.2-1B-Instruct", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2014, + "hfopenllm_v2/BBH": 0.8269, + "hfopenllm_v2/MATH Level 5": 0.4804, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3446, + "hfopenllm_v2/MMLU-PRO": 0.1113 + } + }, + { + "id": "qingy2024/Eyas-17B-Instruct", + "name": "Eyas-17B-Instruct", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6575, + "hfopenllm_v2/BBH": 0.6085, + "hfopenllm_v2/MATH Level 5": 0.247, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4522, + "hfopenllm_v2/MMLU-PRO": 0.4343 + } + }, + { + "id": "qingy2024/Falcon3-2x10B-MoE-Instruct", + "name": "Falcon3-2x10B-MoE-Instruct", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.785, + "hfopenllm_v2/BBH": 0.6185, + "hfopenllm_v2/MATH Level 5": 0.2795, + "hfopenllm_v2/GPQA": 0.3305, + "hfopenllm_v2/MUSR": 0.4284, + "hfopenllm_v2/MMLU-PRO": 0.4423 + } + }, + { + "id": "qingy2024/Fusion-14B-Instruct", + "name": "Fusion-14B-Instruct", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.726, + "hfopenllm_v2/BBH": 0.6396, + "hfopenllm_v2/MATH Level 5": 0.3369, + "hfopenllm_v2/GPQA": 0.3549, + "hfopenllm_v2/MUSR": 0.44, + "hfopenllm_v2/MMLU-PRO": 0.5044 + } + }, + { + "id": "qingy2024/Fusion2-14B-Instruct", + "name": "Fusion2-14B-Instruct", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6064, + "hfopenllm_v2/BBH": 0.6119, + "hfopenllm_v2/MATH Level 5": 0.3127, + "hfopenllm_v2/GPQA": 0.3448, + "hfopenllm_v2/MUSR": 0.4634, + "hfopenllm_v2/MMLU-PRO": 0.5051 + } + }, + { + "id": "qingy2024/Fusion4-14B-Instruct", + "name": "Fusion4-14B-Instruct", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7649, + "hfopenllm_v2/BBH": 0.6543, + "hfopenllm_v2/MATH Level 5": 0.3882, + "hfopenllm_v2/GPQA": 0.3305, + "hfopenllm_v2/MUSR": 0.4326, + "hfopenllm_v2/MMLU-PRO": 0.5194 + } + }, + { + "id": "qingy2024/OwO-14B-Instruct", + "name": "OwO-14B-Instruct", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1383, + "hfopenllm_v2/BBH": 0.6165, + "hfopenllm_v2/MATH Level 5": 0.4162, + "hfopenllm_v2/GPQA": 0.3641, + "hfopenllm_v2/MUSR": 0.4407, + "hfopenllm_v2/MMLU-PRO": 0.5181 + } + }, + { + "id": "qingy2024/QwEnlarge-16B-Instruct", + "name": "QwEnlarge-16B-Instruct", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7802, + "hfopenllm_v2/BBH": 0.5949, + "hfopenllm_v2/MATH Level 5": 0.46, + "hfopenllm_v2/GPQA": 0.3331, + "hfopenllm_v2/MUSR": 0.4101, + "hfopenllm_v2/MMLU-PRO": 0.4476 + } + }, + { + "id": "qingy2024/QwQ-14B-Math-v0.2", + "name": "QwQ-14B-Math-v0.2", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3391, + "hfopenllm_v2/BBH": 0.5731, + "hfopenllm_v2/MATH Level 5": 0.4811, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.4021, + "hfopenllm_v2/MMLU-PRO": 0.48 + } + }, + { + "id": "qingy2024/Qwarkstar-4B", + "name": "Qwarkstar-4B", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1994, + "hfopenllm_v2/BBH": 0.4015, + "hfopenllm_v2/MATH Level 5": 0.0861, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.4428, + "hfopenllm_v2/MMLU-PRO": 0.2425 + } + }, + { + "id": "qingy2024/Qwarkstar-4B-Instruct-Preview", + "name": "Qwarkstar-4B-Instruct-Preview", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5324, + "hfopenllm_v2/BBH": 0.4358, + "hfopenllm_v2/MATH Level 5": 0.1284, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3896, + "hfopenllm_v2/MMLU-PRO": 0.2502 + } + }, + { + "id": "qingy2024/Qwen2.5-4B", + "name": "Qwen2.5-4B", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2158, + "hfopenllm_v2/BBH": 0.4269, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.461, + "hfopenllm_v2/MMLU-PRO": 0.2525 + } + }, + { + "id": "qingy2024/Qwen2.5-Coder-Draft-1.5B-Instruct", + "name": "Qwen2.5-Coder-Draft-1.5B-Instruct", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4125, + "hfopenllm_v2/BBH": 0.3837, + "hfopenllm_v2/MATH Level 5": 0.1579, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.358, + "hfopenllm_v2/MMLU-PRO": 0.2244 + } + }, + { + "id": "qingy2024/Qwen2.5-Math-14B-Instruct-Alpha", + "name": "Qwen2.5-Math-14B-Instruct-Alpha", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7704, + "hfopenllm_v2/BBH": 0.6465, + "hfopenllm_v2/MATH Level 5": 0.429, + "hfopenllm_v2/GPQA": 0.349, + "hfopenllm_v2/MUSR": 0.4021, + "hfopenllm_v2/MMLU-PRO": 0.4966 + } + }, + { + "id": "qingy2024/Qwen2.5-Math-14B-Instruct-Preview", + "name": "Qwen2.5-Math-14B-Instruct-Preview", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7826, + "hfopenllm_v2/BBH": 0.6294, + "hfopenllm_v2/MATH Level 5": 0.4758, + "hfopenllm_v2/GPQA": 0.3406, + "hfopenllm_v2/MUSR": 0.4115, + "hfopenllm_v2/MMLU-PRO": 0.4993 + } + }, + { + "id": "qingy2024/Qwen2.6-14B-Instruct", + "name": "Qwen2.6-14B-Instruct", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5811, + "hfopenllm_v2/BBH": 0.6394, + "hfopenllm_v2/MATH Level 5": 0.3051, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.4569, + "hfopenllm_v2/MMLU-PRO": 0.5285 + } + }, + { + "id": "qingy2024/Qwen2.6-Math-14B-Instruct", + "name": "Qwen2.6-Math-14B-Instruct", + "developer": "qingy2024", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3862, + "hfopenllm_v2/BBH": 0.6324, + "hfopenllm_v2/MATH Level 5": 0.429, + "hfopenllm_v2/GPQA": 0.37, + "hfopenllm_v2/MUSR": 0.4759, + "hfopenllm_v2/MMLU-PRO": 0.5241 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/qq8933.json b/data/developers/qq8933.json new file mode 100644 index 0000000000000000000000000000000000000000..31cf81dc475087d5768bc0201101502018108d86 --- /dev/null +++ b/data/developers/qq8933.json @@ -0,0 +1,19 @@ +{ + "developer": "qq8933", + "models": [ + { + "id": "qq8933/OpenLongCoT-Base-Gemma2-2B", + "name": "OpenLongCoT-Base-Gemma2-2B", + "developer": "qq8933", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1965, + "hfopenllm_v2/BBH": 0.3106, + "hfopenllm_v2/MATH Level 5": 0.0234, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3222, + "hfopenllm_v2/MMLU-PRO": 0.1316 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/qwen.json b/data/developers/qwen.json new file mode 100644 index 0000000000000000000000000000000000000000..97558c520ad4a20eeac87e7c81dc7120c0e9c64f --- /dev/null +++ b/data/developers/qwen.json @@ -0,0 +1,477 @@ +{ + "developer": "qwen", + "models": [ + { + "id": "qwen/qwen1.5-110b-chat", + "name": "Qwen1.5 Chat 110B", + "developer": "qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.55, + "helm_lite/NarrativeQA": 0.721, + "helm_lite/NaturalQuestions (closed-book)": 0.35, + "helm_lite/OpenbookQA": 0.922, + "helm_lite/MMLU": 0.704, + "helm_lite/MATH": 0.568, + "helm_lite/GSM8K": 0.815, + "helm_lite/LegalBench": 0.624, + "helm_lite/MedQA": 0.64, + "helm_lite/WMT 2014": 0.192, + "helm_mmlu/MMLU All Subjects": 0.768, + "helm_mmlu/Abstract Algebra": 0.57, + "helm_mmlu/Anatomy": 0.696, + "helm_mmlu/College Physics": 0.51, + "helm_mmlu/Computer Security": 0.82, + "helm_mmlu/Econometrics": 0.64, + "helm_mmlu/Global Facts": 0.51, + "helm_mmlu/Jurisprudence": 0.833, + "helm_mmlu/Philosophy": 0.823, + "helm_mmlu/Professional Psychology": 0.82, + "helm_mmlu/Us Foreign Policy": 0.87, + "helm_mmlu/Astronomy": 0.901, + "helm_mmlu/Business Ethics": 0.8, + "helm_mmlu/Clinical Knowledge": 0.766, + "helm_mmlu/Conceptual Physics": 0.838, + "helm_mmlu/Electrical Engineering": 0.752, + "helm_mmlu/Elementary Mathematics": 0.669, + "helm_mmlu/Formal Logic": 0.643, + "helm_mmlu/High School World History": 0.903, + "helm_mmlu/Human Sexuality": 0.855, + "helm_mmlu/International Law": 0.876, + "helm_mmlu/Logical Fallacies": 0.828, + "helm_mmlu/Machine Learning": 0.634, + "helm_mmlu/Management": 0.835, + "helm_mmlu/Marketing": 0.919, + "helm_mmlu/Medical Genetics": 0.85, + "helm_mmlu/Miscellaneous": 0.934, + "helm_mmlu/Moral Scenarios": 0.783, + "helm_mmlu/Nutrition": 0.804, + "helm_mmlu/Prehistory": 0.867, + "helm_mmlu/Public Relations": 0.773, + "helm_mmlu/Security Studies": 0.735, + "helm_mmlu/Sociology": 0.866, + "helm_mmlu/Virology": 0.542, + "helm_mmlu/World Religions": 0.871, + "helm_mmlu/Mean win rate": 0.875 + } + }, + { + "id": "qwen/qwen1.5-14b", + "name": "Qwen1.5 14B", + "developer": "qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.425, + "helm_lite/NarrativeQA": 0.711, + "helm_lite/NaturalQuestions (closed-book)": 0.3, + "helm_lite/OpenbookQA": 0.862, + "helm_lite/MMLU": 0.626, + "helm_lite/MATH": 0.686, + "helm_lite/GSM8K": 0.693, + "helm_lite/LegalBench": 0.593, + "helm_lite/MedQA": 0.515, + "helm_lite/WMT 2014": 0.178, + "helm_mmlu/MMLU All Subjects": 0.686, + "helm_mmlu/Abstract Algebra": 0.4, + "helm_mmlu/Anatomy": 0.637, + "helm_mmlu/College Physics": 0.48, + "helm_mmlu/Computer Security": 0.84, + "helm_mmlu/Econometrics": 0.561, + "helm_mmlu/Global Facts": 0.49, + "helm_mmlu/Jurisprudence": 0.769, + "helm_mmlu/Philosophy": 0.717, + "helm_mmlu/Professional Psychology": 0.699, + "helm_mmlu/Us Foreign Policy": 0.87, + "helm_mmlu/Astronomy": 0.724, + "helm_mmlu/Business Ethics": 0.75, + "helm_mmlu/Clinical Knowledge": 0.736, + "helm_mmlu/Conceptual Physics": 0.694, + "helm_mmlu/Electrical Engineering": 0.683, + "helm_mmlu/Elementary Mathematics": 0.603, + "helm_mmlu/Formal Logic": 0.492, + "helm_mmlu/High School World History": 0.84, + "helm_mmlu/Human Sexuality": 0.756, + "helm_mmlu/International Law": 0.826, + "helm_mmlu/Logical Fallacies": 0.736, + "helm_mmlu/Machine Learning": 0.509, + "helm_mmlu/Management": 0.816, + "helm_mmlu/Marketing": 0.893, + "helm_mmlu/Medical Genetics": 0.76, + "helm_mmlu/Miscellaneous": 0.835, + "helm_mmlu/Moral Scenarios": 0.368, + "helm_mmlu/Nutrition": 0.742, + "helm_mmlu/Prehistory": 0.71, + "helm_mmlu/Public Relations": 0.655, + "helm_mmlu/Security Studies": 0.8, + "helm_mmlu/Sociology": 0.841, + "helm_mmlu/Virology": 0.458, + "helm_mmlu/World Religions": 0.842, + "helm_mmlu/Mean win rate": 0.796 + } + }, + { + "id": "qwen/qwen1.5-32b", + "name": "Qwen1.5 32B", + "developer": "qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.546, + "helm_lite/NarrativeQA": 0.589, + "helm_lite/NaturalQuestions (closed-book)": 0.353, + "helm_lite/OpenbookQA": 0.932, + "helm_lite/MMLU": 0.628, + "helm_lite/MATH": 0.733, + "helm_lite/GSM8K": 0.773, + "helm_lite/LegalBench": 0.636, + "helm_lite/MedQA": 0.656, + "helm_lite/WMT 2014": 0.193, + "helm_mmlu/MMLU All Subjects": 0.744, + "helm_mmlu/Abstract Algebra": 0.4, + "helm_mmlu/Anatomy": 0.644, + "helm_mmlu/College Physics": 0.51, + "helm_mmlu/Computer Security": 0.77, + "helm_mmlu/Econometrics": 0.561, + "helm_mmlu/Global Facts": 0.47, + "helm_mmlu/Jurisprudence": 0.843, + "helm_mmlu/Philosophy": 0.826, + "helm_mmlu/Professional Psychology": 0.75, + "helm_mmlu/Us Foreign Policy": 0.91, + "helm_mmlu/Astronomy": 0.855, + "helm_mmlu/Business Ethics": 0.77, + "helm_mmlu/Clinical Knowledge": 0.781, + "helm_mmlu/Conceptual Physics": 0.766, + "helm_mmlu/Electrical Engineering": 0.731, + "helm_mmlu/Elementary Mathematics": 0.685, + "helm_mmlu/Formal Logic": 0.524, + "helm_mmlu/High School World History": 0.869, + "helm_mmlu/Human Sexuality": 0.847, + "helm_mmlu/International Law": 0.884, + "helm_mmlu/Logical Fallacies": 0.822, + "helm_mmlu/Machine Learning": 0.616, + "helm_mmlu/Management": 0.874, + "helm_mmlu/Marketing": 0.936, + "helm_mmlu/Medical Genetics": 0.85, + "helm_mmlu/Miscellaneous": 0.884, + "helm_mmlu/Moral Scenarios": 0.545, + "helm_mmlu/Nutrition": 0.81, + "helm_mmlu/Prehistory": 0.83, + "helm_mmlu/Public Relations": 0.664, + "helm_mmlu/Security Studies": 0.829, + "helm_mmlu/Sociology": 0.881, + "helm_mmlu/Virology": 0.578, + "helm_mmlu/World Religions": 0.854, + "helm_mmlu/Mean win rate": 0.624 + } + }, + { + "id": "qwen/qwen1.5-72b", + "name": "Qwen1.5 72B", + "developer": "qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.608, + "helm_lite/NarrativeQA": 0.601, + "helm_lite/NaturalQuestions (closed-book)": 0.417, + "helm_lite/OpenbookQA": 0.93, + "helm_lite/MMLU": 0.647, + "helm_lite/MATH": 0.683, + "helm_lite/GSM8K": 0.799, + "helm_lite/LegalBench": 0.694, + "helm_lite/MedQA": 0.67, + "helm_lite/WMT 2014": 0.201, + "helm_mmlu/MMLU All Subjects": 0.774, + "helm_mmlu/Abstract Algebra": 0.44, + "helm_mmlu/Anatomy": 0.733, + "helm_mmlu/College Physics": 0.559, + "helm_mmlu/Computer Security": 0.81, + "helm_mmlu/Econometrics": 0.544, + "helm_mmlu/Global Facts": 0.56, + "helm_mmlu/Jurisprudence": 0.824, + "helm_mmlu/Philosophy": 0.83, + "helm_mmlu/Professional Psychology": 0.809, + "helm_mmlu/Us Foreign Policy": 0.94, + "helm_mmlu/Astronomy": 0.868, + "helm_mmlu/Business Ethics": 0.79, + "helm_mmlu/Clinical Knowledge": 0.834, + "helm_mmlu/Conceptual Physics": 0.821, + "helm_mmlu/Electrical Engineering": 0.779, + "helm_mmlu/Elementary Mathematics": 0.696, + "helm_mmlu/Formal Logic": 0.556, + "helm_mmlu/High School World History": 0.899, + "helm_mmlu/Human Sexuality": 0.878, + "helm_mmlu/International Law": 0.909, + "helm_mmlu/Logical Fallacies": 0.853, + "helm_mmlu/Machine Learning": 0.67, + "helm_mmlu/Management": 0.854, + "helm_mmlu/Marketing": 0.949, + "helm_mmlu/Medical Genetics": 0.87, + "helm_mmlu/Miscellaneous": 0.921, + "helm_mmlu/Moral Scenarios": 0.669, + "helm_mmlu/Nutrition": 0.859, + "helm_mmlu/Prehistory": 0.88, + "helm_mmlu/Public Relations": 0.755, + "helm_mmlu/Security Studies": 0.824, + "helm_mmlu/Sociology": 0.9, + "helm_mmlu/Virology": 0.584, + "helm_mmlu/World Religions": 0.883, + "helm_mmlu/Mean win rate": 0.65 + } + }, + { + "id": "qwen/qwen1.5-7b", + "name": "Qwen1.5 7B", + "developer": "qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.275, + "helm_lite/NarrativeQA": 0.448, + "helm_lite/NaturalQuestions (closed-book)": 0.27, + "helm_lite/OpenbookQA": 0.806, + "helm_lite/MMLU": 0.569, + "helm_lite/MATH": 0.561, + "helm_lite/GSM8K": 0.6, + "helm_lite/LegalBench": 0.523, + "helm_lite/MedQA": 0.479, + "helm_lite/WMT 2014": 0.153, + "helm_mmlu/MMLU All Subjects": 0.626, + "helm_mmlu/Abstract Algebra": 0.39, + "helm_mmlu/Anatomy": 0.526, + "helm_mmlu/College Physics": 0.471, + "helm_mmlu/Computer Security": 0.76, + "helm_mmlu/Econometrics": 0.447, + "helm_mmlu/Global Facts": 0.4, + "helm_mmlu/Jurisprudence": 0.778, + "helm_mmlu/Philosophy": 0.691, + "helm_mmlu/Professional Psychology": 0.603, + "helm_mmlu/Us Foreign Policy": 0.84, + "helm_mmlu/Astronomy": 0.671, + "helm_mmlu/Business Ethics": 0.69, + "helm_mmlu/Clinical Knowledge": 0.691, + "helm_mmlu/Conceptual Physics": 0.579, + "helm_mmlu/Electrical Engineering": 0.572, + "helm_mmlu/Elementary Mathematics": 0.5, + "helm_mmlu/Formal Logic": 0.397, + "helm_mmlu/High School World History": 0.789, + "helm_mmlu/Human Sexuality": 0.695, + "helm_mmlu/International Law": 0.76, + "helm_mmlu/Logical Fallacies": 0.706, + "helm_mmlu/Machine Learning": 0.411, + "helm_mmlu/Management": 0.816, + "helm_mmlu/Marketing": 0.863, + "helm_mmlu/Medical Genetics": 0.69, + "helm_mmlu/Miscellaneous": 0.765, + "helm_mmlu/Moral Scenarios": 0.372, + "helm_mmlu/Nutrition": 0.696, + "helm_mmlu/Prehistory": 0.688, + "helm_mmlu/Public Relations": 0.627, + "helm_mmlu/Security Studies": 0.727, + "helm_mmlu/Sociology": 0.836, + "helm_mmlu/Virology": 0.488, + "helm_mmlu/World Religions": 0.778, + "helm_mmlu/Mean win rate": 0.843 + } + }, + { + "id": "qwen/qwen2-72b-instruct", + "name": "Qwen2 Instruct 72B", + "developer": "qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.77, + "helm_lite/NarrativeQA": 0.727, + "helm_lite/NaturalQuestions (closed-book)": 0.39, + "helm_lite/OpenbookQA": 0.954, + "helm_lite/MMLU": 0.769, + "helm_lite/MATH": 0.79, + "helm_lite/GSM8K": 0.92, + "helm_lite/LegalBench": 0.712, + "helm_lite/MedQA": 0.746, + "helm_lite/WMT 2014": 0.207, + "helm_mmlu/MMLU All Subjects": 0.824, + "helm_mmlu/Abstract Algebra": 0.67, + "helm_mmlu/Anatomy": 0.793, + "helm_mmlu/College Physics": 0.598, + "helm_mmlu/Computer Security": 0.85, + "helm_mmlu/Econometrics": 0.737, + "helm_mmlu/Global Facts": 0.58, + "helm_mmlu/Jurisprudence": 0.87, + "helm_mmlu/Philosophy": 0.859, + "helm_mmlu/Professional Psychology": 0.886, + "helm_mmlu/Us Foreign Policy": 0.94, + "helm_mmlu/Astronomy": 0.934, + "helm_mmlu/Business Ethics": 0.82, + "helm_mmlu/Clinical Knowledge": 0.868, + "helm_mmlu/Conceptual Physics": 0.872, + "helm_mmlu/Electrical Engineering": 0.793, + "helm_mmlu/Elementary Mathematics": 0.825, + "helm_mmlu/Formal Logic": 0.667, + "helm_mmlu/High School World History": 0.932, + "helm_mmlu/Human Sexuality": 0.893, + "helm_mmlu/International Law": 0.893, + "helm_mmlu/Logical Fallacies": 0.914, + "helm_mmlu/Machine Learning": 0.768, + "helm_mmlu/Management": 0.903, + "helm_mmlu/Marketing": 0.953, + "helm_mmlu/Medical Genetics": 0.9, + "helm_mmlu/Miscellaneous": 0.943, + "helm_mmlu/Moral Scenarios": 0.815, + "helm_mmlu/Nutrition": 0.902, + "helm_mmlu/Prehistory": 0.914, + "helm_mmlu/Public Relations": 0.745, + "helm_mmlu/Security Studies": 0.837, + "helm_mmlu/Sociology": 0.935, + "helm_mmlu/Virology": 0.56, + "helm_mmlu/World Religions": 0.848, + "helm_mmlu/Mean win rate": 0.826 + } + }, + { + "id": "qwen/qwen2.5-72b-instruct-turbo", + "name": "Qwen2.5 Instruct Turbo 72B", + "developer": "qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.599, + "helm_capabilities/MMLU-Pro": 0.631, + "helm_capabilities/GPQA": 0.426, + "helm_capabilities/IFEval": 0.806, + "helm_capabilities/WildBench": 0.802, + "helm_capabilities/Omni-MATH": 0.33, + "helm_lite/Mean win rate": 0.745, + "helm_lite/NarrativeQA": 0.745, + "helm_lite/NaturalQuestions (closed-book)": 0.359, + "helm_lite/OpenbookQA": 0.962, + "helm_lite/MMLU": 0.77, + "helm_lite/MATH": 0.884, + "helm_lite/GSM8K": 0.9, + "helm_lite/LegalBench": 0.74, + "helm_lite/MedQA": 0.753, + "helm_lite/WMT 2014": 0.207, + "helm_mmlu/MMLU All Subjects": 0.834, + "helm_mmlu/Abstract Algebra": 0.68, + "helm_mmlu/Anatomy": 0.822, + "helm_mmlu/College Physics": 0.588, + "helm_mmlu/Computer Security": 0.86, + "helm_mmlu/Econometrics": 0.728, + "helm_mmlu/Global Facts": 0.61, + "helm_mmlu/Jurisprudence": 0.87, + "helm_mmlu/Philosophy": 0.839, + "helm_mmlu/Professional Psychology": 0.864, + "helm_mmlu/Us Foreign Policy": 0.96, + "helm_mmlu/Astronomy": 0.934, + "helm_mmlu/Business Ethics": 0.85, + "helm_mmlu/Clinical Knowledge": 0.872, + "helm_mmlu/Conceptual Physics": 0.885, + "helm_mmlu/Electrical Engineering": 0.8, + "helm_mmlu/Elementary Mathematics": 0.87, + "helm_mmlu/Formal Logic": 0.73, + "helm_mmlu/High School World History": 0.92, + "helm_mmlu/Human Sexuality": 0.878, + "helm_mmlu/International Law": 0.893, + "helm_mmlu/Logical Fallacies": 0.89, + "helm_mmlu/Machine Learning": 0.777, + "helm_mmlu/Management": 0.913, + "helm_mmlu/Marketing": 0.953, + "helm_mmlu/Medical Genetics": 0.92, + "helm_mmlu/Miscellaneous": 0.932, + "helm_mmlu/Moral Scenarios": 0.787, + "helm_mmlu/Nutrition": 0.886, + "helm_mmlu/Prehistory": 0.91, + "helm_mmlu/Public Relations": 0.782, + "helm_mmlu/Security Studies": 0.849, + "helm_mmlu/Sociology": 0.925, + "helm_mmlu/Virology": 0.584, + "helm_mmlu/World Religions": 0.901, + "helm_mmlu/Mean win rate": 0.548 + } + }, + { + "id": "qwen/qwen2.5-7b-instruct-turbo", + "name": "Qwen2.5 Instruct Turbo 7B", + "developer": "qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.529, + "helm_capabilities/MMLU-Pro": 0.539, + "helm_capabilities/GPQA": 0.341, + "helm_capabilities/IFEval": 0.741, + "helm_capabilities/WildBench": 0.731, + "helm_capabilities/Omni-MATH": 0.294, + "helm_lite/Mean win rate": 0.488, + "helm_lite/NarrativeQA": 0.742, + "helm_lite/NaturalQuestions (closed-book)": 0.205, + "helm_lite/OpenbookQA": 0.862, + "helm_lite/MMLU": 0.658, + "helm_lite/MATH": 0.835, + "helm_lite/GSM8K": 0.83, + "helm_lite/LegalBench": 0.632, + "helm_lite/MedQA": 0.6, + "helm_lite/WMT 2014": 0.155, + "helm_mmlu/MMLU All Subjects": 0.729, + "helm_mmlu/Abstract Algebra": 0.49, + "helm_mmlu/Anatomy": 0.689, + "helm_mmlu/College Physics": 0.51, + "helm_mmlu/Computer Security": 0.79, + "helm_mmlu/Econometrics": 0.64, + "helm_mmlu/Global Facts": 0.42, + "helm_mmlu/Jurisprudence": 0.796, + "helm_mmlu/Philosophy": 0.746, + "helm_mmlu/Professional Psychology": 0.757, + "helm_mmlu/Us Foreign Policy": 0.86, + "helm_mmlu/Astronomy": 0.836, + "helm_mmlu/Business Ethics": 0.82, + "helm_mmlu/Clinical Knowledge": 0.785, + "helm_mmlu/Conceptual Physics": 0.736, + "helm_mmlu/Electrical Engineering": 0.717, + "helm_mmlu/Elementary Mathematics": 0.643, + "helm_mmlu/Formal Logic": 0.587, + "helm_mmlu/High School World History": 0.878, + "helm_mmlu/Human Sexuality": 0.794, + "helm_mmlu/International Law": 0.86, + "helm_mmlu/Logical Fallacies": 0.773, + "helm_mmlu/Machine Learning": 0.554, + "helm_mmlu/Management": 0.845, + "helm_mmlu/Marketing": 0.919, + "helm_mmlu/Medical Genetics": 0.85, + "helm_mmlu/Miscellaneous": 0.852, + "helm_mmlu/Moral Scenarios": 0.511, + "helm_mmlu/Nutrition": 0.778, + "helm_mmlu/Prehistory": 0.836, + "helm_mmlu/Public Relations": 0.709, + "helm_mmlu/Security Studies": 0.682, + "helm_mmlu/Sociology": 0.861, + "helm_mmlu/Virology": 0.578, + "helm_mmlu/World Religions": 0.83, + "helm_mmlu/Mean win rate": 0.887 + } + }, + { + "id": "qwen/qwen3-235b-a22b-fp8-tput", + "name": "Qwen3 235B A22B FP8 Throughput", + "developer": "qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.726, + "helm_capabilities/MMLU-Pro": 0.817, + "helm_capabilities/GPQA": 0.623, + "helm_capabilities/IFEval": 0.816, + "helm_capabilities/WildBench": 0.828, + "helm_capabilities/Omni-MATH": 0.548 + } + }, + { + "id": "qwen/qwen3-235b-a22b-instruct-2507-fp8", + "name": "Qwen3 235B A22B Instruct 2507 FP8", + "developer": "qwen", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.798, + "helm_capabilities/MMLU-Pro": 0.844, + "helm_capabilities/GPQA": 0.726, + "helm_capabilities/IFEval": 0.835, + "helm_capabilities/WildBench": 0.866, + "helm_capabilities/Omni-MATH": 0.718 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/raphgg.json b/data/developers/raphgg.json new file mode 100644 index 0000000000000000000000000000000000000000..e0cca6a377c18f8c30320729487a6c71178a4345 --- /dev/null +++ b/data/developers/raphgg.json @@ -0,0 +1,19 @@ +{ + "developer": "raphgg", + "models": [ + { + "id": "raphgg/test-2.5-72B", + "name": "test-2.5-72B", + "developer": "raphgg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8437, + "hfopenllm_v2/BBH": 0.7266, + "hfopenllm_v2/MATH Level 5": 0.4109, + "hfopenllm_v2/GPQA": 0.3893, + "hfopenllm_v2/MUSR": 0.4812, + "hfopenllm_v2/MMLU-PRO": 0.5837 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/rasyosef.json b/data/developers/rasyosef.json new file mode 100644 index 0000000000000000000000000000000000000000..a04dad9cab18f27426a8f9116460793f6d035061 --- /dev/null +++ b/data/developers/rasyosef.json @@ -0,0 +1,61 @@ +{ + "developer": "rasyosef", + "models": [ + { + "id": "rasyosef/Mistral-NeMo-Minitron-8B-Chat", + "name": "Mistral-NeMo-Minitron-8B-Chat", + "developer": "rasyosef", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4452, + "hfopenllm_v2/BBH": 0.4759, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.4304, + "hfopenllm_v2/MMLU-PRO": 0.2404 + } + }, + { + "id": "rasyosef/Phi-1_5-Instruct-v0.1", + "name": "Phi-1_5-Instruct-v0.1", + "developer": "rasyosef", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2402, + "hfopenllm_v2/BBH": 0.3118, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3422, + "hfopenllm_v2/MMLU-PRO": 0.1562 + } + }, + { + "id": "rasyosef/phi-2-instruct-apo", + "name": "phi-2-instruct-apo", + "developer": "rasyosef", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3146, + "hfopenllm_v2/BBH": 0.4445, + "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3342, + "hfopenllm_v2/MMLU-PRO": 0.2155 + } + }, + { + "id": "rasyosef/phi-2-instruct-v0.1", + "name": "phi-2-instruct-v0.1", + "developer": "rasyosef", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3681, + "hfopenllm_v2/BBH": 0.4726, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3524, + "hfopenllm_v2/MMLU-PRO": 0.2247 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/realtreetune.json b/data/developers/realtreetune.json new file mode 100644 index 0000000000000000000000000000000000000000..9db7762c41305893f8f3af8c0a1731c6478daa5c --- /dev/null +++ b/data/developers/realtreetune.json @@ -0,0 +1,19 @@ +{ + "developer": "realtreetune", + "models": [ + { + "id": "realtreetune/rho-1b-sft-MATH", + "name": "rho-1b-sft-MATH", + "developer": "realtreetune", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2121, + "hfopenllm_v2/BBH": 0.3144, + "hfopenllm_v2/MATH Level 5": 0.0347, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3458, + "hfopenllm_v2/MMLU-PRO": 0.1117 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/recoilme.json b/data/developers/recoilme.json new file mode 100644 index 0000000000000000000000000000000000000000..862469c1653f0d1414e3f6dabd97ef04b6d4f9e2 --- /dev/null +++ b/data/developers/recoilme.json @@ -0,0 +1,89 @@ +{ + "developer": "recoilme", + "models": [ + { + "id": "recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp", + "name": "Gemma-2-Ataraxy-Gemmasutra-9B-slerp", + "developer": "recoilme", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2854, + "hfopenllm_v2/BBH": 0.5984, + "hfopenllm_v2/MATH Level 5": 0.1005, + "hfopenllm_v2/GPQA": 0.3297, + "hfopenllm_v2/MUSR": 0.4607, + "hfopenllm_v2/MMLU-PRO": 0.4162 + } + }, + { + "id": "recoilme/recoilme-gemma-2-9B-v0.1", + "name": "recoilme-gemma-2-9B-v0.1", + "developer": "recoilme", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7515, + "hfopenllm_v2/BBH": 0.5995, + "hfopenllm_v2/MATH Level 5": 0.2039, + "hfopenllm_v2/GPQA": 0.3389, + "hfopenllm_v2/MUSR": 0.4191, + "hfopenllm_v2/MMLU-PRO": 0.4159 + } + }, + { + "id": "recoilme/recoilme-gemma-2-9B-v0.2", + "name": "recoilme-gemma-2-9B-v0.2", + "developer": "recoilme", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7592, + "hfopenllm_v2/BBH": 0.6026, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4099, + "hfopenllm_v2/MMLU-PRO": 0.4163 + } + }, + { + "id": "recoilme/recoilme-gemma-2-9B-v0.3", + "name": "recoilme-gemma-2-9B-v0.3", + "developer": "recoilme", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5761, + "hfopenllm_v2/BBH": 0.602, + "hfopenllm_v2/MATH Level 5": 0.1888, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.4632, + "hfopenllm_v2/MMLU-PRO": 0.4039 + } + }, + { + "id": "recoilme/recoilme-gemma-2-9B-v0.4", + "name": "recoilme-gemma-2-9B-v0.4", + "developer": "recoilme", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2562, + "hfopenllm_v2/BBH": 0.5967, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.3406, + "hfopenllm_v2/MUSR": 0.4727, + "hfopenllm_v2/MMLU-PRO": 0.4406 + } + }, + { + "id": "recoilme/recoilme-gemma-2-9B-v0.5", + "name": "recoilme-gemma-2-9B-v0.5", + "developer": "recoilme", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7664, + "hfopenllm_v2/BBH": 0.5981, + "hfopenllm_v2/MATH Level 5": 0.2115, + "hfopenllm_v2/GPQA": 0.3364, + "hfopenllm_v2/MUSR": 0.4232, + "hfopenllm_v2/MMLU-PRO": 0.42 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/redrix.json b/data/developers/redrix.json new file mode 100644 index 0000000000000000000000000000000000000000..9dc4b1ebd40cd7d1a6293364f9deaf001b148cd4 --- /dev/null +++ b/data/developers/redrix.json @@ -0,0 +1,33 @@ +{ + "developer": "redrix", + "models": [ + { + "id": "redrix/AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS", + "name": "AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS", + "developer": "redrix", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.536, + "hfopenllm_v2/BBH": 0.5129, + "hfopenllm_v2/MATH Level 5": 0.1133, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.3818, + "hfopenllm_v2/MMLU-PRO": 0.318 + } + }, + { + "id": "redrix/patricide-12B-Unslop-Mell", + "name": "patricide-12B-Unslop-Mell", + "developer": "redrix", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4074, + "hfopenllm_v2/BBH": 0.5399, + "hfopenllm_v2/MATH Level 5": 0.1314, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4026, + "hfopenllm_v2/MMLU-PRO": 0.357 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/refuelai.json b/data/developers/refuelai.json new file mode 100644 index 0000000000000000000000000000000000000000..ebeaea95c700a74e10fbc9648a1d22c0b467313c --- /dev/null +++ b/data/developers/refuelai.json @@ -0,0 +1,19 @@ +{ + "developer": "refuelai", + "models": [ + { + "id": "refuelai/Llama-3-Refueled", + "name": "Llama-3-Refueled", + "developer": "refuelai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.462, + "hfopenllm_v2/BBH": 0.5871, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4454, + "hfopenllm_v2/MMLU-PRO": 0.3095 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/rhplus0831.json b/data/developers/rhplus0831.json new file mode 100644 index 0000000000000000000000000000000000000000..3ec2e804f5d7f0cbcdbc1f85b0968eeaaa6c7709 --- /dev/null +++ b/data/developers/rhplus0831.json @@ -0,0 +1,19 @@ +{ + "developer": "rhplus0831", + "models": [ + { + "id": "rhplus0831/maid-yuzu-v7", + "name": "maid-yuzu-v7", + "developer": "rhplus0831", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6462, + "hfopenllm_v2/BBH": 0.4805, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4136, + "hfopenllm_v2/MMLU-PRO": 0.354 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/rhymes-ai.json b/data/developers/rhymes-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa17557e6f865ccf6660d0f4b2846647fdc7ddc --- /dev/null +++ b/data/developers/rhymes-ai.json @@ -0,0 +1,19 @@ +{ + "developer": "rhymes-ai", + "models": [ + { + "id": "rhymes-ai/Aria", + "name": "Aria", + "developer": "rhymes-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4773, + "hfopenllm_v2/BBH": 0.5695, + "hfopenllm_v2/MATH Level 5": 0.1934, + "hfopenllm_v2/GPQA": 0.3624, + "hfopenllm_v2/MUSR": 0.4338, + "hfopenllm_v2/MMLU-PRO": 0.4405 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/rhysjones.json b/data/developers/rhysjones.json new file mode 100644 index 0000000000000000000000000000000000000000..fae66ec44b4b65528290d286c26a405e0c49ba6e --- /dev/null +++ b/data/developers/rhysjones.json @@ -0,0 +1,19 @@ +{ + "developer": "rhysjones", + "models": [ + { + "id": "rhysjones/phi-2-orange-v2", + "name": "phi-2-orange-v2", + "developer": "rhysjones", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.367, + "hfopenllm_v2/BBH": 0.477, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.363, + "hfopenllm_v2/MMLU-PRO": 0.2532 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/riaz.json b/data/developers/riaz.json new file mode 100644 index 0000000000000000000000000000000000000000..342d5654379425c9866753c909eb0413c18e5c77 --- /dev/null +++ b/data/developers/riaz.json @@ -0,0 +1,19 @@ +{ + "developer": "riaz", + "models": [ + { + "id": "riaz/FineLlama-3.1-8B", + "name": "FineLlama-3.1-8B", + "developer": "riaz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4137, + "hfopenllm_v2/BBH": 0.4565, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3776, + "hfopenllm_v2/MMLU-PRO": 0.2978 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/rmdhirr.json b/data/developers/rmdhirr.json new file mode 100644 index 0000000000000000000000000000000000000000..509e60dd2a8e7570dc1b1d5079abbc23cd489a5a --- /dev/null +++ b/data/developers/rmdhirr.json @@ -0,0 +1,19 @@ +{ + "developer": "rmdhirr", + "models": [ + { + "id": "rmdhirr/Gluon-8B", + "name": "Gluon-8B", + "developer": "rmdhirr", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5053, + "hfopenllm_v2/BBH": 0.5153, + "hfopenllm_v2/MATH Level 5": 0.1443, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4039, + "hfopenllm_v2/MMLU-PRO": 0.3808 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/rombodawg.json b/data/developers/rombodawg.json new file mode 100644 index 0000000000000000000000000000000000000000..e7b347a91d5be7f4c52661e255a50c99caa3a3fd --- /dev/null +++ b/data/developers/rombodawg.json @@ -0,0 +1,201 @@ +{ + "developer": "rombodawg", + "models": [ + { + "id": "rombodawg/Rombos-Coder-V2.5-Qwen-14b", + "name": "Rombos-Coder-V2.5-Qwen-14b", + "developer": "rombodawg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7047, + "hfopenllm_v2/BBH": 0.6165, + "hfopenllm_v2/MATH Level 5": 0.3301, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.3915, + "hfopenllm_v2/MMLU-PRO": 0.3939 + } + }, + { + "id": "rombodawg/Rombos-Coder-V2.5-Qwen-7b", + "name": "Rombos-Coder-V2.5-Qwen-7b", + "developer": "rombodawg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.621, + "hfopenllm_v2/BBH": 0.5077, + "hfopenllm_v2/MATH Level 5": 0.3338, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3979, + "hfopenllm_v2/MMLU-PRO": 0.3398 + } + }, + { + "id": "rombodawg/Rombos-LLM-V2.5-Qwen-0.5b", + "name": "Rombos-LLM-V2.5-Qwen-0.5b", + "developer": "rombodawg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2847, + "hfopenllm_v2/BBH": 0.3294, + "hfopenllm_v2/MATH Level 5": 0.068, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3236, + "hfopenllm_v2/MMLU-PRO": 0.1866 + } + }, + { + "id": "rombodawg/Rombos-LLM-V2.5-Qwen-1.5b", + "name": "Rombos-LLM-V2.5-Qwen-1.5b", + "developer": "rombodawg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3402, + "hfopenllm_v2/BBH": 0.4257, + "hfopenllm_v2/MATH Level 5": 0.0853, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4186, + "hfopenllm_v2/MMLU-PRO": 0.2922 + } + }, + { + "id": "rombodawg/Rombos-LLM-V2.5-Qwen-14b", + "name": "Rombos-LLM-V2.5-Qwen-14b", + "developer": "rombodawg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.584, + "hfopenllm_v2/BBH": 0.6481, + "hfopenllm_v2/MATH Level 5": 0.4554, + "hfopenllm_v2/GPQA": 0.3716, + "hfopenllm_v2/MUSR": 0.4717, + "hfopenllm_v2/MMLU-PRO": 0.5376 + } + }, + { + "id": "rombodawg/Rombos-LLM-V2.5-Qwen-32b", + "name": "Rombos-LLM-V2.5-Qwen-32b", + "developer": "rombodawg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6827, + "hfopenllm_v2/BBH": 0.7046, + "hfopenllm_v2/MATH Level 5": 0.4955, + "hfopenllm_v2/GPQA": 0.3968, + "hfopenllm_v2/MUSR": 0.5034, + "hfopenllm_v2/MMLU-PRO": 0.5916 + } + }, + { + "id": "rombodawg/Rombos-LLM-V2.5-Qwen-3b", + "name": "Rombos-LLM-V2.5-Qwen-3b", + "developer": "rombodawg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5342, + "hfopenllm_v2/BBH": 0.4809, + "hfopenllm_v2/MATH Level 5": 0.2795, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4042, + "hfopenllm_v2/MMLU-PRO": 0.3761 + } + }, + { + "id": "rombodawg/Rombos-LLM-V2.5-Qwen-72b", + "name": "Rombos-LLM-V2.5-Qwen-72b", + "developer": "rombodawg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7155, + "hfopenllm_v2/BBH": 0.723, + "hfopenllm_v2/MATH Level 5": 0.5423, + "hfopenllm_v2/GPQA": 0.3985, + "hfopenllm_v2/MUSR": 0.4599, + "hfopenllm_v2/MMLU-PRO": 0.5935 + } + }, + { + "id": "rombodawg/Rombos-LLM-V2.5-Qwen-7b", + "name": "Rombos-LLM-V2.5-Qwen-7b", + "developer": "rombodawg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6237, + "hfopenllm_v2/BBH": 0.5544, + "hfopenllm_v2/MATH Level 5": 0.3814, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.4291, + "hfopenllm_v2/MMLU-PRO": 0.4469 + } + }, + { + "id": "rombodawg/Rombos-LLM-V2.5.1-Qwen-3b", + "name": "Rombos-LLM-V2.5.1-Qwen-3b", + "developer": "rombodawg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2566, + "hfopenllm_v2/BBH": 0.39, + "hfopenllm_v2/MATH Level 5": 0.1208, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3991, + "hfopenllm_v2/MMLU-PRO": 0.2741 + } + }, + { + "id": "rombodawg/Rombos-LLM-V2.6-Nemotron-70b", + "name": "Rombos-LLM-V2.6-Nemotron-70b", + "developer": "rombodawg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7527, + "hfopenllm_v2/BBH": 0.6938, + "hfopenllm_v2/MATH Level 5": 0.3331, + "hfopenllm_v2/GPQA": 0.406, + "hfopenllm_v2/MUSR": 0.4669, + "hfopenllm_v2/MMLU-PRO": 0.5329 + } + }, + { + "id": "rombodawg/Rombos-LLM-V2.6-Qwen-14b", + "name": "Rombos-LLM-V2.6-Qwen-14b", + "developer": "rombodawg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8432, + "hfopenllm_v2/BBH": 0.6442, + "hfopenllm_v2/MATH Level 5": 0.5211, + "hfopenllm_v2/GPQA": 0.3339, + "hfopenllm_v2/MUSR": 0.4221, + "hfopenllm_v2/MMLU-PRO": 0.4961 + } + }, + { + "id": "rombodawg/rombos_Replete-Coder-Instruct-8b-Merged", + "name": "rombos_Replete-Coder-Instruct-8b-Merged", + "developer": "rombodawg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5388, + "hfopenllm_v2/BBH": 0.4462, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.366, + "hfopenllm_v2/MMLU-PRO": 0.1809 + } + }, + { + "id": "rombodawg/rombos_Replete-Coder-Llama3-8B", + "name": "rombos_Replete-Coder-Llama3-8B", + "developer": "rombodawg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4714, + "hfopenllm_v2/BBH": 0.3276, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3966, + "hfopenllm_v2/MMLU-PRO": 0.1335 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/rootxhacker.json b/data/developers/rootxhacker.json new file mode 100644 index 0000000000000000000000000000000000000000..85f94d4a69b82ff9515c80f0f5046994a47e78d4 --- /dev/null +++ b/data/developers/rootxhacker.json @@ -0,0 +1,47 @@ +{ + "developer": "rootxhacker", + "models": [ + { + "id": "rootxhacker/Apollo-70B", + "name": "Apollo-70B", + "developer": "rootxhacker", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5099, + "hfopenllm_v2/BBH": 0.6804, + "hfopenllm_v2/MATH Level 5": 0.5612, + "hfopenllm_v2/GPQA": 0.4572, + "hfopenllm_v2/MUSR": 0.4948, + "hfopenllm_v2/MMLU-PRO": 0.5279 + } + }, + { + "id": "rootxhacker/Apollo_v2-32B", + "name": "Apollo_v2-32B", + "developer": "rootxhacker", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.428, + "hfopenllm_v2/BBH": 0.7072, + "hfopenllm_v2/MATH Level 5": 0.4275, + "hfopenllm_v2/GPQA": 0.3784, + "hfopenllm_v2/MUSR": 0.4994, + "hfopenllm_v2/MMLU-PRO": 0.5869 + } + }, + { + "id": "rootxhacker/apollo-7B", + "name": "apollo-7B", + "developer": "rootxhacker", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2953, + "hfopenllm_v2/BBH": 0.3636, + "hfopenllm_v2/MATH Level 5": 0.0257, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.4131, + "hfopenllm_v2/MMLU-PRO": 0.1748 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/rsh345.json b/data/developers/rsh345.json new file mode 100644 index 0000000000000000000000000000000000000000..4882bfdab00aa1707b38ac6f21aaebfc93e138b7 --- /dev/null +++ b/data/developers/rsh345.json @@ -0,0 +1,19 @@ +{ + "developer": "rsh345", + "models": [ + { + "id": "rsh345/mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B", + "name": "mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B", + "developer": "rsh345", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3892, + "hfopenllm_v2/BBH": 0.5188, + "hfopenllm_v2/MATH Level 5": 0.0733, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.4672, + "hfopenllm_v2/MMLU-PRO": 0.3054 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/rubenroy.json b/data/developers/rubenroy.json new file mode 100644 index 0000000000000000000000000000000000000000..ad954ec8ab7d387b8b13601760cb5a1d671314ad --- /dev/null +++ b/data/developers/rubenroy.json @@ -0,0 +1,47 @@ +{ + "developer": "rubenroy", + "models": [ + { + "id": "rubenroy/Geneva-12B-GCv2-5m", + "name": "Geneva-12B-GCv2-5m", + "developer": "rubenroy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2586, + "hfopenllm_v2/BBH": 0.5278, + "hfopenllm_v2/MATH Level 5": 0.0801, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3525, + "hfopenllm_v2/MMLU-PRO": 0.325 + } + }, + { + "id": "rubenroy/Gilgamesh-72B", + "name": "Gilgamesh-72B", + "developer": "rubenroy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8486, + "hfopenllm_v2/BBH": 0.7253, + "hfopenllm_v2/MATH Level 5": 0.4381, + "hfopenllm_v2/GPQA": 0.3943, + "hfopenllm_v2/MUSR": 0.4626, + "hfopenllm_v2/MMLU-PRO": 0.5802 + } + }, + { + "id": "rubenroy/Zurich-14B-GCv2-5m", + "name": "Zurich-14B-GCv2-5m", + "developer": "rubenroy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6164, + "hfopenllm_v2/BBH": 0.6308, + "hfopenllm_v2/MATH Level 5": 0.3074, + "hfopenllm_v2/GPQA": 0.3616, + "hfopenllm_v2/MUSR": 0.4874, + "hfopenllm_v2/MMLU-PRO": 0.5233 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ruizhe1217.json b/data/developers/ruizhe1217.json new file mode 100644 index 0000000000000000000000000000000000000000..e9a768054ffd872c8248ceed6cd26b3e2250c26f --- /dev/null +++ b/data/developers/ruizhe1217.json @@ -0,0 +1,19 @@ +{ + "developer": "ruizhe1217", + "models": [ + { + "id": "ruizhe1217/sft-s1-qwen-0.5b", + "name": "sft-s1-qwen-0.5b", + "developer": "ruizhe1217", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2749, + "hfopenllm_v2/BBH": 0.3301, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3196, + "hfopenllm_v2/MMLU-PRO": 0.1892 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/rwitz.json b/data/developers/rwitz.json new file mode 100644 index 0000000000000000000000000000000000000000..70890afac405774e5d2d9ef4c3d2094f2a227a5c --- /dev/null +++ b/data/developers/rwitz.json @@ -0,0 +1,19 @@ +{ + "developer": "rwitz", + "models": [ + { + "id": "rwitz/go-bruins-v2", + "name": "go-bruins-v2", + "developer": "rwitz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4096, + "hfopenllm_v2/BBH": 0.3799, + "hfopenllm_v2/MATH Level 5": 0.0672, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.4138, + "hfopenllm_v2/MMLU-PRO": 0.2761 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sabersaleh.json b/data/developers/sabersaleh.json new file mode 100644 index 0000000000000000000000000000000000000000..49625547055886537d31bd3a5d3e3b3a4eda1a78 --- /dev/null +++ b/data/developers/sabersaleh.json @@ -0,0 +1,103 @@ +{ + "developer": "sabersaleh", + "models": [ + { + "id": "sabersaleh/Llama2-7B-CPO", + "name": "Llama2-7B-CPO", + "developer": "sabersaleh", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1545, + "hfopenllm_v2/BBH": 0.3458, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.4048, + "hfopenllm_v2/MMLU-PRO": 0.1606 + } + }, + { + "id": "sabersaleh/Llama2-7B-DPO", + "name": "Llama2-7B-DPO", + "developer": "sabersaleh", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1453, + "hfopenllm_v2/BBH": 0.3512, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.4114, + "hfopenllm_v2/MMLU-PRO": 0.1626 + } + }, + { + "id": "sabersaleh/Llama2-7B-IPO", + "name": "Llama2-7B-IPO", + "developer": "sabersaleh", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1769, + "hfopenllm_v2/BBH": 0.3475, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.4048, + "hfopenllm_v2/MMLU-PRO": 0.1617 + } + }, + { + "id": "sabersaleh/Llama2-7B-KTO", + "name": "Llama2-7B-KTO", + "developer": "sabersaleh", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1528, + "hfopenllm_v2/BBH": 0.3501, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.4167, + "hfopenllm_v2/MMLU-PRO": 0.1636 + } + }, + { + "id": "sabersaleh/Llama2-7B-SPO", + "name": "Llama2-7B-SPO", + "developer": "sabersaleh", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1567, + "hfopenllm_v2/BBH": 0.3383, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.3874, + "hfopenllm_v2/MMLU-PRO": 0.1757 + } + }, + { + "id": "sabersaleh/Llama2-7B-SimPO", + "name": "Llama2-7B-SimPO", + "developer": "sabersaleh", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1659, + "hfopenllm_v2/BBH": 0.3489, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.4007, + "hfopenllm_v2/MMLU-PRO": 0.1641 + } + }, + { + "id": "sabersaleh/Llama3", + "name": "Llama3", + "developer": "sabersaleh", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3321, + "hfopenllm_v2/BBH": 0.4782, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.3933, + "hfopenllm_v2/MMLU-PRO": 0.3162 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sabersalehk.json b/data/developers/sabersalehk.json new file mode 100644 index 0000000000000000000000000000000000000000..41e1940ce854636569840b6b1c4642be344c3f39 --- /dev/null +++ b/data/developers/sabersalehk.json @@ -0,0 +1,61 @@ +{ + "developer": "sabersalehk", + "models": [ + { + "id": "sabersalehk/Llama3-001-300", + "name": "Llama3-001-300", + "developer": "sabersalehk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3179, + "hfopenllm_v2/BBH": 0.4745, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4064, + "hfopenllm_v2/MMLU-PRO": 0.3158 + } + }, + { + "id": "sabersalehk/Llama3-SimPO", + "name": "Llama3-SimPO", + "developer": "sabersalehk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3642, + "hfopenllm_v2/BBH": 0.4874, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4046, + "hfopenllm_v2/MMLU-PRO": 0.3157 + } + }, + { + "id": "sabersalehk/Llama3_001_200", + "name": "Llama3_001_200", + "developer": "sabersalehk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3218, + "hfopenllm_v2/BBH": 0.4728, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4037, + "hfopenllm_v2/MMLU-PRO": 0.3183 + } + }, + { + "id": "sabersalehk/Llama3_01_300", + "name": "Llama3_01_300", + "developer": "sabersalehk", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2959, + "hfopenllm_v2/BBH": 0.4691, + "hfopenllm_v2/MATH Level 5": 0.0498, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4065, + "hfopenllm_v2/MMLU-PRO": 0.3124 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/saishf.json b/data/developers/saishf.json new file mode 100644 index 0000000000000000000000000000000000000000..213fdc9814d2a3044881c868cc722095c8b5929b --- /dev/null +++ b/data/developers/saishf.json @@ -0,0 +1,33 @@ +{ + "developer": "saishf", + "models": [ + { + "id": "saishf/Fimbulvetr-Kuro-Lotus-10.7B", + "name": "Fimbulvetr-Kuro-Lotus-10.7B", + "developer": "saishf", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4939, + "hfopenllm_v2/BBH": 0.4342, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4445, + "hfopenllm_v2/MMLU-PRO": 0.3389 + } + }, + { + "id": "saishf/Neural-SOVLish-Devil-8B-L3", + "name": "Neural-SOVLish-Devil-8B-L3", + "developer": "saishf", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4199, + "hfopenllm_v2/BBH": 0.5142, + "hfopenllm_v2/MATH Level 5": 0.0891, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.411, + "hfopenllm_v2/MMLU-PRO": 0.3807 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/saishshinde15.json b/data/developers/saishshinde15.json new file mode 100644 index 0000000000000000000000000000000000000000..9447e0f64fedf17428662c5533aa3c413d08be90 --- /dev/null +++ b/data/developers/saishshinde15.json @@ -0,0 +1,47 @@ +{ + "developer": "saishshinde15", + "models": [ + { + "id": "saishshinde15/TethysAI_Base_Reasoning", + "name": "TethysAI_Base_Reasoning", + "developer": "saishshinde15", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6369, + "hfopenllm_v2/BBH": 0.4519, + "hfopenllm_v2/MATH Level 5": 0.3142, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4075, + "hfopenllm_v2/MMLU-PRO": 0.3236 + } + }, + { + "id": "saishshinde15/TethysAI_Vortex", + "name": "TethysAI_Vortex", + "developer": "saishshinde15", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4298, + "hfopenllm_v2/BBH": 0.4749, + "hfopenllm_v2/MATH Level 5": 0.315, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4458, + "hfopenllm_v2/MMLU-PRO": 0.3241 + } + }, + { + "id": "saishshinde15/TethysAI_Vortex_Reasoning", + "name": "TethysAI_Vortex_Reasoning", + "developer": "saishshinde15", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4021, + "hfopenllm_v2/BBH": 0.4694, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4084, + "hfopenllm_v2/MMLU-PRO": 0.3381 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sakaltcommunity.json b/data/developers/sakaltcommunity.json new file mode 100644 index 0000000000000000000000000000000000000000..2286491f7e40f5bfd7b8b0c7e94e931b1b8ac15f --- /dev/null +++ b/data/developers/sakaltcommunity.json @@ -0,0 +1,33 @@ +{ + "developer": "sakaltcommunity", + "models": [ + { + "id": "sakaltcommunity/novablast-preview", + "name": "novablast-preview", + "developer": "sakaltcommunity", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.453, + "hfopenllm_v2/BBH": 0.7043, + "hfopenllm_v2/MATH Level 5": 0.4894, + "hfopenllm_v2/GPQA": 0.3817, + "hfopenllm_v2/MUSR": 0.5021, + "hfopenllm_v2/MMLU-PRO": 0.5915 + } + }, + { + "id": "sakaltcommunity/sakaltum-7b", + "name": "sakaltum-7b", + "developer": "sakaltcommunity", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2604, + "hfopenllm_v2/BBH": 0.4575, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3775, + "hfopenllm_v2/MMLU-PRO": 0.2769 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sakhan10.json b/data/developers/sakhan10.json new file mode 100644 index 0000000000000000000000000000000000000000..b12902c83010dbcb46cf75c5d23653f21f4afc03 --- /dev/null +++ b/data/developers/sakhan10.json @@ -0,0 +1,19 @@ +{ + "developer": "sakhan10", + "models": [ + { + "id": "sakhan10/quantized_open_llama_3b_v2", + "name": "quantized_open_llama_3b_v2", + "developer": "sakhan10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1872, + "hfopenllm_v2/BBH": 0.302, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.3682, + "hfopenllm_v2/MMLU-PRO": 0.1095 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/saltlux.json b/data/developers/saltlux.json new file mode 100644 index 0000000000000000000000000000000000000000..24babe4e487ab3e00fc37985120480dceba70b97 --- /dev/null +++ b/data/developers/saltlux.json @@ -0,0 +1,33 @@ +{ + "developer": "saltlux", + "models": [ + { + "id": "saltlux/luxia-21.4b-alignment-v1.0", + "name": "luxia-21.4b-alignment-v1.0", + "developer": "saltlux", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3693, + "hfopenllm_v2/BBH": 0.6373, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.4328, + "hfopenllm_v2/MMLU-PRO": 0.3403 + } + }, + { + "id": "saltlux/luxia-21.4b-alignment-v1.2", + "name": "luxia-21.4b-alignment-v1.2", + "developer": "saltlux", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4115, + "hfopenllm_v2/BBH": 0.6371, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4459, + "hfopenllm_v2/MMLU-PRO": 0.3473 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sam-paech.json b/data/developers/sam-paech.json new file mode 100644 index 0000000000000000000000000000000000000000..709c4d51b66e2e5651f027f2da2034940aba8b82 --- /dev/null +++ b/data/developers/sam-paech.json @@ -0,0 +1,47 @@ +{ + "developer": "sam-paech", + "models": [ + { + "id": "sam-paech/Darkest-muse-v1", + "name": "Darkest-muse-v1", + "developer": "sam-paech", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7344, + "hfopenllm_v2/BBH": 0.5968, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.4502, + "hfopenllm_v2/MMLU-PRO": 0.4184 + } + }, + { + "id": "sam-paech/Delirium-v1", + "name": "Delirium-v1", + "developer": "sam-paech", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7208, + "hfopenllm_v2/BBH": 0.5962, + "hfopenllm_v2/MATH Level 5": 0.2107, + "hfopenllm_v2/GPQA": 0.3431, + "hfopenllm_v2/MUSR": 0.4514, + "hfopenllm_v2/MMLU-PRO": 0.419 + } + }, + { + "id": "sam-paech/Quill-v1", + "name": "Quill-v1", + "developer": "sam-paech", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7122, + "hfopenllm_v2/BBH": 0.5969, + "hfopenllm_v2/MATH Level 5": 0.2122, + "hfopenllm_v2/GPQA": 0.3398, + "hfopenllm_v2/MUSR": 0.4555, + "hfopenllm_v2/MMLU-PRO": 0.4171 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sarvamai.json b/data/developers/sarvamai.json new file mode 100644 index 0000000000000000000000000000000000000000..8984c3337cb73c343c38193daf6552f8d18e1b48 --- /dev/null +++ b/data/developers/sarvamai.json @@ -0,0 +1,19 @@ +{ + "developer": "sarvamai", + "models": [ + { + "id": "sarvamai/OpenHathi-7B-Hi-v0.1-Base", + "name": "OpenHathi-7B-Hi-v0.1-Base", + "developer": "sarvamai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1804, + "hfopenllm_v2/BBH": 0.3354, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3658, + "hfopenllm_v2/MMLU-PRO": 0.1543 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/schnapss.json b/data/developers/schnapss.json new file mode 100644 index 0000000000000000000000000000000000000000..bcd5766199ddd0e9e653dd3e7d48a1f21af08243 --- /dev/null +++ b/data/developers/schnapss.json @@ -0,0 +1,19 @@ +{ + "developer": "schnapss", + "models": [ + { + "id": "schnapss/testmerge-7b", + "name": "testmerge-7b", + "developer": "schnapss", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3922, + "hfopenllm_v2/BBH": 0.5187, + "hfopenllm_v2/MATH Level 5": 0.0687, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4686, + "hfopenllm_v2/MMLU-PRO": 0.306 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sci-m-wang.json b/data/developers/sci-m-wang.json new file mode 100644 index 0000000000000000000000000000000000000000..a6c5a9c80e75b2261628705e430eee8d33854978 --- /dev/null +++ b/data/developers/sci-m-wang.json @@ -0,0 +1,47 @@ +{ + "developer": "sci-m-wang", + "models": [ + { + "id": "sci-m-wang/Mistral-7B-Instruct-sa-v0.1", + "name": "Mistral-7B-Instruct-sa-v0.1", + "developer": "sci-m-wang", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4335, + "hfopenllm_v2/BBH": 0.3273, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.39, + "hfopenllm_v2/MMLU-PRO": 0.2362 + } + }, + { + "id": "sci-m-wang/Phi-3-mini-4k-instruct-sa-v0.1", + "name": "Phi-3-mini-4k-instruct-sa-v0.1", + "developer": "sci-m-wang", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5021, + "hfopenllm_v2/BBH": 0.5502, + "hfopenllm_v2/MATH Level 5": 0.148, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4073, + "hfopenllm_v2/MMLU-PRO": 0.3985 + } + }, + { + "id": "sci-m-wang/deepseek-llm-7b-chat-sa-v0.1", + "name": "deepseek-llm-7b-chat-sa-v0.1", + "developer": "sci-m-wang", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4036, + "hfopenllm_v2/BBH": 0.3718, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.4173, + "hfopenllm_v2/MMLU-PRO": 0.2209 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/securin.json b/data/developers/securin.json new file mode 100644 index 0000000000000000000000000000000000000000..5ff352e96b7128943192bcdd7bbd31ecf10c814b --- /dev/null +++ b/data/developers/securin.json @@ -0,0 +1,19 @@ +{ + "developer": "securin", + "models": [ + { + "id": "securin/Securin-LLM-V2.5-Qwen-1.5B", + "name": "Securin-LLM-V2.5-Qwen-1.5B", + "developer": "securin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1492, + "hfopenllm_v2/BBH": 0.3158, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3606, + "hfopenllm_v2/MMLU-PRO": 0.1615 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/senseable.json b/data/developers/senseable.json new file mode 100644 index 0000000000000000000000000000000000000000..b8ae08e14ae9a18180ae7bfeff29701a4a77156b --- /dev/null +++ b/data/developers/senseable.json @@ -0,0 +1,19 @@ +{ + "developer": "senseable", + "models": [ + { + "id": "senseable/WestLake-7B-v2", + "name": "WestLake-7B-v2", + "developer": "senseable", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4419, + "hfopenllm_v2/BBH": 0.4073, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.3937, + "hfopenllm_v2/MMLU-PRO": 0.2764 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sequelbox.json b/data/developers/sequelbox.json new file mode 100644 index 0000000000000000000000000000000000000000..e20c1d05c1bcbf7d412506136d5b13fef4639618 --- /dev/null +++ b/data/developers/sequelbox.json @@ -0,0 +1,89 @@ +{ + "developer": "sequelbox", + "models": [ + { + "id": "sequelbox/Llama3.1-70B-PlumChat", + "name": "Llama3.1-70B-PlumChat", + "developer": "sequelbox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5616, + "hfopenllm_v2/BBH": 0.6753, + "hfopenllm_v2/MATH Level 5": 0.3029, + "hfopenllm_v2/GPQA": 0.3909, + "hfopenllm_v2/MUSR": 0.4774, + "hfopenllm_v2/MMLU-PRO": 0.5164 + } + }, + { + "id": "sequelbox/Llama3.1-8B-MOTH", + "name": "Llama3.1-8B-MOTH", + "developer": "sequelbox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5245, + "hfopenllm_v2/BBH": 0.4902, + "hfopenllm_v2/MATH Level 5": 0.1216, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3689, + "hfopenllm_v2/MMLU-PRO": 0.3339 + } + }, + { + "id": "sequelbox/Llama3.1-8B-PlumChat", + "name": "Llama3.1-8B-PlumChat", + "developer": "sequelbox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4243, + "hfopenllm_v2/BBH": 0.3873, + "hfopenllm_v2/MATH Level 5": 0.0363, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3755, + "hfopenllm_v2/MMLU-PRO": 0.2127 + } + }, + { + "id": "sequelbox/Llama3.1-8B-PlumCode", + "name": "Llama3.1-8B-PlumCode", + "developer": "sequelbox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2045, + "hfopenllm_v2/BBH": 0.3368, + "hfopenllm_v2/MATH Level 5": 0.0272, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3773, + "hfopenllm_v2/MMLU-PRO": 0.2335 + } + }, + { + "id": "sequelbox/Llama3.1-8B-PlumMath", + "name": "Llama3.1-8B-PlumMath", + "developer": "sequelbox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2242, + "hfopenllm_v2/BBH": 0.4032, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.318, + "hfopenllm_v2/MUSR": 0.3919, + "hfopenllm_v2/MMLU-PRO": 0.2975 + } + }, + { + "id": "sequelbox/gemma-2-9B-MOTH", + "name": "gemma-2-9B-MOTH", + "developer": "sequelbox", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2059, + "hfopenllm_v2/BBH": 0.308, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3409, + "hfopenllm_v2/MMLU-PRO": 0.114 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sethuiyer.json b/data/developers/sethuiyer.json new file mode 100644 index 0000000000000000000000000000000000000000..5039098f4a6a9fd9b7ad65deaa9bcd839ac57a0c --- /dev/null +++ b/data/developers/sethuiyer.json @@ -0,0 +1,89 @@ +{ + "developer": "sethuiyer", + "models": [ + { + "id": "sethuiyer/Llama-3.1-8B-Experimental-1206-Instruct", + "name": "Llama-3.1-8B-Experimental-1206-Instruct", + "developer": "sethuiyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6967, + "hfopenllm_v2/BBH": 0.5104, + "hfopenllm_v2/MATH Level 5": 0.1118, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.3966, + "hfopenllm_v2/MMLU-PRO": 0.3529 + } + }, + { + "id": "sethuiyer/Llama-3.1-8B-Experimental-1208-Instruct", + "name": "Llama-3.1-8B-Experimental-1208-Instruct", + "developer": "sethuiyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.61, + "hfopenllm_v2/BBH": 0.4964, + "hfopenllm_v2/MATH Level 5": 0.0891, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.379, + "hfopenllm_v2/MMLU-PRO": 0.3511 + } + }, + { + "id": "sethuiyer/LlamaZero-3.1-8B-Experimental-1208", + "name": "LlamaZero-3.1-8B-Experimental-1208", + "developer": "sethuiyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6051, + "hfopenllm_v2/BBH": 0.4981, + "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.382, + "hfopenllm_v2/MMLU-PRO": 0.3 + } + }, + { + "id": "sethuiyer/Llamaverse-3.1-8B-Instruct", + "name": "Llamaverse-3.1-8B-Instruct", + "developer": "sethuiyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6185, + "hfopenllm_v2/BBH": 0.5414, + "hfopenllm_v2/MATH Level 5": 0.1858, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.3762, + "hfopenllm_v2/MMLU-PRO": 0.3523 + } + }, + { + "id": "sethuiyer/Llamazing-3.1-8B-Instruct", + "name": "Llamazing-3.1-8B-Instruct", + "developer": "sethuiyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5711, + "hfopenllm_v2/BBH": 0.5291, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.3976, + "hfopenllm_v2/MMLU-PRO": 0.3606 + } + }, + { + "id": "sethuiyer/Qwen2.5-7B-Anvita", + "name": "Qwen2.5-7B-Anvita", + "developer": "sethuiyer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.648, + "hfopenllm_v2/BBH": 0.5466, + "hfopenllm_v2/MATH Level 5": 0.2017, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.4337, + "hfopenllm_v2/MMLU-PRO": 0.4166 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sfairXC.json b/data/developers/sfairXC.json new file mode 100644 index 0000000000000000000000000000000000000000..0f83a5aa819fcf69906563e63fd5c5a7cc1f8ce0 --- /dev/null +++ b/data/developers/sfairXC.json @@ -0,0 +1,24 @@ +{ + "developer": "sfairXC", + "models": [ + { + "id": "sfairXC/FsfairX-LLaMA3-RM-v0.1", + "name": "sfairXC/FsfairX-LLaMA3-RM-v0.1", + "developer": "sfairXC", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8338, + "reward-bench/Factuality": 0.5916, + "reward-bench/Precise IF": 0.4188, + "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.8676, + "reward-bench/Focus": 0.7051, + "reward-bench/Ties": 0.6647, + "reward-bench/Chat": 0.9944, + "reward-bench/Chat Hard": 0.6513, + "reward-bench/Reasoning": 0.8644, + "reward-bench/Prior Sets (0.5 weight)": 0.7492 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/shadowml.json b/data/developers/shadowml.json new file mode 100644 index 0000000000000000000000000000000000000000..1bc11cfd3b8cf26a1a26b53cc430206434b59b47 --- /dev/null +++ b/data/developers/shadowml.json @@ -0,0 +1,33 @@ +{ + "developer": "shadowml", + "models": [ + { + "id": "shadowml/BeagSake-7B", + "name": "BeagSake-7B", + "developer": "shadowml", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5216, + "hfopenllm_v2/BBH": 0.4711, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4124, + "hfopenllm_v2/MMLU-PRO": 0.2585 + } + }, + { + "id": "shadowml/Mixolar-4x7b", + "name": "Mixolar-4x7b", + "developer": "shadowml", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3893, + "hfopenllm_v2/BBH": 0.5216, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4258, + "hfopenllm_v2/MMLU-PRO": 0.3305 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/shastraai.json b/data/developers/shastraai.json new file mode 100644 index 0000000000000000000000000000000000000000..62f077e19bd887791b211081c450b14e52aa6e09 --- /dev/null +++ b/data/developers/shastraai.json @@ -0,0 +1,19 @@ +{ + "developer": "shastraai", + "models": [ + { + "id": "shastraai/Shastra-LLAMA2-Math-Commonsense-SFT", + "name": "Shastra-LLAMA2-Math-Commonsense-SFT", + "developer": "shastraai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3042, + "hfopenllm_v2/BBH": 0.3843, + "hfopenllm_v2/MATH Level 5": 0.0174, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3604, + "hfopenllm_v2/MMLU-PRO": 0.1997 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/shivam9980.json b/data/developers/shivam9980.json new file mode 100644 index 0000000000000000000000000000000000000000..bdf9dcbd88c8b400b1ada1eeb240f667593b0ca6 --- /dev/null +++ b/data/developers/shivam9980.json @@ -0,0 +1,33 @@ +{ + "developer": "shivam9980", + "models": [ + { + "id": "shivam9980/NEPALI-LLM", + "name": "NEPALI-LLM", + "developer": "shivam9980", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0417, + "hfopenllm_v2/BBH": 0.3828, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.4122, + "hfopenllm_v2/MMLU-PRO": 0.2064 + } + }, + { + "id": "shivam9980/mistral-7b-news-cnn-merged", + "name": "mistral-7b-news-cnn-merged", + "developer": "shivam9980", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4634, + "hfopenllm_v2/BBH": 0.3635, + "hfopenllm_v2/MATH Level 5": 0.0189, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4523, + "hfopenllm_v2/MMLU-PRO": 0.2827 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/shivank21.json b/data/developers/shivank21.json new file mode 100644 index 0000000000000000000000000000000000000000..41171d70fe0b0a7b8593970b825e48eb5476a53e --- /dev/null +++ b/data/developers/shivank21.json @@ -0,0 +1,19 @@ +{ + "developer": "shivank21", + "models": [ + { + "id": "shivank21/mistral_dpo_self", + "name": "mistral_dpo_self", + "developer": "shivank21", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3403, + "hfopenllm_v2/BBH": 0.3216, + "hfopenllm_v2/MATH Level 5": 0.0219, + "hfopenllm_v2/GPQA": 0.2408, + "hfopenllm_v2/MUSR": 0.3247, + "hfopenllm_v2/MMLU-PRO": 0.2214 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/shuttleai.json b/data/developers/shuttleai.json new file mode 100644 index 0000000000000000000000000000000000000000..bf42fb42f571fa56667c3be3048193995b18d540 --- /dev/null +++ b/data/developers/shuttleai.json @@ -0,0 +1,19 @@ +{ + "developer": "shuttleai", + "models": [ + { + "id": "shuttleai/shuttle-3", + "name": "shuttle-3", + "developer": "shuttleai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8154, + "hfopenllm_v2/BBH": 0.742, + "hfopenllm_v2/MATH Level 5": 0.46, + "hfopenllm_v2/GPQA": 0.4119, + "hfopenllm_v2/MUSR": 0.4377, + "hfopenllm_v2/MMLU-PRO": 0.5716 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/shyamieee.json b/data/developers/shyamieee.json new file mode 100644 index 0000000000000000000000000000000000000000..0cdd6010e858d13672f44add131ae67503b8821d --- /dev/null +++ b/data/developers/shyamieee.json @@ -0,0 +1,19 @@ +{ + "developer": "shyamieee", + "models": [ + { + "id": "shyamieee/Padma-v7.0", + "name": "Padma-v7.0", + "developer": "shyamieee", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3841, + "hfopenllm_v2/BBH": 0.5119, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.4386, + "hfopenllm_v2/MMLU-PRO": 0.3029 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/silma-ai.json b/data/developers/silma-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..c223d015a0a261f76d652d17c9574ec7fa11716a --- /dev/null +++ b/data/developers/silma-ai.json @@ -0,0 +1,33 @@ +{ + "developer": "silma-ai", + "models": [ + { + "id": "silma-ai/SILMA-9B-Instruct-v1.0", + "name": "SILMA-9B-Instruct-v1.0", + "developer": "silma-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5842, + "hfopenllm_v2/BBH": 0.5219, + "hfopenllm_v2/MATH Level 5": 0.1163, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.4637, + "hfopenllm_v2/MMLU-PRO": 0.392 + } + }, + { + "id": "silma-ai/SILMA-Kashif-2B-Instruct-v1.0", + "name": "SILMA-Kashif-2B-Instruct-v1.0", + "developer": "silma-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1181, + "hfopenllm_v2/BBH": 0.3793, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.4043, + "hfopenllm_v2/MMLU-PRO": 0.2258 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/siqi00.json b/data/developers/siqi00.json new file mode 100644 index 0000000000000000000000000000000000000000..6129b58a40776dc03eb6e76ca56d3ab5f56ae29d --- /dev/null +++ b/data/developers/siqi00.json @@ -0,0 +1,33 @@ +{ + "developer": "siqi00", + "models": [ + { + "id": "siqi00/Mistral-7B-DFT", + "name": "Mistral-7B-DFT", + "developer": "siqi00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5569, + "hfopenllm_v2/BBH": 0.4665, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4191, + "hfopenllm_v2/MMLU-PRO": 0.2963 + } + }, + { + "id": "siqi00/Mistral-7B-DFT2", + "name": "Mistral-7B-DFT2", + "developer": "siqi00", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5804, + "hfopenllm_v2/BBH": 0.3968, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4401, + "hfopenllm_v2/MMLU-PRO": 0.2852 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/skumar9.json b/data/developers/skumar9.json new file mode 100644 index 0000000000000000000000000000000000000000..16cb68038507902f71851df1a6b09705dad757dd --- /dev/null +++ b/data/developers/skumar9.json @@ -0,0 +1,19 @@ +{ + "developer": "skumar9", + "models": [ + { + "id": "skumar9/Llama-medx_v2", + "name": "Llama-medx_v2", + "developer": "skumar9", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4462, + "hfopenllm_v2/BBH": 0.4909, + "hfopenllm_v2/MATH Level 5": 0.0914, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.3661, + "hfopenllm_v2/MMLU-PRO": 0.3463 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/skymizer.json b/data/developers/skymizer.json new file mode 100644 index 0000000000000000000000000000000000000000..2410cff2b8d9ce2e8486e2a7c862f0517c474dbf --- /dev/null +++ b/data/developers/skymizer.json @@ -0,0 +1,19 @@ +{ + "developer": "skymizer", + "models": [ + { + "id": "skymizer/Llama2-7b-sft-chat-custom-template-dpo", + "name": "Llama2-7b-sft-chat-custom-template-dpo", + "developer": "skymizer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2353, + "hfopenllm_v2/BBH": 0.3688, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2391, + "hfopenllm_v2/MUSR": 0.4429, + "hfopenllm_v2/MMLU-PRO": 0.1946 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/snowflake.json b/data/developers/snowflake.json new file mode 100644 index 0000000000000000000000000000000000000000..d36a86d899a25b9271cd05a105d08de1ef5c396f --- /dev/null +++ b/data/developers/snowflake.json @@ -0,0 +1,59 @@ +{ + "developer": "snowflake", + "models": [ + { + "id": "snowflake/snowflake-arctic-instruct", + "name": "Arctic Instruct", + "developer": "snowflake", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.338, + "helm_lite/NarrativeQA": 0.654, + "helm_lite/NaturalQuestions (closed-book)": 0.39, + "helm_lite/OpenbookQA": 0.828, + "helm_lite/MMLU": 0.575, + "helm_lite/MATH": 0.519, + "helm_lite/GSM8K": 0.768, + "helm_lite/LegalBench": 0.588, + "helm_lite/MedQA": 0.581, + "helm_lite/WMT 2014": 0.172, + "helm_mmlu/MMLU All Subjects": 0.677, + "helm_mmlu/Abstract Algebra": 0.35, + "helm_mmlu/Anatomy": 0.652, + "helm_mmlu/College Physics": 0.461, + "helm_mmlu/Computer Security": 0.84, + "helm_mmlu/Econometrics": 0.5, + "helm_mmlu/Global Facts": 0.39, + "helm_mmlu/Jurisprudence": 0.741, + "helm_mmlu/Philosophy": 0.752, + "helm_mmlu/Professional Psychology": 0.724, + "helm_mmlu/Us Foreign Policy": 0.88, + "helm_mmlu/Astronomy": 0.763, + "helm_mmlu/Business Ethics": 0.69, + "helm_mmlu/Clinical Knowledge": 0.781, + "helm_mmlu/Conceptual Physics": 0.634, + "helm_mmlu/Electrical Engineering": 0.662, + "helm_mmlu/Elementary Mathematics": 0.481, + "helm_mmlu/Formal Logic": 0.444, + "helm_mmlu/High School World History": 0.827, + "helm_mmlu/Human Sexuality": 0.847, + "helm_mmlu/International Law": 0.826, + "helm_mmlu/Logical Fallacies": 0.779, + "helm_mmlu/Machine Learning": 0.473, + "helm_mmlu/Management": 0.796, + "helm_mmlu/Marketing": 0.902, + "helm_mmlu/Medical Genetics": 0.76, + "helm_mmlu/Miscellaneous": 0.875, + "helm_mmlu/Moral Scenarios": 0.28, + "helm_mmlu/Nutrition": 0.725, + "helm_mmlu/Prehistory": 0.79, + "helm_mmlu/Public Relations": 0.664, + "helm_mmlu/Security Studies": 0.78, + "helm_mmlu/Sociology": 0.891, + "helm_mmlu/Virology": 0.536, + "helm_mmlu/World Religions": 0.854, + "helm_mmlu/Mean win rate": 0.565 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/someon98.json b/data/developers/someon98.json new file mode 100644 index 0000000000000000000000000000000000000000..ae9aa961e9a6fc0b2ae82aec18c44097dd97e780 --- /dev/null +++ b/data/developers/someon98.json @@ -0,0 +1,19 @@ +{ + "developer": "someon98", + "models": [ + { + "id": "someon98/qwen-CoMa-0.5b", + "name": "qwen-CoMa-0.5b", + "developer": "someon98", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2277, + "hfopenllm_v2/BBH": 0.2953, + "hfopenllm_v2/MATH Level 5": 0.0045, + "hfopenllm_v2/GPQA": 0.2399, + "hfopenllm_v2/MUSR": 0.4046, + "hfopenllm_v2/MMLU-PRO": 0.1099 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sometimesanotion.json b/data/developers/sometimesanotion.json new file mode 100644 index 0000000000000000000000000000000000000000..2873e9464f191e6c97951bfbde0b20afc70e76a1 --- /dev/null +++ b/data/developers/sometimesanotion.json @@ -0,0 +1,817 @@ +{ + "developer": "sometimesanotion", + "models": [ + { + "id": "sometimesanotion/ChocoTrio-14B-v1", + "name": "ChocoTrio-14B-v1", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7089, + "hfopenllm_v2/BBH": 0.6506, + "hfopenllm_v2/MATH Level 5": 0.3973, + "hfopenllm_v2/GPQA": 0.3851, + "hfopenllm_v2/MUSR": 0.4821, + "hfopenllm_v2/MMLU-PRO": 0.537 + } + }, + { + "id": "sometimesanotion/IF-reasoning-experiment-40", + "name": "IF-reasoning-experiment-40", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.633, + "hfopenllm_v2/BBH": 0.6112, + "hfopenllm_v2/MATH Level 5": 0.3716, + "hfopenllm_v2/GPQA": 0.38, + "hfopenllm_v2/MUSR": 0.5194, + "hfopenllm_v2/MMLU-PRO": 0.5025 + } + }, + { + "id": "sometimesanotion/IF-reasoning-experiment-80", + "name": "IF-reasoning-experiment-80", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5463, + "hfopenllm_v2/BBH": 0.421, + "hfopenllm_v2/MATH Level 5": 0.0989, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.5025, + "hfopenllm_v2/MMLU-PRO": 0.3368 + } + }, + { + "id": "sometimesanotion/KytheraMix-7B-v0.2", + "name": "KytheraMix-7B-v0.2", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6129, + "hfopenllm_v2/BBH": 0.5635, + "hfopenllm_v2/MATH Level 5": 0.2923, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.4594, + "hfopenllm_v2/MMLU-PRO": 0.4505 + } + }, + { + "id": "sometimesanotion/Lamarck-14B-v0.1-experimental", + "name": "Lamarck-14B-v0.1-experimental", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5354, + "hfopenllm_v2/BBH": 0.6583, + "hfopenllm_v2/MATH Level 5": 0.358, + "hfopenllm_v2/GPQA": 0.3817, + "hfopenllm_v2/MUSR": 0.4728, + "hfopenllm_v2/MMLU-PRO": 0.5408 + } + }, + { + "id": "sometimesanotion/Lamarck-14B-v0.3", + "name": "Lamarck-14B-v0.3", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5032, + "hfopenllm_v2/BBH": 0.6611, + "hfopenllm_v2/MATH Level 5": 0.3406, + "hfopenllm_v2/GPQA": 0.3884, + "hfopenllm_v2/MUSR": 0.4688, + "hfopenllm_v2/MMLU-PRO": 0.5411 + } + }, + { + "id": "sometimesanotion/Lamarck-14B-v0.4-Qwenvergence", + "name": "Lamarck-14B-v0.4-Qwenvergence", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4906, + "hfopenllm_v2/BBH": 0.6535, + "hfopenllm_v2/MATH Level 5": 0.3399, + "hfopenllm_v2/GPQA": 0.3784, + "hfopenllm_v2/MUSR": 0.4847, + "hfopenllm_v2/MMLU-PRO": 0.5406 + } + }, + { + "id": "sometimesanotion/Lamarck-14B-v0.6", + "name": "Lamarck-14B-v0.6", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6973, + "hfopenllm_v2/BBH": 0.646, + "hfopenllm_v2/MATH Level 5": 0.4041, + "hfopenllm_v2/GPQA": 0.3893, + "hfopenllm_v2/MUSR": 0.4847, + "hfopenllm_v2/MMLU-PRO": 0.54 + } + }, + { + "id": "sometimesanotion/Lamarck-14B-v0.6-002-model_stock", + "name": "Lamarck-14B-v0.6-002-model_stock", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6692, + "hfopenllm_v2/BBH": 0.6143, + "hfopenllm_v2/MATH Level 5": 0.3776, + "hfopenllm_v2/GPQA": 0.3742, + "hfopenllm_v2/MUSR": 0.518, + "hfopenllm_v2/MMLU-PRO": 0.5054 + } + }, + { + "id": "sometimesanotion/Lamarck-14B-v0.6-model_stock", + "name": "Lamarck-14B-v0.6-model_stock", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.679, + "hfopenllm_v2/BBH": 0.6269, + "hfopenllm_v2/MATH Level 5": 0.4245, + "hfopenllm_v2/GPQA": 0.3842, + "hfopenllm_v2/MUSR": 0.5007, + "hfopenllm_v2/MMLU-PRO": 0.5198 + } + }, + { + "id": "sometimesanotion/Lamarck-14B-v0.7-Fusion", + "name": "Lamarck-14B-v0.7-Fusion", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6821, + "hfopenllm_v2/BBH": 0.6544, + "hfopenllm_v2/MATH Level 5": 0.4041, + "hfopenllm_v2/GPQA": 0.401, + "hfopenllm_v2/MUSR": 0.4991, + "hfopenllm_v2/MMLU-PRO": 0.5391 + } + }, + { + "id": "sometimesanotion/Lamarck-14B-v0.7-rc1", + "name": "Lamarck-14B-v0.7-rc1", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7305, + "hfopenllm_v2/BBH": 0.6486, + "hfopenllm_v2/MATH Level 5": 0.3852, + "hfopenllm_v2/GPQA": 0.3893, + "hfopenllm_v2/MUSR": 0.4715, + "hfopenllm_v2/MMLU-PRO": 0.5416 + } + }, + { + "id": "sometimesanotion/Lamarck-14B-v0.7-rc4", + "name": "Lamarck-14B-v0.7-rc4", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7211, + "hfopenllm_v2/BBH": 0.651, + "hfopenllm_v2/MATH Level 5": 0.4026, + "hfopenllm_v2/GPQA": 0.3893, + "hfopenllm_v2/MUSR": 0.4912, + "hfopenllm_v2/MMLU-PRO": 0.54 + } + }, + { + "id": "sometimesanotion/LamarckInfusion-14B-v1", + "name": "LamarckInfusion-14B-v1", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7198, + "hfopenllm_v2/BBH": 0.6539, + "hfopenllm_v2/MATH Level 5": 0.4169, + "hfopenllm_v2/GPQA": 0.3909, + "hfopenllm_v2/MUSR": 0.4899, + "hfopenllm_v2/MMLU-PRO": 0.5376 + } + }, + { + "id": "sometimesanotion/LamarckInfusion-14B-v2", + "name": "LamarckInfusion-14B-v2", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6812, + "hfopenllm_v2/BBH": 0.6564, + "hfopenllm_v2/MATH Level 5": 0.4388, + "hfopenllm_v2/GPQA": 0.3876, + "hfopenllm_v2/MUSR": 0.4993, + "hfopenllm_v2/MMLU-PRO": 0.5416 + } + }, + { + "id": "sometimesanotion/LamarckInfusion-14B-v2-hi", + "name": "LamarckInfusion-14B-v2-hi", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6855, + "hfopenllm_v2/BBH": 0.6555, + "hfopenllm_v2/MATH Level 5": 0.423, + "hfopenllm_v2/GPQA": 0.3884, + "hfopenllm_v2/MUSR": 0.4847, + "hfopenllm_v2/MMLU-PRO": 0.5405 + } + }, + { + "id": "sometimesanotion/LamarckInfusion-14B-v2-lo", + "name": "LamarckInfusion-14B-v2-lo", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6788, + "hfopenllm_v2/BBH": 0.6528, + "hfopenllm_v2/MATH Level 5": 0.4237, + "hfopenllm_v2/GPQA": 0.3859, + "hfopenllm_v2/MUSR": 0.4991, + "hfopenllm_v2/MMLU-PRO": 0.5397 + } + }, + { + "id": "sometimesanotion/LamarckInfusion-14B-v3", + "name": "LamarckInfusion-14B-v3", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7131, + "hfopenllm_v2/BBH": 0.6518, + "hfopenllm_v2/MATH Level 5": 0.4124, + "hfopenllm_v2/GPQA": 0.3867, + "hfopenllm_v2/MUSR": 0.482, + "hfopenllm_v2/MMLU-PRO": 0.5407 + } + }, + { + "id": "sometimesanotion/Qwen-14B-ProseStock-v4", + "name": "Qwen-14B-ProseStock-v4", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4942, + "hfopenllm_v2/BBH": 0.6498, + "hfopenllm_v2/MATH Level 5": 0.364, + "hfopenllm_v2/GPQA": 0.3884, + "hfopenllm_v2/MUSR": 0.4938, + "hfopenllm_v2/MMLU-PRO": 0.5386 + } + }, + { + "id": "sometimesanotion/Qwen-2.5-14B-Virmarckeoso", + "name": "Qwen-2.5-14B-Virmarckeoso", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4813, + "hfopenllm_v2/BBH": 0.657, + "hfopenllm_v2/MATH Level 5": 0.3565, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.4794, + "hfopenllm_v2/MMLU-PRO": 0.5377 + } + }, + { + "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso", + "name": "Qwen2.5-14B-Vimarckoso", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4574, + "hfopenllm_v2/BBH": 0.6446, + "hfopenllm_v2/MATH Level 5": 0.3384, + "hfopenllm_v2/GPQA": 0.3926, + "hfopenllm_v2/MUSR": 0.4859, + "hfopenllm_v2/MMLU-PRO": 0.5329 + } + }, + { + "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v2", + "name": "Qwen2.5-14B-Vimarckoso-v2", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4505, + "hfopenllm_v2/BBH": 0.655, + "hfopenllm_v2/MATH Level 5": 0.358, + "hfopenllm_v2/GPQA": 0.3826, + "hfopenllm_v2/MUSR": 0.4819, + "hfopenllm_v2/MMLU-PRO": 0.538 + } + }, + { + "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3", + "name": "Qwen2.5-14B-Vimarckoso-v3", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7257, + "hfopenllm_v2/BBH": 0.6415, + "hfopenllm_v2/MATH Level 5": 0.4003, + "hfopenllm_v2/GPQA": 0.38, + "hfopenllm_v2/MUSR": 0.4807, + "hfopenllm_v2/MMLU-PRO": 0.5343 + } + }, + { + "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-IF-Variant", + "name": "Qwen2.5-14B-Vimarckoso-v3-IF-Variant", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6413, + "hfopenllm_v2/BBH": 0.5521, + "hfopenllm_v2/MATH Level 5": 0.2545, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.5319, + "hfopenllm_v2/MMLU-PRO": 0.4589 + } + }, + { + "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-Prose01", + "name": "Qwen2.5-14B-Vimarckoso-v3-Prose01", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6872, + "hfopenllm_v2/BBH": 0.6359, + "hfopenllm_v2/MATH Level 5": 0.3995, + "hfopenllm_v2/GPQA": 0.3867, + "hfopenllm_v2/MUSR": 0.4807, + "hfopenllm_v2/MMLU-PRO": 0.5275 + } + }, + { + "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-model_stock", + "name": "Qwen2.5-14B-Vimarckoso-v3-model_stock", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7162, + "hfopenllm_v2/BBH": 0.6421, + "hfopenllm_v2/MATH Level 5": 0.4245, + "hfopenllm_v2/GPQA": 0.38, + "hfopenllm_v2/MUSR": 0.4781, + "hfopenllm_v2/MMLU-PRO": 0.5316 + } + }, + { + "id": "sometimesanotion/Qwen2.5-7B-Gordion-v0.1", + "name": "Qwen2.5-7B-Gordion-v0.1", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7482, + "hfopenllm_v2/BBH": 0.5524, + "hfopenllm_v2/MATH Level 5": 0.2915, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4016, + "hfopenllm_v2/MMLU-PRO": 0.43 + } + }, + { + "id": "sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Prose", + "name": "Qwen2.5-7B-Gordion-v0.1-Prose", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5347, + "hfopenllm_v2/BBH": 0.5599, + "hfopenllm_v2/MATH Level 5": 0.2893, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.4502, + "hfopenllm_v2/MMLU-PRO": 0.4525 + } + }, + { + "id": "sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Reason", + "name": "Qwen2.5-7B-Gordion-v0.1-Reason", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4917, + "hfopenllm_v2/BBH": 0.5498, + "hfopenllm_v2/MATH Level 5": 0.2621, + "hfopenllm_v2/GPQA": 0.3406, + "hfopenllm_v2/MUSR": 0.4434, + "hfopenllm_v2/MMLU-PRO": 0.4307 + } + }, + { + "id": "sometimesanotion/Qwentessential-14B-v1", + "name": "Qwentessential-14B-v1", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6279, + "hfopenllm_v2/BBH": 0.6545, + "hfopenllm_v2/MATH Level 5": 0.4071, + "hfopenllm_v2/GPQA": 0.3876, + "hfopenllm_v2/MUSR": 0.4873, + "hfopenllm_v2/MMLU-PRO": 0.5381 + } + }, + { + "id": "sometimesanotion/Qwentinuum-14B-v013", + "name": "Qwentinuum-14B-v013", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6711, + "hfopenllm_v2/BBH": 0.6087, + "hfopenllm_v2/MATH Level 5": 0.3708, + "hfopenllm_v2/GPQA": 0.3574, + "hfopenllm_v2/MUSR": 0.5154, + "hfopenllm_v2/MMLU-PRO": 0.4991 + } + }, + { + "id": "sometimesanotion/Qwentinuum-14B-v1", + "name": "Qwentinuum-14B-v1", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5032, + "hfopenllm_v2/BBH": 0.6573, + "hfopenllm_v2/MATH Level 5": 0.3603, + "hfopenllm_v2/GPQA": 0.3826, + "hfopenllm_v2/MUSR": 0.4781, + "hfopenllm_v2/MMLU-PRO": 0.541 + } + }, + { + "id": "sometimesanotion/Qwentinuum-14B-v2", + "name": "Qwentinuum-14B-v2", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5378, + "hfopenllm_v2/BBH": 0.6555, + "hfopenllm_v2/MATH Level 5": 0.3754, + "hfopenllm_v2/GPQA": 0.3884, + "hfopenllm_v2/MUSR": 0.4714, + "hfopenllm_v2/MMLU-PRO": 0.5409 + } + }, + { + "id": "sometimesanotion/Qwentinuum-14B-v3", + "name": "Qwentinuum-14B-v3", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6158, + "hfopenllm_v2/BBH": 0.6539, + "hfopenllm_v2/MATH Level 5": 0.3535, + "hfopenllm_v2/GPQA": 0.3876, + "hfopenllm_v2/MUSR": 0.486, + "hfopenllm_v2/MMLU-PRO": 0.5413 + } + }, + { + "id": "sometimesanotion/Qwentinuum-14B-v5", + "name": "Qwentinuum-14B-v5", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6286, + "hfopenllm_v2/BBH": 0.655, + "hfopenllm_v2/MATH Level 5": 0.3444, + "hfopenllm_v2/GPQA": 0.3876, + "hfopenllm_v2/MUSR": 0.4874, + "hfopenllm_v2/MMLU-PRO": 0.5418 + } + }, + { + "id": "sometimesanotion/Qwentinuum-14B-v6", + "name": "Qwentinuum-14B-v6", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6304, + "hfopenllm_v2/BBH": 0.6545, + "hfopenllm_v2/MATH Level 5": 0.3603, + "hfopenllm_v2/GPQA": 0.3867, + "hfopenllm_v2/MUSR": 0.49, + "hfopenllm_v2/MMLU-PRO": 0.54 + } + }, + { + "id": "sometimesanotion/Qwentinuum-14B-v6-Prose", + "name": "Qwentinuum-14B-v6-Prose", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5643, + "hfopenllm_v2/BBH": 0.6545, + "hfopenllm_v2/MATH Level 5": 0.3701, + "hfopenllm_v2/GPQA": 0.3884, + "hfopenllm_v2/MUSR": 0.4913, + "hfopenllm_v2/MMLU-PRO": 0.5392 + } + }, + { + "id": "sometimesanotion/Qwentinuum-14B-v7", + "name": "Qwentinuum-14B-v7", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6109, + "hfopenllm_v2/BBH": 0.6551, + "hfopenllm_v2/MATH Level 5": 0.3573, + "hfopenllm_v2/GPQA": 0.3909, + "hfopenllm_v2/MUSR": 0.482, + "hfopenllm_v2/MMLU-PRO": 0.541 + } + }, + { + "id": "sometimesanotion/Qwentinuum-14B-v8", + "name": "Qwentinuum-14B-v8", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5412, + "hfopenllm_v2/BBH": 0.6534, + "hfopenllm_v2/MATH Level 5": 0.3912, + "hfopenllm_v2/GPQA": 0.3834, + "hfopenllm_v2/MUSR": 0.4873, + "hfopenllm_v2/MMLU-PRO": 0.5412 + } + }, + { + "id": "sometimesanotion/Qwentinuum-14B-v9", + "name": "Qwentinuum-14B-v9", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5107, + "hfopenllm_v2/BBH": 0.658, + "hfopenllm_v2/MATH Level 5": 0.3482, + "hfopenllm_v2/GPQA": 0.3859, + "hfopenllm_v2/MUSR": 0.4781, + "hfopenllm_v2/MMLU-PRO": 0.5421 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-qv256", + "name": "Qwenvergence-14B-qv256", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7006, + "hfopenllm_v2/BBH": 0.6312, + "hfopenllm_v2/MATH Level 5": 0.3897, + "hfopenllm_v2/GPQA": 0.3784, + "hfopenllm_v2/MUSR": 0.4926, + "hfopenllm_v2/MMLU-PRO": 0.5178 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v0.6-004-model_stock", + "name": "Qwenvergence-14B-v0.6-004-model_stock", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.686, + "hfopenllm_v2/BBH": 0.6249, + "hfopenllm_v2/MATH Level 5": 0.4094, + "hfopenllm_v2/GPQA": 0.3834, + "hfopenllm_v2/MUSR": 0.5033, + "hfopenllm_v2/MMLU-PRO": 0.5193 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v10", + "name": "Qwenvergence-14B-v10", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6757, + "hfopenllm_v2/BBH": 0.6316, + "hfopenllm_v2/MATH Level 5": 0.4789, + "hfopenllm_v2/GPQA": 0.3792, + "hfopenllm_v2/MUSR": 0.4991, + "hfopenllm_v2/MMLU-PRO": 0.5239 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v11", + "name": "Qwenvergence-14B-v11", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7192, + "hfopenllm_v2/BBH": 0.6368, + "hfopenllm_v2/MATH Level 5": 0.4645, + "hfopenllm_v2/GPQA": 0.3725, + "hfopenllm_v2/MUSR": 0.4754, + "hfopenllm_v2/MMLU-PRO": 0.5327 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v12-Prose", + "name": "Qwenvergence-14B-v12-Prose", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5412, + "hfopenllm_v2/BBH": 0.6504, + "hfopenllm_v2/MATH Level 5": 0.3535, + "hfopenllm_v2/GPQA": 0.3867, + "hfopenllm_v2/MUSR": 0.4991, + "hfopenllm_v2/MMLU-PRO": 0.5381 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v12-Prose-DS", + "name": "Qwenvergence-14B-v12-Prose-DS", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6173, + "hfopenllm_v2/BBH": 0.6507, + "hfopenllm_v2/MATH Level 5": 0.4305, + "hfopenllm_v2/GPQA": 0.3943, + "hfopenllm_v2/MUSR": 0.5151, + "hfopenllm_v2/MMLU-PRO": 0.5369 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v13-Prose-DS", + "name": "Qwenvergence-14B-v13-Prose-DS", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7178, + "hfopenllm_v2/BBH": 0.6405, + "hfopenllm_v2/MATH Level 5": 0.386, + "hfopenllm_v2/GPQA": 0.3834, + "hfopenllm_v2/MUSR": 0.4927, + "hfopenllm_v2/MMLU-PRO": 0.5349 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v15-Prose-MS", + "name": "Qwenvergence-14B-v15-Prose-MS", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5032, + "hfopenllm_v2/BBH": 0.655, + "hfopenllm_v2/MATH Level 5": 0.3633, + "hfopenllm_v2/GPQA": 0.3951, + "hfopenllm_v2/MUSR": 0.4913, + "hfopenllm_v2/MMLU-PRO": 0.5393 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v2-Prose", + "name": "Qwenvergence-14B-v2-Prose", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4705, + "hfopenllm_v2/BBH": 0.6519, + "hfopenllm_v2/MATH Level 5": 0.3557, + "hfopenllm_v2/GPQA": 0.3935, + "hfopenllm_v2/MUSR": 0.4926, + "hfopenllm_v2/MMLU-PRO": 0.5372 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v3", + "name": "Qwenvergence-14B-v3", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5044, + "hfopenllm_v2/BBH": 0.6548, + "hfopenllm_v2/MATH Level 5": 0.3693, + "hfopenllm_v2/GPQA": 0.3842, + "hfopenllm_v2/MUSR": 0.4886, + "hfopenllm_v2/MMLU-PRO": 0.5386 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v3-Prose", + "name": "Qwenvergence-14B-v3-Prose", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4918, + "hfopenllm_v2/BBH": 0.6513, + "hfopenllm_v2/MATH Level 5": 0.3648, + "hfopenllm_v2/GPQA": 0.3951, + "hfopenllm_v2/MUSR": 0.4939, + "hfopenllm_v2/MMLU-PRO": 0.537 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v3-Reason", + "name": "Qwenvergence-14B-v3-Reason", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5278, + "hfopenllm_v2/BBH": 0.6557, + "hfopenllm_v2/MATH Level 5": 0.3119, + "hfopenllm_v2/GPQA": 0.3842, + "hfopenllm_v2/MUSR": 0.4754, + "hfopenllm_v2/MMLU-PRO": 0.5396 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v6-Prose", + "name": "Qwenvergence-14B-v6-Prose", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.599, + "hfopenllm_v2/BBH": 0.6544, + "hfopenllm_v2/MATH Level 5": 0.3565, + "hfopenllm_v2/GPQA": 0.3884, + "hfopenllm_v2/MUSR": 0.4887, + "hfopenllm_v2/MMLU-PRO": 0.5371 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v6-Prose-model_stock", + "name": "Qwenvergence-14B-v6-Prose-model_stock", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4811, + "hfopenllm_v2/BBH": 0.653, + "hfopenllm_v2/MATH Level 5": 0.3603, + "hfopenllm_v2/GPQA": 0.3935, + "hfopenllm_v2/MUSR": 0.4899, + "hfopenllm_v2/MMLU-PRO": 0.5387 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v8", + "name": "Qwenvergence-14B-v8", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5913, + "hfopenllm_v2/BBH": 0.6522, + "hfopenllm_v2/MATH Level 5": 0.4048, + "hfopenllm_v2/GPQA": 0.3809, + "hfopenllm_v2/MUSR": 0.4768, + "hfopenllm_v2/MMLU-PRO": 0.5435 + } + }, + { + "id": "sometimesanotion/Qwenvergence-14B-v9", + "name": "Qwenvergence-14B-v9", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6598, + "hfopenllm_v2/BBH": 0.6166, + "hfopenllm_v2/MATH Level 5": 0.4139, + "hfopenllm_v2/GPQA": 0.3683, + "hfopenllm_v2/MUSR": 0.5141, + "hfopenllm_v2/MMLU-PRO": 0.5111 + } + }, + { + "id": "sometimesanotion/lamarck-14b-prose-model_stock", + "name": "lamarck-14b-prose-model_stock", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4276, + "hfopenllm_v2/BBH": 0.6488, + "hfopenllm_v2/MATH Level 5": 0.3414, + "hfopenllm_v2/GPQA": 0.3935, + "hfopenllm_v2/MUSR": 0.4846, + "hfopenllm_v2/MMLU-PRO": 0.5354 + } + }, + { + "id": "sometimesanotion/lamarck-14b-reason-model_stock", + "name": "lamarck-14b-reason-model_stock", + "developer": "sometimesanotion", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4965, + "hfopenllm_v2/BBH": 0.6569, + "hfopenllm_v2/MATH Level 5": 0.358, + "hfopenllm_v2/GPQA": 0.3842, + "hfopenllm_v2/MUSR": 0.4741, + "hfopenllm_v2/MMLU-PRO": 0.5402 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sonthenguyen.json b/data/developers/sonthenguyen.json new file mode 100644 index 0000000000000000000000000000000000000000..ea8ed7f630d2c7af0c6488b18d61f14b7cb8f4c2 --- /dev/null +++ b/data/developers/sonthenguyen.json @@ -0,0 +1,89 @@ +{ + "developer": "sonthenguyen", + "models": [ + { + "id": "sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415", + "name": "ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415", + "developer": "sonthenguyen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2893, + "hfopenllm_v2/BBH": 0.3804, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2466, + "hfopenllm_v2/MUSR": 0.3861, + "hfopenllm_v2/MMLU-PRO": 0.1401 + } + }, + { + "id": "sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205", + "name": "ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205", + "developer": "sonthenguyen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3199, + "hfopenllm_v2/BBH": 0.3959, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.4272, + "hfopenllm_v2/MMLU-PRO": 0.2124 + } + }, + { + "id": "sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522", + "name": "ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522", + "developer": "sonthenguyen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3764, + "hfopenllm_v2/BBH": 0.3828, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.4404, + "hfopenllm_v2/MMLU-PRO": 0.2055 + } + }, + { + "id": "sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbc-213steps", + "name": "zephyr-sft-bnb-4bit-DPO-mtbc-213steps", + "developer": "sonthenguyen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4275, + "hfopenllm_v2/BBH": 0.4197, + "hfopenllm_v2/MATH Level 5": 0.0257, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.4086, + "hfopenllm_v2/MMLU-PRO": 0.2709 + } + }, + { + "id": "sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbo-180steps", + "name": "zephyr-sft-bnb-4bit-DPO-mtbo-180steps", + "developer": "sonthenguyen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4087, + "hfopenllm_v2/BBH": 0.4323, + "hfopenllm_v2/MATH Level 5": 0.0234, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3885, + "hfopenllm_v2/MMLU-PRO": 0.2748 + } + }, + { + "id": "sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbr-180steps", + "name": "zephyr-sft-bnb-4bit-DPO-mtbr-180steps", + "developer": "sonthenguyen", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4032, + "hfopenllm_v2/BBH": 0.4305, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.4258, + "hfopenllm_v2/MMLU-PRO": 0.2711 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sophosympatheia.json b/data/developers/sophosympatheia.json new file mode 100644 index 0000000000000000000000000000000000000000..daf2aa2b7fe4ae3ff5f5e14351377c30775172a8 --- /dev/null +++ b/data/developers/sophosympatheia.json @@ -0,0 +1,19 @@ +{ + "developer": "sophosympatheia", + "models": [ + { + "id": "sophosympatheia/Midnight-Miqu-70B-v1.5", + "name": "Midnight-Miqu-70B-v1.5", + "developer": "sophosympatheia", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6118, + "hfopenllm_v2/BBH": 0.5606, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4244, + "hfopenllm_v2/MMLU-PRO": 0.3825 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/speakleash.json b/data/developers/speakleash.json new file mode 100644 index 0000000000000000000000000000000000000000..09c644aaa535e62ebd4dc0b391f0749b08b56ffe --- /dev/null +++ b/data/developers/speakleash.json @@ -0,0 +1,75 @@ +{ + "developer": "speakleash", + "models": [ + { + "id": "speakleash/Bielik-11B-v2", + "name": "Bielik-11B-v2", + "developer": "speakleash", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2381, + "hfopenllm_v2/BBH": 0.4931, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3924, + "hfopenllm_v2/MMLU-PRO": 0.3137 + } + }, + { + "id": "speakleash/Bielik-11B-v2.0-Instruct", + "name": "Bielik-11B-v2.0-Instruct", + "developer": "speakleash", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5252, + "hfopenllm_v2/BBH": 0.5362, + "hfopenllm_v2/MATH Level 5": 0.1186, + "hfopenllm_v2/GPQA": 0.3171, + "hfopenllm_v2/MUSR": 0.4467, + "hfopenllm_v2/MMLU-PRO": 0.3351 + } + }, + { + "id": "speakleash/Bielik-11B-v2.1-Instruct", + "name": "Bielik-11B-v2.1-Instruct", + "developer": "speakleash", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.509, + "hfopenllm_v2/BBH": 0.553, + "hfopenllm_v2/MATH Level 5": 0.2666, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.4185, + "hfopenllm_v2/MMLU-PRO": 0.3447 + } + }, + { + "id": "speakleash/Bielik-11B-v2.2-Instruct", + "name": "Bielik-11B-v2.2-Instruct", + "developer": "speakleash", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5552, + "hfopenllm_v2/BBH": 0.5597, + "hfopenllm_v2/MATH Level 5": 0.2681, + "hfopenllm_v2/GPQA": 0.3314, + "hfopenllm_v2/MUSR": 0.4171, + "hfopenllm_v2/MMLU-PRO": 0.3487 + } + }, + { + "id": "speakleash/Bielik-11B-v2.3-Instruct", + "name": "Bielik-11B-v2.3-Instruct", + "developer": "speakleash", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5583, + "hfopenllm_v2/BBH": 0.5663, + "hfopenllm_v2/MATH Level 5": 0.2085, + "hfopenllm_v2/GPQA": 0.3406, + "hfopenllm_v2/MUSR": 0.4518, + "hfopenllm_v2/MMLU-PRO": 0.3444 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/spmurrayzzz.json b/data/developers/spmurrayzzz.json new file mode 100644 index 0000000000000000000000000000000000000000..7697a5ccd81534909759899096ef6805baf01a65 --- /dev/null +++ b/data/developers/spmurrayzzz.json @@ -0,0 +1,19 @@ +{ + "developer": "spmurrayzzz", + "models": [ + { + "id": "spmurrayzzz/Mistral-Syndicate-7B", + "name": "Mistral-Syndicate-7B", + "developer": "spmurrayzzz", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2496, + "hfopenllm_v2/BBH": 0.4245, + "hfopenllm_v2/MATH Level 5": 0.034, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.4386, + "hfopenllm_v2/MMLU-PRO": 0.2631 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/spow12.json b/data/developers/spow12.json new file mode 100644 index 0000000000000000000000000000000000000000..39b5b7b70e78310868166d1a8d2f2ae80dc77b8f --- /dev/null +++ b/data/developers/spow12.json @@ -0,0 +1,61 @@ +{ + "developer": "spow12", + "models": [ + { + "id": "spow12/ChatWaifu_12B_v2.0", + "name": "ChatWaifu_12B_v2.0", + "developer": "spow12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4768, + "hfopenllm_v2/BBH": 0.5208, + "hfopenllm_v2/MATH Level 5": 0.071, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.4432, + "hfopenllm_v2/MMLU-PRO": 0.3388 + } + }, + { + "id": "spow12/ChatWaifu_22B_v2.0_preview", + "name": "ChatWaifu_22B_v2.0_preview", + "developer": "spow12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6745, + "hfopenllm_v2/BBH": 0.617, + "hfopenllm_v2/MATH Level 5": 0.1888, + "hfopenllm_v2/GPQA": 0.3154, + "hfopenllm_v2/MUSR": 0.3685, + "hfopenllm_v2/MMLU-PRO": 0.3988 + } + }, + { + "id": "spow12/ChatWaifu_v1.4", + "name": "ChatWaifu_v1.4", + "developer": "spow12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5691, + "hfopenllm_v2/BBH": 0.5176, + "hfopenllm_v2/MATH Level 5": 0.1057, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.4743, + "hfopenllm_v2/MMLU-PRO": 0.3475 + } + }, + { + "id": "spow12/ChatWaifu_v2.0_22B", + "name": "ChatWaifu_v2.0_22B", + "developer": "spow12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6511, + "hfopenllm_v2/BBH": 0.5926, + "hfopenllm_v2/MATH Level 5": 0.1858, + "hfopenllm_v2/GPQA": 0.3247, + "hfopenllm_v2/MUSR": 0.3842, + "hfopenllm_v2/MMLU-PRO": 0.3836 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ssmits.json b/data/developers/ssmits.json new file mode 100644 index 0000000000000000000000000000000000000000..b18b4bd4706de08526b971bc0323d4163a095a37 --- /dev/null +++ b/data/developers/ssmits.json @@ -0,0 +1,19 @@ +{ + "developer": "ssmits", + "models": [ + { + "id": "ssmits/Qwen2.5-95B-Instruct", + "name": "Qwen2.5-95B-Instruct", + "developer": "ssmits", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8431, + "hfopenllm_v2/BBH": 0.7038, + "hfopenllm_v2/MATH Level 5": 0.5302, + "hfopenllm_v2/GPQA": 0.3641, + "hfopenllm_v2/MUSR": 0.4284, + "hfopenllm_v2/MMLU-PRO": 0.5217 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/stabilityai.json b/data/developers/stabilityai.json new file mode 100644 index 0000000000000000000000000000000000000000..352eab5bb61d7f6570ba8d85f6599f07e37f2c5e --- /dev/null +++ b/data/developers/stabilityai.json @@ -0,0 +1,149 @@ +{ + "developer": "stabilityai", + "models": [ + { + "id": "stabilityai/StableBeluga2", + "name": "StableBeluga2", + "developer": "stabilityai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3787, + "hfopenllm_v2/BBH": 0.5824, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.473, + "hfopenllm_v2/MMLU-PRO": 0.3326 + } + }, + { + "id": "stabilityai/stable-code-instruct-3b", + "name": "stabilityai/stable-code-instruct-3b", + "developer": "stabilityai", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6216, + "reward-bench/Chat": 0.5782, + "reward-bench/Chat Hard": 0.5855, + "reward-bench/Safety": 0.6554, + "reward-bench/Reasoning": 0.7528, + "reward-bench/Prior Sets (0.5 weight)": 0.4506 + } + }, + { + "id": "stabilityai/stablelm-2-12b", + "name": "stablelm-2-12b", + "developer": "stabilityai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1569, + "hfopenllm_v2/BBH": 0.4509, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2785, + "hfopenllm_v2/MUSR": 0.4479, + "hfopenllm_v2/MMLU-PRO": 0.3072 + } + }, + { + "id": "stabilityai/stablelm-2-12b-chat", + "name": "stablelm-2-12b-chat", + "developer": "stabilityai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4082, + "hfopenllm_v2/BBH": 0.4672, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3914, + "hfopenllm_v2/MMLU-PRO": 0.2734, + "reward-bench/Score": 0.7642, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.5548, + "reward-bench/Safety": 0.7811, + "reward-bench/Reasoning": 0.8945, + "reward-bench/Prior Sets (0.5 weight)": 0.4839 + } + }, + { + "id": "stabilityai/stablelm-2-1_6b", + "name": "stablelm-2-1_6b", + "developer": "stabilityai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1157, + "hfopenllm_v2/BBH": 0.3385, + "hfopenllm_v2/MATH Level 5": 0.0076, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.3882, + "hfopenllm_v2/MMLU-PRO": 0.1464 + } + }, + { + "id": "stabilityai/stablelm-2-1_6b-chat", + "name": "stablelm-2-1_6b-chat", + "developer": "stabilityai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.306, + "hfopenllm_v2/BBH": 0.339, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2475, + "hfopenllm_v2/MUSR": 0.358, + "hfopenllm_v2/MMLU-PRO": 0.1622 + } + }, + { + "id": "stabilityai/stablelm-2-zephyr-1_6b", + "name": "stablelm-2-zephyr-1_6b", + "developer": "stabilityai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3279, + "hfopenllm_v2/BBH": 0.3352, + "hfopenllm_v2/MATH Level 5": 0.0332, + "hfopenllm_v2/GPQA": 0.2433, + "hfopenllm_v2/MUSR": 0.3511, + "hfopenllm_v2/MMLU-PRO": 0.1714, + "reward-bench/Score": 0.6574, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.4671, + "reward-bench/Safety": 0.6027, + "reward-bench/Reasoning": 0.6784, + "reward-bench/Prior Sets (0.5 weight)": 0.4868 + } + }, + { + "id": "stabilityai/stablelm-3b-4e1t", + "name": "stablelm-3b-4e1t", + "developer": "stabilityai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2203, + "hfopenllm_v2/BBH": 0.3504, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2374, + "hfopenllm_v2/MUSR": 0.3778, + "hfopenllm_v2/MMLU-PRO": 0.1669 + } + }, + { + "id": "stabilityai/stablelm-zephyr-3b", + "name": "stablelm-zephyr-3b", + "developer": "stabilityai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3683, + "hfopenllm_v2/BBH": 0.3866, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2391, + "hfopenllm_v2/MUSR": 0.4183, + "hfopenllm_v2/MMLU-PRO": 0.1768, + "reward-bench/Score": 0.7146, + "reward-bench/Chat": 0.8631, + "reward-bench/Chat Hard": 0.6009, + "reward-bench/Safety": 0.7405, + "reward-bench/Reasoning": 0.7573, + "reward-bench/Prior Sets (0.5 weight)": 0.5075 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/stanford.json b/data/developers/stanford.json new file mode 100644 index 0000000000000000000000000000000000000000..95c10daf7e4483a9621f542461b1d7e157b2d5e2 --- /dev/null +++ b/data/developers/stanford.json @@ -0,0 +1,28 @@ +{ + "developer": "stanford", + "models": [ + { + "id": "stanford/Alpaca-7B", + "name": "Alpaca 7B", + "developer": "stanford", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.381, + "helm_classic/MMLU": 0.385, + "helm_classic/BoolQ": 0.778, + "helm_classic/NarrativeQA": 0.396, + "helm_classic/NaturalQuestions (open-book)": 0.592, + "helm_classic/QuAC": 0.27, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.243, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.738, + "helm_classic/CivilComments": 0.566, + "helm_classic/RAFT": 0.486 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/stanfordnlp.json b/data/developers/stanfordnlp.json new file mode 100644 index 0000000000000000000000000000000000000000..2aaadfecfb9ec3215a55d094f249a23b6717aec2 --- /dev/null +++ b/data/developers/stanfordnlp.json @@ -0,0 +1,33 @@ +{ + "developer": "stanfordnlp", + "models": [ + { + "id": "stanfordnlp/SteamSHP-flan-t5-large", + "name": "stanfordnlp/SteamSHP-flan-t5-large", + "developer": "stanfordnlp", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.4962, + "reward-bench/Chat": 0.8575, + "reward-bench/Chat Hard": 0.3311, + "reward-bench/Safety": 0.3743, + "reward-bench/Reasoning": 0.3563, + "reward-bench/Prior Sets (0.5 weight)": 0.6273 + } + }, + { + "id": "stanfordnlp/SteamSHP-flan-t5-xl", + "name": "stanfordnlp/SteamSHP-flan-t5-xl", + "developer": "stanfordnlp", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5135, + "reward-bench/Chat": 0.8547, + "reward-bench/Chat Hard": 0.3684, + "reward-bench/Safety": 0.3784, + "reward-bench/Reasoning": 0.3841, + "reward-bench/Prior Sets (0.5 weight)": 0.6498 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sthenno-com.json b/data/developers/sthenno-com.json new file mode 100644 index 0000000000000000000000000000000000000000..6079039a25dc05d46a87af22b79b137766bfb81a --- /dev/null +++ b/data/developers/sthenno-com.json @@ -0,0 +1,61 @@ +{ + "developer": "sthenno-com", + "models": [ + { + "id": "sthenno-com/miscii-14b-0130", + "name": "miscii-14b-0130", + "developer": "sthenno-com", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6647, + "hfopenllm_v2/BBH": 0.6505, + "hfopenllm_v2/MATH Level 5": 0.432, + "hfopenllm_v2/GPQA": 0.3817, + "hfopenllm_v2/MUSR": 0.4912, + "hfopenllm_v2/MMLU-PRO": 0.5363 + } + }, + { + "id": "sthenno-com/miscii-14b-0218", + "name": "miscii-14b-0218", + "developer": "sthenno-com", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7656, + "hfopenllm_v2/BBH": 0.6559, + "hfopenllm_v2/MATH Level 5": 0.5144, + "hfopenllm_v2/GPQA": 0.3834, + "hfopenllm_v2/MUSR": 0.4273, + "hfopenllm_v2/MMLU-PRO": 0.5298 + } + }, + { + "id": "sthenno-com/miscii-14b-1028", + "name": "miscii-14b-1028", + "developer": "sthenno-com", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8237, + "hfopenllm_v2/BBH": 0.6448, + "hfopenllm_v2/MATH Level 5": 0.503, + "hfopenllm_v2/GPQA": 0.3565, + "hfopenllm_v2/MUSR": 0.4182, + "hfopenllm_v2/MMLU-PRO": 0.5153 + } + }, + { + "id": "sthenno-com/miscii-14b-1225", + "name": "miscii-14b-1225", + "developer": "sthenno-com", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7878, + "hfopenllm_v2/BBH": 0.6572, + "hfopenllm_v2/MATH Level 5": 0.4517, + "hfopenllm_v2/GPQA": 0.3775, + "hfopenllm_v2/MUSR": 0.4366, + "hfopenllm_v2/MMLU-PRO": 0.5272 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sthenno.json b/data/developers/sthenno.json new file mode 100644 index 0000000000000000000000000000000000000000..144f6005135a51b3707516c6521804423fd6f58e --- /dev/null +++ b/data/developers/sthenno.json @@ -0,0 +1,131 @@ +{ + "developer": "sthenno", + "models": [ + { + "id": "sthenno/tempesthenno-0120", + "name": "tempesthenno-0120", + "developer": "sthenno", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.539, + "hfopenllm_v2/BBH": 0.6373, + "hfopenllm_v2/MATH Level 5": 0.3353, + "hfopenllm_v2/GPQA": 0.3943, + "hfopenllm_v2/MUSR": 0.4633, + "hfopenllm_v2/MMLU-PRO": 0.529 + } + }, + { + "id": "sthenno/tempesthenno-fusion-0309", + "name": "tempesthenno-fusion-0309", + "developer": "sthenno", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7692, + "hfopenllm_v2/BBH": 0.6581, + "hfopenllm_v2/MATH Level 5": 0.4766, + "hfopenllm_v2/GPQA": 0.37, + "hfopenllm_v2/MUSR": 0.4325, + "hfopenllm_v2/MMLU-PRO": 0.5258 + } + }, + { + "id": "sthenno/tempesthenno-kto-0205-ckpt80", + "name": "tempesthenno-kto-0205-ckpt80", + "developer": "sthenno", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8054, + "hfopenllm_v2/BBH": 0.6543, + "hfopenllm_v2/MATH Level 5": 0.4592, + "hfopenllm_v2/GPQA": 0.3482, + "hfopenllm_v2/MUSR": 0.4248, + "hfopenllm_v2/MMLU-PRO": 0.5286 + } + }, + { + "id": "sthenno/tempesthenno-nuslerp-001", + "name": "tempesthenno-nuslerp-001", + "developer": "sthenno", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7926, + "hfopenllm_v2/BBH": 0.6578, + "hfopenllm_v2/MATH Level 5": 0.4758, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.43, + "hfopenllm_v2/MMLU-PRO": 0.5257 + } + }, + { + "id": "sthenno/tempesthenno-nuslerp-0124", + "name": "tempesthenno-nuslerp-0124", + "developer": "sthenno", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7004, + "hfopenllm_v2/BBH": 0.6469, + "hfopenllm_v2/MATH Level 5": 0.4116, + "hfopenllm_v2/GPQA": 0.3901, + "hfopenllm_v2/MUSR": 0.4859, + "hfopenllm_v2/MMLU-PRO": 0.5352 + } + }, + { + "id": "sthenno/tempesthenno-ppo-ckpt40", + "name": "tempesthenno-ppo-ckpt40", + "developer": "sthenno", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7923, + "hfopenllm_v2/BBH": 0.655, + "hfopenllm_v2/MATH Level 5": 0.4736, + "hfopenllm_v2/GPQA": 0.3775, + "hfopenllm_v2/MUSR": 0.4352, + "hfopenllm_v2/MMLU-PRO": 0.5292 + } + }, + { + "id": "sthenno/tempesthenno-sft-0309-ckpt10", + "name": "tempesthenno-sft-0309-ckpt10", + "developer": "sthenno", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7744, + "hfopenllm_v2/BBH": 0.6552, + "hfopenllm_v2/MATH Level 5": 0.4721, + "hfopenllm_v2/GPQA": 0.3716, + "hfopenllm_v2/MUSR": 0.4364, + "hfopenllm_v2/MMLU-PRO": 0.5258 + } + }, + { + "id": "sthenno/tempesthenno-sft-0314-stage1-ckpt50", + "name": "tempesthenno-sft-0314-stage1-ckpt50", + "developer": "sthenno", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7394, + "hfopenllm_v2/BBH": 0.6601, + "hfopenllm_v2/MATH Level 5": 0.4683, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.4429, + "hfopenllm_v2/MMLU-PRO": 0.5302 + } + }, + { + "id": "sthenno/tempestissimo-14b-0309", + "name": "tempestissimo-14b-0309", + "developer": "sthenno", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7549, + "hfopenllm_v2/BBH": 0.6587, + "hfopenllm_v2/MATH Level 5": 0.4796, + "hfopenllm_v2/GPQA": 0.3666, + "hfopenllm_v2/MUSR": 0.4312, + "hfopenllm_v2/MMLU-PRO": 0.5281 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/streamerbtw1002.json b/data/developers/streamerbtw1002.json new file mode 100644 index 0000000000000000000000000000000000000000..f847a3c8df86df18738c76fc4ac41247a4f99db8 --- /dev/null +++ b/data/developers/streamerbtw1002.json @@ -0,0 +1,19 @@ +{ + "developer": "streamerbtw1002", + "models": [ + { + "id": "streamerbtw1002/Nexuim-R1-7B-Instruct", + "name": "Nexuim-R1-7B-Instruct", + "developer": "streamerbtw1002", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6934, + "hfopenllm_v2/BBH": 0.5175, + "hfopenllm_v2/MATH Level 5": 0.4456, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3356, + "hfopenllm_v2/MMLU-PRO": 0.4138 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/stupidity-ai.json b/data/developers/stupidity-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..841dccd4e39b0e7b87c676655545ed12f5073d77 --- /dev/null +++ b/data/developers/stupidity-ai.json @@ -0,0 +1,19 @@ +{ + "developer": "stupidity-ai", + "models": [ + { + "id": "stupidity-ai/Llama-3-8B-Instruct-MultiMoose", + "name": "Llama-3-8B-Instruct-MultiMoose", + "developer": "stupidity-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2318, + "hfopenllm_v2/BBH": 0.2823, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3485, + "hfopenllm_v2/MMLU-PRO": 0.1094 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/suayptalha.json b/data/developers/suayptalha.json new file mode 100644 index 0000000000000000000000000000000000000000..31fa829fc447e018d00a0eccdff5106bcd916280 --- /dev/null +++ b/data/developers/suayptalha.json @@ -0,0 +1,173 @@ +{ + "developer": "suayptalha", + "models": [ + { + "id": "suayptalha/Clarus-7B-v0.1", + "name": "Clarus-7B-v0.1", + "developer": "suayptalha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7454, + "hfopenllm_v2/BBH": 0.5497, + "hfopenllm_v2/MATH Level 5": 0.4924, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.443, + "hfopenllm_v2/MMLU-PRO": 0.4387 + } + }, + { + "id": "suayptalha/Clarus-7B-v0.2", + "name": "Clarus-7B-v0.2", + "developer": "suayptalha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7679, + "hfopenllm_v2/BBH": 0.549, + "hfopenllm_v2/MATH Level 5": 0.4856, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4417, + "hfopenllm_v2/MMLU-PRO": 0.44 + } + }, + { + "id": "suayptalha/Clarus-7B-v0.3", + "name": "Clarus-7B-v0.3", + "developer": "suayptalha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7509, + "hfopenllm_v2/BBH": 0.5526, + "hfopenllm_v2/MATH Level 5": 0.4879, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4402, + "hfopenllm_v2/MMLU-PRO": 0.4385 + } + }, + { + "id": "suayptalha/DeepSeek-R1-Distill-Llama-3B", + "name": "DeepSeek-R1-Distill-Llama-3B", + "developer": "suayptalha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7093, + "hfopenllm_v2/BBH": 0.4452, + "hfopenllm_v2/MATH Level 5": 0.2092, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3396, + "hfopenllm_v2/MMLU-PRO": 0.2978 + } + }, + { + "id": "suayptalha/Falcon3-Jessi-v0.4-7B-Slerp", + "name": "Falcon3-Jessi-v0.4-7B-Slerp", + "developer": "suayptalha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7676, + "hfopenllm_v2/BBH": 0.5591, + "hfopenllm_v2/MATH Level 5": 0.3965, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4812, + "hfopenllm_v2/MMLU-PRO": 0.406 + } + }, + { + "id": "suayptalha/HomerCreativeAnvita-Mix-Qw7B", + "name": "HomerCreativeAnvita-Mix-Qw7B", + "developer": "suayptalha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7808, + "hfopenllm_v2/BBH": 0.5565, + "hfopenllm_v2/MATH Level 5": 0.361, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4416, + "hfopenllm_v2/MMLU-PRO": 0.4445 + } + }, + { + "id": "suayptalha/Komodo-Llama-3.2-3B-v2-fp16", + "name": "Komodo-Llama-3.2-3B-v2-fp16", + "developer": "suayptalha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6341, + "hfopenllm_v2/BBH": 0.4355, + "hfopenllm_v2/MATH Level 5": 0.1065, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3406, + "hfopenllm_v2/MMLU-PRO": 0.2852 + } + }, + { + "id": "suayptalha/Lamarckvergence-14B", + "name": "Lamarckvergence-14B", + "developer": "suayptalha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7656, + "hfopenllm_v2/BBH": 0.6517, + "hfopenllm_v2/MATH Level 5": 0.54, + "hfopenllm_v2/GPQA": 0.3633, + "hfopenllm_v2/MUSR": 0.4422, + "hfopenllm_v2/MMLU-PRO": 0.5283 + } + }, + { + "id": "suayptalha/Lix-14B-v0.1", + "name": "Lix-14B-v0.1", + "developer": "suayptalha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7813, + "hfopenllm_v2/BBH": 0.6608, + "hfopenllm_v2/MATH Level 5": 0.5295, + "hfopenllm_v2/GPQA": 0.37, + "hfopenllm_v2/MUSR": 0.4338, + "hfopenllm_v2/MMLU-PRO": 0.5314 + } + }, + { + "id": "suayptalha/Luminis-phi-4", + "name": "Luminis-phi-4", + "developer": "suayptalha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.69, + "hfopenllm_v2/BBH": 0.692, + "hfopenllm_v2/MATH Level 5": 0.4637, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4572, + "hfopenllm_v2/MMLU-PRO": 0.5424 + } + }, + { + "id": "suayptalha/Maestro-10B", + "name": "Maestro-10B", + "developer": "suayptalha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7768, + "hfopenllm_v2/BBH": 0.5746, + "hfopenllm_v2/MATH Level 5": 0.1911, + "hfopenllm_v2/GPQA": 0.3331, + "hfopenllm_v2/MUSR": 0.4397, + "hfopenllm_v2/MMLU-PRO": 0.4218 + } + }, + { + "id": "suayptalha/Rombos-2.5-T.E-8.1", + "name": "Rombos-2.5-T.E-8.1", + "developer": "suayptalha", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6925, + "hfopenllm_v2/BBH": 0.5515, + "hfopenllm_v2/MATH Level 5": 0.4924, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.4166, + "hfopenllm_v2/MMLU-PRO": 0.4446 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sumink.json b/data/developers/sumink.json new file mode 100644 index 0000000000000000000000000000000000000000..e80bea14dcef5ea856b6e949097f077d82db6c3d --- /dev/null +++ b/data/developers/sumink.json @@ -0,0 +1,313 @@ +{ + "developer": "sumink", + "models": [ + { + "id": "sumink/Qmerft", + "name": "Qmerft", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1564, + "hfopenllm_v2/BBH": 0.2939, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3688, + "hfopenllm_v2/MMLU-PRO": 0.1157 + } + }, + { + "id": "sumink/Qwenftmodel", + "name": "Qwenftmodel", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1729, + "hfopenllm_v2/BBH": 0.3823, + "hfopenllm_v2/MATH Level 5": 0.0891, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3617, + "hfopenllm_v2/MMLU-PRO": 0.2339 + } + }, + { + "id": "sumink/Qwenmplus", + "name": "Qwenmplus", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.204, + "hfopenllm_v2/BBH": 0.3676, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.3828, + "hfopenllm_v2/MMLU-PRO": 0.1992 + } + }, + { + "id": "sumink/Qwensci", + "name": "Qwensci", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.174, + "hfopenllm_v2/BBH": 0.3282, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3609, + "hfopenllm_v2/MMLU-PRO": 0.126 + } + }, + { + "id": "sumink/bbhqwen", + "name": "bbhqwen", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1809, + "hfopenllm_v2/BBH": 0.3388, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.4352, + "hfopenllm_v2/MMLU-PRO": 0.1617 + } + }, + { + "id": "sumink/bbhqwen2", + "name": "bbhqwen2", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1533, + "hfopenllm_v2/BBH": 0.3066, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.4431, + "hfopenllm_v2/MMLU-PRO": 0.1149 + } + }, + { + "id": "sumink/bbhqwen3", + "name": "bbhqwen3", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1943, + "hfopenllm_v2/BBH": 0.2951, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3796, + "hfopenllm_v2/MMLU-PRO": 0.1166 + } + }, + { + "id": "sumink/bbhqwen4", + "name": "bbhqwen4", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1449, + "hfopenllm_v2/BBH": 0.3199, + "hfopenllm_v2/MATH Level 5": 0.006, + "hfopenllm_v2/GPQA": 0.2441, + "hfopenllm_v2/MUSR": 0.4029, + "hfopenllm_v2/MMLU-PRO": 0.1509 + } + }, + { + "id": "sumink/bbhqwen5", + "name": "bbhqwen5", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1522, + "hfopenllm_v2/BBH": 0.2913, + "hfopenllm_v2/MATH Level 5": 0.0023, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.4019, + "hfopenllm_v2/MMLU-PRO": 0.1131 + } + }, + { + "id": "sumink/bbhqwen6", + "name": "bbhqwen6", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1893, + "hfopenllm_v2/BBH": 0.2782, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.358, + "hfopenllm_v2/MMLU-PRO": 0.1153 + } + }, + { + "id": "sumink/flflmillama", + "name": "flflmillama", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1676, + "hfopenllm_v2/BBH": 0.3851, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.3591, + "hfopenllm_v2/MMLU-PRO": 0.2096 + } + }, + { + "id": "sumink/ftgpt", + "name": "ftgpt", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0787, + "hfopenllm_v2/BBH": 0.2919, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.4138, + "hfopenllm_v2/MMLU-PRO": 0.1172 + } + }, + { + "id": "sumink/llamaft", + "name": "llamaft", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1609, + "hfopenllm_v2/BBH": 0.3763, + "hfopenllm_v2/MATH Level 5": 0.0166, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3498, + "hfopenllm_v2/MMLU-PRO": 0.2114 + } + }, + { + "id": "sumink/llamamerge", + "name": "llamamerge", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2672, + "hfopenllm_v2/BBH": 0.4632, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.424, + "hfopenllm_v2/MMLU-PRO": 0.259 + } + }, + { + "id": "sumink/llftfl7", + "name": "llftfl7", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1714, + "hfopenllm_v2/BBH": 0.3786, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3632, + "hfopenllm_v2/MMLU-PRO": 0.1743 + } + }, + { + "id": "sumink/llmer", + "name": "llmer", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3191, + "hfopenllm_v2/BBH": 0.4885, + "hfopenllm_v2/MATH Level 5": 0.065, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.4039, + "hfopenllm_v2/MMLU-PRO": 0.3529 + } + }, + { + "id": "sumink/qwft", + "name": "qwft", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1197, + "hfopenllm_v2/BBH": 0.3002, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3581, + "hfopenllm_v2/MMLU-PRO": 0.1129 + } + }, + { + "id": "sumink/qwmer", + "name": "qwmer", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2212, + "hfopenllm_v2/BBH": 0.4299, + "hfopenllm_v2/MATH Level 5": 0.0008, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.4032, + "hfopenllm_v2/MMLU-PRO": 0.2215 + } + }, + { + "id": "sumink/solarmer3", + "name": "solarmer3", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3741, + "hfopenllm_v2/BBH": 0.5266, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4401, + "hfopenllm_v2/MMLU-PRO": 0.3323 + } + }, + { + "id": "sumink/somer", + "name": "somer", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.299, + "hfopenllm_v2/BBH": 0.5194, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.465, + "hfopenllm_v2/MMLU-PRO": 0.3447 + } + }, + { + "id": "sumink/somer2", + "name": "somer2", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3132, + "hfopenllm_v2/BBH": 0.5167, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4663, + "hfopenllm_v2/MMLU-PRO": 0.3433 + } + }, + { + "id": "sumink/somerft", + "name": "somerft", + "developer": "sumink", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1431, + "hfopenllm_v2/BBH": 0.3093, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2483, + "hfopenllm_v2/MUSR": 0.4045, + "hfopenllm_v2/MMLU-PRO": 0.1117 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/sunbaby.json b/data/developers/sunbaby.json new file mode 100644 index 0000000000000000000000000000000000000000..281b12d8e58e49d898fe276b6cc13a9606253f6c --- /dev/null +++ b/data/developers/sunbaby.json @@ -0,0 +1,19 @@ +{ + "developer": "sunbaby", + "models": [ + { + "id": "sunbaby/BrainCog-8B-0.1-Instruct", + "name": "BrainCog-8B-0.1-Instruct", + "developer": "sunbaby", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4253, + "hfopenllm_v2/BBH": 0.4618, + "hfopenllm_v2/MATH Level 5": 0.0967, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.3656, + "hfopenllm_v2/MMLU-PRO": 0.2858 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/swap-uniba.json b/data/developers/swap-uniba.json new file mode 100644 index 0000000000000000000000000000000000000000..57d840de260b85652c7d17020ce275b4a2e9782a --- /dev/null +++ b/data/developers/swap-uniba.json @@ -0,0 +1,19 @@ +{ + "developer": "swap-uniba", + "models": [ + { + "id": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA", + "name": "LLaMAntino-3-ANITA-8B-Inst-DPO-ITA", + "developer": "swap-uniba", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4815, + "hfopenllm_v2/BBH": 0.4936, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.4387, + "hfopenllm_v2/MMLU-PRO": 0.3723 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/synergetic.json b/data/developers/synergetic.json new file mode 100644 index 0000000000000000000000000000000000000000..2b5cd94290293780c1c36a296640d863780a989a --- /dev/null +++ b/data/developers/synergetic.json @@ -0,0 +1,19 @@ +{ + "developer": "synergetic", + "models": [ + { + "id": "synergetic/FrankenQwen2.5-14B", + "name": "FrankenQwen2.5-14B", + "developer": "synergetic", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1869, + "hfopenllm_v2/BBH": 0.6048, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3843, + "hfopenllm_v2/MMLU-PRO": 0.4382 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/talha2001.json b/data/developers/talha2001.json new file mode 100644 index 0000000000000000000000000000000000000000..52c08146595f4c5ea023cd8df02edf3acd2ef3dd --- /dev/null +++ b/data/developers/talha2001.json @@ -0,0 +1,19 @@ +{ + "developer": "talha2001", + "models": [ + { + "id": "talha2001/Beast-Soul-new", + "name": "Beast-Soul-new", + "developer": "talha2001", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4854, + "hfopenllm_v2/BBH": 0.5227, + "hfopenllm_v2/MATH Level 5": 0.074, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4459, + "hfopenllm_v2/MMLU-PRO": 0.3102 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/tangledgroup.json b/data/developers/tangledgroup.json new file mode 100644 index 0000000000000000000000000000000000000000..12057061e89bb04a722bd13b96087a62d695a722 --- /dev/null +++ b/data/developers/tangledgroup.json @@ -0,0 +1,33 @@ +{ + "developer": "tangledgroup", + "models": [ + { + "id": "tangledgroup/tangled-llama-pints-1.5b-v0.1-instruct", + "name": "tangled-llama-pints-1.5b-v0.1-instruct", + "developer": "tangledgroup", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1509, + "hfopenllm_v2/BBH": 0.3143, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2399, + "hfopenllm_v2/MUSR": 0.3761, + "hfopenllm_v2/MMLU-PRO": 0.1109 + } + }, + { + "id": "tangledgroup/tangled-llama-pints-1.5b-v0.2-instruct", + "name": "tangled-llama-pints-1.5b-v0.2-instruct", + "developer": "tangledgroup", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1724, + "hfopenllm_v2/BBH": 0.3158, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2416, + "hfopenllm_v2/MUSR": 0.3643, + "hfopenllm_v2/MMLU-PRO": 0.1117 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/tanliboy.json b/data/developers/tanliboy.json new file mode 100644 index 0000000000000000000000000000000000000000..daa17945601d8528cbfc7c883e4dc4315ea363fc --- /dev/null +++ b/data/developers/tanliboy.json @@ -0,0 +1,47 @@ +{ + "developer": "tanliboy", + "models": [ + { + "id": "tanliboy/lambda-gemma-2-9b-dpo", + "name": "lambda-gemma-2-9b-dpo", + "developer": "tanliboy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4501, + "hfopenllm_v2/BBH": 0.5472, + "hfopenllm_v2/MATH Level 5": 0.0944, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4017, + "hfopenllm_v2/MMLU-PRO": 0.3792 + } + }, + { + "id": "tanliboy/lambda-qwen2.5-14b-dpo-test", + "name": "lambda-qwen2.5-14b-dpo-test", + "developer": "tanliboy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8231, + "hfopenllm_v2/BBH": 0.6394, + "hfopenllm_v2/MATH Level 5": 0.5461, + "hfopenllm_v2/GPQA": 0.3624, + "hfopenllm_v2/MUSR": 0.426, + "hfopenllm_v2/MMLU-PRO": 0.4848 + } + }, + { + "id": "tanliboy/lambda-qwen2.5-32b-dpo-test", + "name": "lambda-qwen2.5-32b-dpo-test", + "developer": "tanliboy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8084, + "hfopenllm_v2/BBH": 0.6764, + "hfopenllm_v2/MATH Level 5": 0.6103, + "hfopenllm_v2/GPQA": 0.3565, + "hfopenllm_v2/MUSR": 0.4274, + "hfopenllm_v2/MMLU-PRO": 0.5657 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/tannedbum.json b/data/developers/tannedbum.json new file mode 100644 index 0000000000000000000000000000000000000000..19b7cd9a7d2aac309c9ef8c8091ea782d7354a70 --- /dev/null +++ b/data/developers/tannedbum.json @@ -0,0 +1,61 @@ +{ + "developer": "tannedbum", + "models": [ + { + "id": "tannedbum/Ellaria-9B", + "name": "Ellaria-9B", + "developer": "tannedbum", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7826, + "hfopenllm_v2/BBH": 0.5942, + "hfopenllm_v2/MATH Level 5": 0.2077, + "hfopenllm_v2/GPQA": 0.3331, + "hfopenllm_v2/MUSR": 0.4151, + "hfopenllm_v2/MMLU-PRO": 0.4205 + } + }, + { + "id": "tannedbum/L3-Nymeria-Maid-8B", + "name": "L3-Nymeria-Maid-8B", + "developer": "tannedbum", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.725, + "hfopenllm_v2/BBH": 0.5146, + "hfopenllm_v2/MATH Level 5": 0.0937, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.3751, + "hfopenllm_v2/MMLU-PRO": 0.3747 + } + }, + { + "id": "tannedbum/L3-Nymeria-v2-8B", + "name": "L3-Nymeria-v2-8B", + "developer": "tannedbum", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7168, + "hfopenllm_v2/BBH": 0.5224, + "hfopenllm_v2/MATH Level 5": 0.0921, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.3699, + "hfopenllm_v2/MMLU-PRO": 0.3753 + } + }, + { + "id": "tannedbum/L3-Rhaenys-8B", + "name": "L3-Rhaenys-8B", + "developer": "tannedbum", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7363, + "hfopenllm_v2/BBH": 0.5299, + "hfopenllm_v2/MATH Level 5": 0.0876, + "hfopenllm_v2/GPQA": 0.2978, + "hfopenllm_v2/MUSR": 0.3725, + "hfopenllm_v2/MMLU-PRO": 0.3799 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/teknium.json b/data/developers/teknium.json new file mode 100644 index 0000000000000000000000000000000000000000..e391d123290feb9376748c2a1f02aa900cfda727 --- /dev/null +++ b/data/developers/teknium.json @@ -0,0 +1,75 @@ +{ + "developer": "teknium", + "models": [ + { + "id": "teknium/CollectiveCognition-v1.1-Mistral-7B", + "name": "CollectiveCognition-v1.1-Mistral-7B", + "developer": "teknium", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.279, + "hfopenllm_v2/BBH": 0.4493, + "hfopenllm_v2/MATH Level 5": 0.031, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.3869, + "hfopenllm_v2/MMLU-PRO": 0.2837 + } + }, + { + "id": "teknium/OpenHermes-13B", + "name": "OpenHermes-13B", + "developer": "teknium", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2668, + "hfopenllm_v2/BBH": 0.4206, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.4043, + "hfopenllm_v2/MMLU-PRO": 0.2389 + } + }, + { + "id": "teknium/OpenHermes-2-Mistral-7B", + "name": "OpenHermes-2-Mistral-7B", + "developer": "teknium", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5286, + "hfopenllm_v2/BBH": 0.4948, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.452, + "hfopenllm_v2/MMLU-PRO": 0.2931 + } + }, + { + "id": "teknium/OpenHermes-2.5-Mistral-7B", + "name": "OpenHermes-2.5-Mistral-7B", + "developer": "teknium", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5571, + "hfopenllm_v2/BBH": 0.487, + "hfopenllm_v2/MATH Level 5": 0.0506, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4242, + "hfopenllm_v2/MMLU-PRO": 0.3054 + } + }, + { + "id": "teknium/OpenHermes-7B", + "name": "OpenHermes-7B", + "developer": "teknium", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1813, + "hfopenllm_v2/BBH": 0.362, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.4324, + "hfopenllm_v2/MMLU-PRO": 0.1933 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/tensopolis.json b/data/developers/tensopolis.json new file mode 100644 index 0000000000000000000000000000000000000000..ef6a38c9fc2daee3b5bb5e68fa5563d4aee82191 --- /dev/null +++ b/data/developers/tensopolis.json @@ -0,0 +1,215 @@ +{ + "developer": "tensopolis", + "models": [ + { + "id": "tensopolis/falcon3-10b-tensopolis-v1", + "name": "falcon3-10b-tensopolis-v1", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7817, + "hfopenllm_v2/BBH": 0.6182, + "hfopenllm_v2/MATH Level 5": 0.2749, + "hfopenllm_v2/GPQA": 0.3297, + "hfopenllm_v2/MUSR": 0.4375, + "hfopenllm_v2/MMLU-PRO": 0.442 + } + }, + { + "id": "tensopolis/falcon3-10b-tensopolis-v2", + "name": "falcon3-10b-tensopolis-v2", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7792, + "hfopenllm_v2/BBH": 0.6182, + "hfopenllm_v2/MATH Level 5": 0.2666, + "hfopenllm_v2/GPQA": 0.3272, + "hfopenllm_v2/MUSR": 0.4297, + "hfopenllm_v2/MMLU-PRO": 0.4424 + } + }, + { + "id": "tensopolis/lamarckvergence-14b-tensopolis-v1", + "name": "lamarckvergence-14b-tensopolis-v1", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7604, + "hfopenllm_v2/BBH": 0.6561, + "hfopenllm_v2/MATH Level 5": 0.5166, + "hfopenllm_v2/GPQA": 0.3607, + "hfopenllm_v2/MUSR": 0.4475, + "hfopenllm_v2/MMLU-PRO": 0.525 + } + }, + { + "id": "tensopolis/mistral-small-2501-tensopolis-v1", + "name": "mistral-small-2501-tensopolis-v1", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7762, + "hfopenllm_v2/BBH": 0.6475, + "hfopenllm_v2/MATH Level 5": 0.4441, + "hfopenllm_v2/GPQA": 0.3574, + "hfopenllm_v2/MUSR": 0.428, + "hfopenllm_v2/MMLU-PRO": 0.4465 + } + }, + { + "id": "tensopolis/mistral-small-r1-tensopolis", + "name": "mistral-small-r1-tensopolis", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4622, + "hfopenllm_v2/BBH": 0.5436, + "hfopenllm_v2/MATH Level 5": 0.2908, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.3738, + "hfopenllm_v2/MMLU-PRO": 0.4035 + } + }, + { + "id": "tensopolis/phi-4-tensopolis-v1", + "name": "phi-4-tensopolis-v1", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6767, + "hfopenllm_v2/BBH": 0.6872, + "hfopenllm_v2/MATH Level 5": 0.494, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4141, + "hfopenllm_v2/MMLU-PRO": 0.5384 + } + }, + { + "id": "tensopolis/qwen2.5-14b-tensopolis-v1", + "name": "qwen2.5-14b-tensopolis-v1", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.799, + "hfopenllm_v2/BBH": 0.6364, + "hfopenllm_v2/MATH Level 5": 0.5295, + "hfopenllm_v2/GPQA": 0.3347, + "hfopenllm_v2/MUSR": 0.4193, + "hfopenllm_v2/MMLU-PRO": 0.4911 + } + }, + { + "id": "tensopolis/qwen2.5-3b-or1-tensopolis", + "name": "qwen2.5-3b-or1-tensopolis", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.354, + "hfopenllm_v2/BBH": 0.4421, + "hfopenllm_v2/MATH Level 5": 0.173, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3749, + "hfopenllm_v2/MMLU-PRO": 0.3197 + } + }, + { + "id": "tensopolis/qwen2.5-7b-tensopolis-v1", + "name": "qwen2.5-7b-tensopolis-v1", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7661, + "hfopenllm_v2/BBH": 0.5379, + "hfopenllm_v2/MATH Level 5": 0.4562, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4339, + "hfopenllm_v2/MMLU-PRO": 0.4269 + } + }, + { + "id": "tensopolis/qwen2.5-7b-tensopolis-v2", + "name": "qwen2.5-7b-tensopolis-v2", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7521, + "hfopenllm_v2/BBH": 0.5415, + "hfopenllm_v2/MATH Level 5": 0.4819, + "hfopenllm_v2/GPQA": 0.2903, + "hfopenllm_v2/MUSR": 0.4246, + "hfopenllm_v2/MMLU-PRO": 0.4243 + } + }, + { + "id": "tensopolis/virtuoso-lite-tensopolis-v1", + "name": "virtuoso-lite-tensopolis-v1", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8069, + "hfopenllm_v2/BBH": 0.6102, + "hfopenllm_v2/MATH Level 5": 0.2545, + "hfopenllm_v2/GPQA": 0.3448, + "hfopenllm_v2/MUSR": 0.4582, + "hfopenllm_v2/MMLU-PRO": 0.4435 + } + }, + { + "id": "tensopolis/virtuoso-lite-tensopolis-v2", + "name": "virtuoso-lite-tensopolis-v2", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8029, + "hfopenllm_v2/BBH": 0.61, + "hfopenllm_v2/MATH Level 5": 0.25, + "hfopenllm_v2/GPQA": 0.3431, + "hfopenllm_v2/MUSR": 0.4595, + "hfopenllm_v2/MMLU-PRO": 0.444 + } + }, + { + "id": "tensopolis/virtuoso-small-tensopolis-v1", + "name": "virtuoso-small-tensopolis-v1", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7856, + "hfopenllm_v2/BBH": 0.6415, + "hfopenllm_v2/MATH Level 5": 0.3527, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4326, + "hfopenllm_v2/MMLU-PRO": 0.4968 + } + }, + { + "id": "tensopolis/virtuoso-small-tensopolis-v2", + "name": "virtuoso-small-tensopolis-v2", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.802, + "hfopenllm_v2/BBH": 0.6516, + "hfopenllm_v2/MATH Level 5": 0.3875, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4352, + "hfopenllm_v2/MMLU-PRO": 0.5154 + } + }, + { + "id": "tensopolis/virtuoso-small-v2-tensopolis-v1", + "name": "virtuoso-small-v2-tensopolis-v1", + "developer": "tensopolis", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8419, + "hfopenllm_v2/BBH": 0.6545, + "hfopenllm_v2/MATH Level 5": 0.4524, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.4509, + "hfopenllm_v2/MMLU-PRO": 0.5175 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/tensoropera.json b/data/developers/tensoropera.json new file mode 100644 index 0000000000000000000000000000000000000000..714352a4146412b4ece92148d7c9d727448baf8b --- /dev/null +++ b/data/developers/tensoropera.json @@ -0,0 +1,19 @@ +{ + "developer": "tensoropera", + "models": [ + { + "id": "tensoropera/Fox-1-1.6B", + "name": "Fox-1-1.6B", + "developer": "tensoropera", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2766, + "hfopenllm_v2/BBH": 0.3307, + "hfopenllm_v2/MATH Level 5": 0.0174, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.355, + "hfopenllm_v2/MMLU-PRO": 0.1371 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/tenyx.json b/data/developers/tenyx.json new file mode 100644 index 0000000000000000000000000000000000000000..2710fe40e7067b955166223da475f149bf037f62 --- /dev/null +++ b/data/developers/tenyx.json @@ -0,0 +1,19 @@ +{ + "developer": "tenyx", + "models": [ + { + "id": "tenyx/Llama3-TenyxChat-70B", + "name": "Llama3-TenyxChat-70B", + "developer": "tenyx", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8087, + "hfopenllm_v2/BBH": 0.6511, + "hfopenllm_v2/MATH Level 5": 0.2356, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.426, + "hfopenllm_v2/MMLU-PRO": 0.521 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/theo77186.json b/data/developers/theo77186.json new file mode 100644 index 0000000000000000000000000000000000000000..1808ec42373f0178babb160f553f004c1f2357cd --- /dev/null +++ b/data/developers/theo77186.json @@ -0,0 +1,19 @@ +{ + "developer": "theo77186", + "models": [ + { + "id": "theo77186/Qwen2.5-Coder-7B-Instruct-20241106", + "name": "Qwen2.5-Coder-7B-Instruct-20241106", + "developer": "theo77186", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6101, + "hfopenllm_v2/BBH": 0.5008, + "hfopenllm_v2/MATH Level 5": 0.3882, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4073, + "hfopenllm_v2/MMLU-PRO": 0.3353 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/theprint.json b/data/developers/theprint.json new file mode 100644 index 0000000000000000000000000000000000000000..bc918827549a11b66c5052705449c2f653231dda --- /dev/null +++ b/data/developers/theprint.json @@ -0,0 +1,257 @@ +{ + "developer": "theprint", + "models": [ + { + "id": "theprint/Boptruth-Agatha-7B", + "name": "Boptruth-Agatha-7B", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3124, + "hfopenllm_v2/BBH": 0.4984, + "hfopenllm_v2/MATH Level 5": 0.0551, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4277, + "hfopenllm_v2/MMLU-PRO": 0.2861 + } + }, + { + "id": "theprint/CleverBoi-7B-v2", + "name": "CleverBoi-7B-v2", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.217, + "hfopenllm_v2/BBH": 0.4532, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4695, + "hfopenllm_v2/MMLU-PRO": 0.2709 + } + }, + { + "id": "theprint/CleverBoi-7B-v3", + "name": "CleverBoi-7B-v3", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2382, + "hfopenllm_v2/BBH": 0.4414, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.4072, + "hfopenllm_v2/MMLU-PRO": 0.2868 + } + }, + { + "id": "theprint/CleverBoi-Llama-3.1-8B-Instruct", + "name": "CleverBoi-Llama-3.1-8B-Instruct", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1682, + "hfopenllm_v2/BBH": 0.456, + "hfopenllm_v2/MATH Level 5": 0.0491, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4014, + "hfopenllm_v2/MMLU-PRO": 0.3075 + } + }, + { + "id": "theprint/CleverBoi-Llama-3.1-8B-v2", + "name": "CleverBoi-Llama-3.1-8B-v2", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1961, + "hfopenllm_v2/BBH": 0.4668, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.2861, + "hfopenllm_v2/MUSR": 0.3735, + "hfopenllm_v2/MMLU-PRO": 0.3188 + } + }, + { + "id": "theprint/CleverBoi-Nemo-12B-v2", + "name": "CleverBoi-Nemo-12B-v2", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2046, + "hfopenllm_v2/BBH": 0.5241, + "hfopenllm_v2/MATH Level 5": 0.1035, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4187, + "hfopenllm_v2/MMLU-PRO": 0.3228 + } + }, + { + "id": "theprint/Code-Llama-Bagel-8B", + "name": "Code-Llama-Bagel-8B", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.253, + "hfopenllm_v2/BBH": 0.4697, + "hfopenllm_v2/MATH Level 5": 0.0612, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.368, + "hfopenllm_v2/MMLU-PRO": 0.2822 + } + }, + { + "id": "theprint/Conversely-Mistral-7B", + "name": "Conversely-Mistral-7B", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2608, + "hfopenllm_v2/BBH": 0.4672, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4189, + "hfopenllm_v2/MMLU-PRO": 0.2826 + } + }, + { + "id": "theprint/Llama-3.2-3B-VanRossum", + "name": "Llama-3.2-3B-VanRossum", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4783, + "hfopenllm_v2/BBH": 0.4279, + "hfopenllm_v2/MATH Level 5": 0.0974, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3442, + "hfopenllm_v2/MMLU-PRO": 0.277 + } + }, + { + "id": "theprint/ReWiz-7B", + "name": "ReWiz-7B", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4048, + "hfopenllm_v2/BBH": 0.4564, + "hfopenllm_v2/MATH Level 5": 0.0408, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.4612, + "hfopenllm_v2/MMLU-PRO": 0.267 + } + }, + { + "id": "theprint/ReWiz-Llama-3.1-8B-v2", + "name": "ReWiz-Llama-3.1-8B-v2", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2379, + "hfopenllm_v2/BBH": 0.4632, + "hfopenllm_v2/MATH Level 5": 0.0574, + "hfopenllm_v2/GPQA": 0.3029, + "hfopenllm_v2/MUSR": 0.3814, + "hfopenllm_v2/MMLU-PRO": 0.331 + } + }, + { + "id": "theprint/ReWiz-Llama-3.2-3B", + "name": "ReWiz-Llama-3.2-3B", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4649, + "hfopenllm_v2/BBH": 0.4343, + "hfopenllm_v2/MATH Level 5": 0.1095, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3614, + "hfopenllm_v2/MMLU-PRO": 0.2887 + } + }, + { + "id": "theprint/ReWiz-Nemo-12B-Instruct", + "name": "ReWiz-Nemo-12B-Instruct", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1062, + "hfopenllm_v2/BBH": 0.5092, + "hfopenllm_v2/MATH Level 5": 0.1042, + "hfopenllm_v2/GPQA": 0.3238, + "hfopenllm_v2/MUSR": 0.4096, + "hfopenllm_v2/MMLU-PRO": 0.3339 + } + }, + { + "id": "theprint/ReWiz-Qwen-2.5-14B", + "name": "ReWiz-Qwen-2.5-14B", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2785, + "hfopenllm_v2/BBH": 0.6179, + "hfopenllm_v2/MATH Level 5": 0.2923, + "hfopenllm_v2/GPQA": 0.38, + "hfopenllm_v2/MUSR": 0.4539, + "hfopenllm_v2/MMLU-PRO": 0.5092 + } + }, + { + "id": "theprint/ReWiz-Worldbuilder-7B", + "name": "ReWiz-Worldbuilder-7B", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.251, + "hfopenllm_v2/BBH": 0.4636, + "hfopenllm_v2/MATH Level 5": 0.037, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.4572, + "hfopenllm_v2/MMLU-PRO": 0.2971 + } + }, + { + "id": "theprint/RuDolph-Hermes-7B", + "name": "RuDolph-Hermes-7B", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3604, + "hfopenllm_v2/BBH": 0.5053, + "hfopenllm_v2/MATH Level 5": 0.0514, + "hfopenllm_v2/GPQA": 0.3121, + "hfopenllm_v2/MUSR": 0.4226, + "hfopenllm_v2/MMLU-PRO": 0.3073 + } + }, + { + "id": "theprint/WorldBuilder-12B", + "name": "WorldBuilder-12B", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1374, + "hfopenllm_v2/BBH": 0.501, + "hfopenllm_v2/MATH Level 5": 0.0446, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4066, + "hfopenllm_v2/MMLU-PRO": 0.3192 + } + }, + { + "id": "theprint/phi-3-mini-4k-python", + "name": "phi-3-mini-4k-python", + "developer": "theprint", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2409, + "hfopenllm_v2/BBH": 0.4938, + "hfopenllm_v2/MATH Level 5": 0.105, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.3922, + "hfopenllm_v2/MMLU-PRO": 0.3577 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/thinkcoder.json b/data/developers/thinkcoder.json new file mode 100644 index 0000000000000000000000000000000000000000..758defeed9e7e5eef830591f8aa3bbc16e3d953e --- /dev/null +++ b/data/developers/thinkcoder.json @@ -0,0 +1,19 @@ +{ + "developer": "thinkcoder", + "models": [ + { + "id": "thinkcoder/llama3-8b-instruct-lora-8-sft", + "name": "llama3-8b-instruct-lora-8-sft", + "developer": "thinkcoder", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.648, + "hfopenllm_v2/BBH": 0.4865, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3235, + "hfopenllm_v2/MMLU-PRO": 0.3476 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/thirdeyeai.json b/data/developers/thirdeyeai.json new file mode 100644 index 0000000000000000000000000000000000000000..eadc873895e11ba0dccbb2fae8600f3324eb346e --- /dev/null +++ b/data/developers/thirdeyeai.json @@ -0,0 +1,19 @@ +{ + "developer": "thirdeyeai", + "models": [ + { + "id": "thirdeyeai/elevate360m", + "name": "elevate360m", + "developer": "thirdeyeai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0445, + "hfopenllm_v2/BBH": 0.2963, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2408, + "hfopenllm_v2/MUSR": 0.3462, + "hfopenllm_v2/MMLU-PRO": 0.1077 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/thomas-yanxin.json b/data/developers/thomas-yanxin.json new file mode 100644 index 0000000000000000000000000000000000000000..73a7fd9f779612c0dc692aaccb0edd0e53f86ba9 --- /dev/null +++ b/data/developers/thomas-yanxin.json @@ -0,0 +1,61 @@ +{ + "developer": "thomas-yanxin", + "models": [ + { + "id": "thomas-yanxin/XinYuan-Qwen2-1_5B", + "name": "XinYuan-Qwen2-1_5B", + "developer": "thomas-yanxin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2986, + "hfopenllm_v2/BBH": 0.3635, + "hfopenllm_v2/MATH Level 5": 0.0672, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3634, + "hfopenllm_v2/MMLU-PRO": 0.2357 + } + }, + { + "id": "thomas-yanxin/XinYuan-Qwen2-7B", + "name": "XinYuan-Qwen2-7B", + "developer": "thomas-yanxin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4438, + "hfopenllm_v2/BBH": 0.4937, + "hfopenllm_v2/MATH Level 5": 0.1458, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4058, + "hfopenllm_v2/MMLU-PRO": 0.3925 + } + }, + { + "id": "thomas-yanxin/XinYuan-Qwen2-7B-0917", + "name": "XinYuan-Qwen2-7B-0917", + "developer": "thomas-yanxin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3719, + "hfopenllm_v2/BBH": 0.5169, + "hfopenllm_v2/MATH Level 5": 0.1979, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4401, + "hfopenllm_v2/MMLU-PRO": 0.4245 + } + }, + { + "id": "thomas-yanxin/XinYuan-Qwen2.5-7B-0917", + "name": "XinYuan-Qwen2.5-7B-0917", + "developer": "thomas-yanxin", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3577, + "hfopenllm_v2/BBH": 0.5184, + "hfopenllm_v2/MATH Level 5": 0.1934, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3676, + "hfopenllm_v2/MMLU-PRO": 0.3882 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/tianyil1.json b/data/developers/tianyil1.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d404fedb4aebf09d86018218a9fc7fb5280009 --- /dev/null +++ b/data/developers/tianyil1.json @@ -0,0 +1,19 @@ +{ + "developer": "tianyil1", + "models": [ + { + "id": "tianyil1/MistralForCausalLM_Cal_DPO", + "name": "MistralForCausalLM_Cal_DPO", + "developer": "tianyil1", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5328, + "hfopenllm_v2/BBH": 0.4381, + "hfopenllm_v2/MATH Level 5": 0.0287, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3977, + "hfopenllm_v2/MMLU-PRO": 0.2763 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/tiiuae.json b/data/developers/tiiuae.json new file mode 100644 index 0000000000000000000000000000000000000000..d7aacc648812adc90e58e9065a4ac9822073e8d5 --- /dev/null +++ b/data/developers/tiiuae.json @@ -0,0 +1,341 @@ +{ + "developer": "tiiuae", + "models": [ + { + "id": "tiiuae/Falcon-40B", + "name": "Falcon 40B", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.729, + "helm_classic/MMLU": 0.509, + "helm_classic/BoolQ": 0.819, + "helm_classic/NarrativeQA": 0.673, + "helm_classic/NaturalQuestions (open-book)": 0.675, + "helm_classic/QuAC": 0.307, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.353, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.959, + "helm_classic/CivilComments": 0.552, + "helm_classic/RAFT": 0.661 + } + }, + { + "id": "tiiuae/Falcon-7B", + "name": "Falcon 7B", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.378, + "helm_classic/MMLU": 0.286, + "helm_classic/BoolQ": 0.753, + "helm_classic/NarrativeQA": 0.621, + "helm_classic/NaturalQuestions (open-book)": 0.579, + "helm_classic/QuAC": 0.332, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.234, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.836, + "helm_classic/CivilComments": 0.514, + "helm_classic/RAFT": 0.602 + } + }, + { + "id": "tiiuae/Falcon-Instruct-40B", + "name": "Falcon-Instruct 40B", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.727, + "helm_classic/MMLU": 0.497, + "helm_classic/BoolQ": 0.829, + "helm_classic/NarrativeQA": 0.625, + "helm_classic/NaturalQuestions (open-book)": 0.666, + "helm_classic/QuAC": 0.371, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.384, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.959, + "helm_classic/CivilComments": 0.603, + "helm_classic/RAFT": 0.586 + } + }, + { + "id": "tiiuae/Falcon-Instruct-7B", + "name": "Falcon-Instruct 7B", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.244, + "helm_classic/MMLU": 0.275, + "helm_classic/BoolQ": 0.72, + "helm_classic/NarrativeQA": 0.476, + "helm_classic/NaturalQuestions (open-book)": 0.449, + "helm_classic/QuAC": 0.311, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.213, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.852, + "helm_classic/CivilComments": 0.511, + "helm_classic/RAFT": 0.523 + } + }, + { + "id": "tiiuae/Falcon3-10B-Base", + "name": "Falcon3-10B-Base", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3648, + "hfopenllm_v2/BBH": 0.595, + "hfopenllm_v2/MATH Level 5": 0.2492, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4398, + "hfopenllm_v2/MMLU-PRO": 0.424 + } + }, + { + "id": "tiiuae/Falcon3-10B-Instruct", + "name": "Falcon3-10B-Instruct", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7817, + "hfopenllm_v2/BBH": 0.617, + "hfopenllm_v2/MATH Level 5": 0.2764, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4323, + "hfopenllm_v2/MMLU-PRO": 0.4429 + } + }, + { + "id": "tiiuae/Falcon3-1B-Base", + "name": "Falcon3-1B-Base", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2428, + "hfopenllm_v2/BBH": 0.3571, + "hfopenllm_v2/MATH Level 5": 0.0332, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.4147, + "hfopenllm_v2/MMLU-PRO": 0.1608 + } + }, + { + "id": "tiiuae/Falcon3-1B-Instruct", + "name": "Falcon3-1B-Instruct", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5557, + "hfopenllm_v2/BBH": 0.3745, + "hfopenllm_v2/MATH Level 5": 0.0634, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.4189, + "hfopenllm_v2/MMLU-PRO": 0.1838 + } + }, + { + "id": "tiiuae/Falcon3-3B-Base", + "name": "Falcon3-3B-Base", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2765, + "hfopenllm_v2/BBH": 0.4421, + "hfopenllm_v2/MATH Level 5": 0.1178, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.375, + "hfopenllm_v2/MMLU-PRO": 0.2879 + } + }, + { + "id": "tiiuae/Falcon3-3B-Instruct", + "name": "Falcon3-3B-Instruct", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6977, + "hfopenllm_v2/BBH": 0.4754, + "hfopenllm_v2/MATH Level 5": 0.25, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4136, + "hfopenllm_v2/MMLU-PRO": 0.3005 + } + }, + { + "id": "tiiuae/Falcon3-7B-Base", + "name": "Falcon3-7B-Base", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3416, + "hfopenllm_v2/BBH": 0.5099, + "hfopenllm_v2/MATH Level 5": 0.1941, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.4702, + "hfopenllm_v2/MMLU-PRO": 0.391 + } + }, + { + "id": "tiiuae/Falcon3-7B-Instruct", + "name": "Falcon3-7B-Instruct", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7612, + "hfopenllm_v2/BBH": 0.5632, + "hfopenllm_v2/MATH Level 5": 0.4086, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.4827, + "hfopenllm_v2/MMLU-PRO": 0.4087 + } + }, + { + "id": "tiiuae/Falcon3-Mamba-7B-Base", + "name": "Falcon3-Mamba-7B-Base", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2891, + "hfopenllm_v2/BBH": 0.4699, + "hfopenllm_v2/MATH Level 5": 0.1941, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.3431, + "hfopenllm_v2/MMLU-PRO": 0.3038 + } + }, + { + "id": "tiiuae/Falcon3-Mamba-7B-Instruct", + "name": "Falcon3-Mamba-7B-Instruct", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7165, + "hfopenllm_v2/BBH": 0.4679, + "hfopenllm_v2/MATH Level 5": 0.3006, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.3869, + "hfopenllm_v2/MMLU-PRO": 0.3369 + } + }, + { + "id": "tiiuae/falcon-11B", + "name": "falcon-11B", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3261, + "hfopenllm_v2/BBH": 0.4392, + "hfopenllm_v2/MATH Level 5": 0.0279, + "hfopenllm_v2/GPQA": 0.271, + "hfopenllm_v2/MUSR": 0.3986, + "hfopenllm_v2/MMLU-PRO": 0.2389 + } + }, + { + "id": "tiiuae/falcon-40b", + "name": "Falcon 40B", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.217, + "helm_lite/NarrativeQA": 0.671, + "helm_lite/NaturalQuestions (closed-book)": 0.392, + "helm_lite/OpenbookQA": 0.662, + "helm_lite/MMLU": 0.507, + "helm_lite/MATH": 0.128, + "helm_lite/GSM8K": 0.267, + "helm_lite/LegalBench": 0.442, + "helm_lite/MedQA": 0.419, + "helm_lite/WMT 2014": 0.162, + "hfopenllm_v2/IFEval": 0.2496, + "hfopenllm_v2/BBH": 0.4019, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3631, + "hfopenllm_v2/MMLU-PRO": 0.2505 + } + }, + { + "id": "tiiuae/falcon-40b-instruct", + "name": "falcon-40b-instruct", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2454, + "hfopenllm_v2/BBH": 0.4054, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3762, + "hfopenllm_v2/MMLU-PRO": 0.2261 + } + }, + { + "id": "tiiuae/falcon-7b", + "name": "Falcon 7B", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.064, + "helm_lite/NarrativeQA": 0.621, + "helm_lite/NaturalQuestions (closed-book)": 0.285, + "helm_lite/OpenbookQA": 0.26, + "helm_lite/MMLU": 0.288, + "helm_lite/MATH": 0.044, + "helm_lite/GSM8K": 0.055, + "helm_lite/LegalBench": 0.346, + "helm_lite/MedQA": 0.254, + "helm_lite/WMT 2014": 0.094, + "hfopenllm_v2/IFEval": 0.1821, + "hfopenllm_v2/BBH": 0.3285, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.245, + "hfopenllm_v2/MUSR": 0.3778, + "hfopenllm_v2/MMLU-PRO": 0.1125 + } + }, + { + "id": "tiiuae/falcon-7b-instruct", + "name": "falcon-7b-instruct", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1969, + "hfopenllm_v2/BBH": 0.3203, + "hfopenllm_v2/MATH Level 5": 0.0121, + "hfopenllm_v2/GPQA": 0.2475, + "hfopenllm_v2/MUSR": 0.3634, + "hfopenllm_v2/MMLU-PRO": 0.1155 + } + }, + { + "id": "tiiuae/falcon-mamba-7b", + "name": "falcon-mamba-7b", + "developer": "tiiuae", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3336, + "hfopenllm_v2/BBH": 0.4285, + "hfopenllm_v2/MATH Level 5": 0.0446, + "hfopenllm_v2/GPQA": 0.3104, + "hfopenllm_v2/MUSR": 0.421, + "hfopenllm_v2/MMLU-PRO": 0.2302 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/tinycompany.json b/data/developers/tinycompany.json new file mode 100644 index 0000000000000000000000000000000000000000..aefb31ec443bdaa534331eee717201b46eaa5bf5 --- /dev/null +++ b/data/developers/tinycompany.json @@ -0,0 +1,215 @@ +{ + "developer": "tinycompany", + "models": [ + { + "id": "tinycompany/BiBo-v0.3", + "name": "BiBo-v0.3", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5184, + "hfopenllm_v2/BBH": 0.4642, + "hfopenllm_v2/MATH Level 5": 0.0876, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.395, + "hfopenllm_v2/MMLU-PRO": 0.2995 + } + }, + { + "id": "tinycompany/BiBo-v0.7", + "name": "BiBo-v0.7", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3738, + "hfopenllm_v2/BBH": 0.4311, + "hfopenllm_v2/MATH Level 5": 0.0823, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.4044, + "hfopenllm_v2/MMLU-PRO": 0.265 + } + }, + { + "id": "tinycompany/ShawtyIsBad-bgem3", + "name": "ShawtyIsBad-bgem3", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2608, + "hfopenllm_v2/BBH": 0.3853, + "hfopenllm_v2/MATH Level 5": 0.0483, + "hfopenllm_v2/GPQA": 0.3054, + "hfopenllm_v2/MUSR": 0.3695, + "hfopenllm_v2/MMLU-PRO": 0.2583 + } + }, + { + "id": "tinycompany/ShawtyIsBad-e5-large", + "name": "ShawtyIsBad-e5-large", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2468, + "hfopenllm_v2/BBH": 0.3873, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.372, + "hfopenllm_v2/MMLU-PRO": 0.2569 + } + }, + { + "id": "tinycompany/ShawtyIsBad-ib", + "name": "ShawtyIsBad-ib", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2565, + "hfopenllm_v2/BBH": 0.388, + "hfopenllm_v2/MATH Level 5": 0.0491, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.3641, + "hfopenllm_v2/MMLU-PRO": 0.2581 + } + }, + { + "id": "tinycompany/ShawtyIsBad-nomic-moe", + "name": "ShawtyIsBad-nomic-moe", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2608, + "hfopenllm_v2/BBH": 0.3878, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.307, + "hfopenllm_v2/MUSR": 0.3747, + "hfopenllm_v2/MMLU-PRO": 0.2572 + } + }, + { + "id": "tinycompany/ShawtyIsBad-nomic1.5", + "name": "ShawtyIsBad-nomic1.5", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2544, + "hfopenllm_v2/BBH": 0.3874, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.3112, + "hfopenllm_v2/MUSR": 0.3628, + "hfopenllm_v2/MMLU-PRO": 0.2567 + } + }, + { + "id": "tinycompany/SigmaBoi-base", + "name": "SigmaBoi-base", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2447, + "hfopenllm_v2/BBH": 0.4314, + "hfopenllm_v2/MATH Level 5": 0.0778, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.4343, + "hfopenllm_v2/MMLU-PRO": 0.2817 + } + }, + { + "id": "tinycompany/SigmaBoi-bge-m3", + "name": "SigmaBoi-bge-m3", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.245, + "hfopenllm_v2/BBH": 0.4351, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4383, + "hfopenllm_v2/MMLU-PRO": 0.2819 + } + }, + { + "id": "tinycompany/SigmaBoi-bgem3", + "name": "SigmaBoi-bgem3", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.245, + "hfopenllm_v2/BBH": 0.4351, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4383, + "hfopenllm_v2/MMLU-PRO": 0.2819 + } + }, + { + "id": "tinycompany/SigmaBoi-ib", + "name": "SigmaBoi-ib", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2477, + "hfopenllm_v2/BBH": 0.4344, + "hfopenllm_v2/MATH Level 5": 0.074, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.429, + "hfopenllm_v2/MMLU-PRO": 0.2824 + } + }, + { + "id": "tinycompany/SigmaBoi-nomic-moe", + "name": "SigmaBoi-nomic-moe", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2474, + "hfopenllm_v2/BBH": 0.4334, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.4316, + "hfopenllm_v2/MMLU-PRO": 0.2837 + } + }, + { + "id": "tinycompany/SigmaBoi-nomic1.5", + "name": "SigmaBoi-nomic1.5", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2447, + "hfopenllm_v2/BBH": 0.4371, + "hfopenllm_v2/MATH Level 5": 0.0831, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4316, + "hfopenllm_v2/MMLU-PRO": 0.2841 + } + }, + { + "id": "tinycompany/SigmaBoi-nomic1.5-fp32", + "name": "SigmaBoi-nomic1.5-fp32", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2462, + "hfopenllm_v2/BBH": 0.4371, + "hfopenllm_v2/MATH Level 5": 0.0831, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4316, + "hfopenllm_v2/MMLU-PRO": 0.2841 + } + }, + { + "id": "tinycompany/Tamed-Shawty", + "name": "Tamed-Shawty", + "developer": "tinycompany", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3831, + "hfopenllm_v2/BBH": 0.3837, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.2626, + "hfopenllm_v2/MUSR": 0.3501, + "hfopenllm_v2/MMLU-PRO": 0.2601 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/tklohj.json b/data/developers/tklohj.json new file mode 100644 index 0000000000000000000000000000000000000000..b7fe7986665fbf31cb59d414701c967231e18665 --- /dev/null +++ b/data/developers/tklohj.json @@ -0,0 +1,19 @@ +{ + "developer": "tklohj", + "models": [ + { + "id": "tklohj/WindyFloLLM", + "name": "WindyFloLLM", + "developer": "tklohj", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2669, + "hfopenllm_v2/BBH": 0.4637, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.4253, + "hfopenllm_v2/MMLU-PRO": 0.2581 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/together.json b/data/developers/together.json new file mode 100644 index 0000000000000000000000000000000000000000..406f7bcf87e4781c2c0c59b6c4757d81c413c501 --- /dev/null +++ b/data/developers/together.json @@ -0,0 +1,97 @@ +{ + "developer": "together", + "models": [ + { + "id": "together/RedPajama-INCITE-Base-7B", + "name": "RedPajama-INCITE-Base 7B", + "developer": "together", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.378, + "helm_classic/MMLU": 0.302, + "helm_classic/BoolQ": 0.713, + "helm_classic/NarrativeQA": 0.617, + "helm_classic/NaturalQuestions (open-book)": 0.586, + "helm_classic/QuAC": 0.336, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.205, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.752, + "helm_classic/CivilComments": 0.547, + "helm_classic/RAFT": 0.648 + } + }, + { + "id": "together/RedPajama-INCITE-Base-v1-3B", + "name": "RedPajama-INCITE-Base-v1 3B", + "developer": "together", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.311, + "helm_classic/MMLU": 0.263, + "helm_classic/BoolQ": 0.685, + "helm_classic/NarrativeQA": 0.555, + "helm_classic/NaturalQuestions (open-book)": 0.52, + "helm_classic/QuAC": 0.309, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.277, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.907, + "helm_classic/CivilComments": 0.549, + "helm_classic/RAFT": 0.502 + } + }, + { + "id": "together/RedPajama-INCITE-Instruct-7B", + "name": "RedPajama-INCITE-Instruct 7B", + "developer": "together", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.524, + "helm_classic/MMLU": 0.363, + "helm_classic/BoolQ": 0.705, + "helm_classic/NarrativeQA": 0.638, + "helm_classic/NaturalQuestions (open-book)": 0.659, + "helm_classic/QuAC": 0.26, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.243, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.927, + "helm_classic/CivilComments": 0.664, + "helm_classic/RAFT": 0.695 + } + }, + { + "id": "together/RedPajama-INCITE-Instruct-v1-3B", + "name": "RedPajama-INCITE-Instruct-v1 3B", + "developer": "together", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.366, + "helm_classic/MMLU": 0.257, + "helm_classic/BoolQ": 0.677, + "helm_classic/NarrativeQA": 0.638, + "helm_classic/NaturalQuestions (open-book)": 0.637, + "helm_classic/QuAC": 0.259, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.208, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": -1.0, + "helm_classic/XSUM": -1.0, + "helm_classic/IMDB": 0.894, + "helm_classic/CivilComments": 0.549, + "helm_classic/RAFT": 0.661 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/togethercomputer.json b/data/developers/togethercomputer.json new file mode 100644 index 0000000000000000000000000000000000000000..152a501a0475b14c1abac56c7280e5ff47fb39ab --- /dev/null +++ b/data/developers/togethercomputer.json @@ -0,0 +1,145 @@ +{ + "developer": "togethercomputer", + "models": [ + { + "id": "togethercomputer/GPT-JT-6B-v1", + "name": "GPT-JT-6B-v1", + "developer": "togethercomputer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2061, + "hfopenllm_v2/BBH": 0.3303, + "hfopenllm_v2/MATH Level 5": 0.0106, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3737, + "hfopenllm_v2/MMLU-PRO": 0.1626 + } + }, + { + "id": "togethercomputer/GPT-NeoXT-Chat-Base-20B", + "name": "GPT-NeoXT-Chat-Base-20B", + "developer": "togethercomputer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.183, + "hfopenllm_v2/BBH": 0.3321, + "hfopenllm_v2/MATH Level 5": 0.0234, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3461, + "hfopenllm_v2/MMLU-PRO": 0.1145 + } + }, + { + "id": "togethercomputer/LLaMA-2-7B-32K", + "name": "LLaMA-2-7B-32K", + "developer": "togethercomputer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1865, + "hfopenllm_v2/BBH": 0.34, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3754, + "hfopenllm_v2/MMLU-PRO": 0.1768 + } + }, + { + "id": "togethercomputer/Llama-2-7B-32K-Instruct", + "name": "Llama-2-7B-32K-Instruct", + "developer": "togethercomputer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.213, + "hfopenllm_v2/BBH": 0.3443, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.2517, + "hfopenllm_v2/MUSR": 0.4056, + "hfopenllm_v2/MMLU-PRO": 0.1781 + } + }, + { + "id": "togethercomputer/RedPajama-INCITE-7B-Base", + "name": "RedPajama-INCITE-7B-Base", + "developer": "togethercomputer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2082, + "hfopenllm_v2/BBH": 0.3195, + "hfopenllm_v2/MATH Level 5": 0.0159, + "hfopenllm_v2/GPQA": 0.255, + "hfopenllm_v2/MUSR": 0.362, + "hfopenllm_v2/MMLU-PRO": 0.1197 + } + }, + { + "id": "togethercomputer/RedPajama-INCITE-7B-Chat", + "name": "RedPajama-INCITE-7B-Chat", + "developer": "togethercomputer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1558, + "hfopenllm_v2/BBH": 0.3175, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2525, + "hfopenllm_v2/MUSR": 0.3448, + "hfopenllm_v2/MMLU-PRO": 0.1121 + } + }, + { + "id": "togethercomputer/RedPajama-INCITE-7B-Instruct", + "name": "RedPajama-INCITE-7B-Instruct", + "developer": "togethercomputer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2055, + "hfopenllm_v2/BBH": 0.3377, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3685, + "hfopenllm_v2/MMLU-PRO": 0.1272 + } + }, + { + "id": "togethercomputer/RedPajama-INCITE-Base-3B-v1", + "name": "RedPajama-INCITE-Base-3B-v1", + "developer": "togethercomputer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2294, + "hfopenllm_v2/BBH": 0.306, + "hfopenllm_v2/MATH Level 5": 0.0144, + "hfopenllm_v2/GPQA": 0.2433, + "hfopenllm_v2/MUSR": 0.3739, + "hfopenllm_v2/MMLU-PRO": 0.1111 + } + }, + { + "id": "togethercomputer/RedPajama-INCITE-Chat-3B-v1", + "name": "RedPajama-INCITE-Chat-3B-v1", + "developer": "togethercomputer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1652, + "hfopenllm_v2/BBH": 0.3217, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2441, + "hfopenllm_v2/MUSR": 0.3684, + "hfopenllm_v2/MMLU-PRO": 0.1127 + } + }, + { + "id": "togethercomputer/RedPajama-INCITE-Instruct-3B-v1", + "name": "RedPajama-INCITE-Instruct-3B-v1", + "developer": "togethercomputer", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2124, + "hfopenllm_v2/BBH": 0.3146, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2475, + "hfopenllm_v2/MUSR": 0.3886, + "hfopenllm_v2/MMLU-PRO": 0.111 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/tokyotech-llm.json b/data/developers/tokyotech-llm.json new file mode 100644 index 0000000000000000000000000000000000000000..d89a0c9d89d224fd8856c2b701dbf3579e2488c2 --- /dev/null +++ b/data/developers/tokyotech-llm.json @@ -0,0 +1,19 @@ +{ + "developer": "tokyotech-llm", + "models": [ + { + "id": "tokyotech-llm/Llama-3-Swallow-8B-Instruct-v0.1", + "name": "Llama-3-Swallow-8B-Instruct-v0.1", + "developer": "tokyotech-llm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5508, + "hfopenllm_v2/BBH": 0.5009, + "hfopenllm_v2/MATH Level 5": 0.0748, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4357, + "hfopenllm_v2/MMLU-PRO": 0.3088 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/tomasmcm.json b/data/developers/tomasmcm.json new file mode 100644 index 0000000000000000000000000000000000000000..a65cb2a8174542e5cfaecbe8a958bccab476acc2 --- /dev/null +++ b/data/developers/tomasmcm.json @@ -0,0 +1,19 @@ +{ + "developer": "tomasmcm", + "models": [ + { + "id": "tomasmcm/sky-t1-coder-32b-flash", + "name": "sky-t1-coder-32b-flash", + "developer": "tomasmcm", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.778, + "hfopenllm_v2/BBH": 0.6822, + "hfopenllm_v2/MATH Level 5": 0.5423, + "hfopenllm_v2/GPQA": 0.3683, + "hfopenllm_v2/MUSR": 0.4233, + "hfopenllm_v2/MMLU-PRO": 0.5782 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/trthminh1112.json b/data/developers/trthminh1112.json new file mode 100644 index 0000000000000000000000000000000000000000..8775717a53c5fdbd88894639f6139f7ec78bafc4 --- /dev/null +++ b/data/developers/trthminh1112.json @@ -0,0 +1,19 @@ +{ + "developer": "trthminh1112", + "models": [ + { + "id": "trthminh1112/autotrain-llama32-1b-finetune", + "name": "autotrain-llama32-1b-finetune", + "developer": "trthminh1112", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1769, + "hfopenllm_v2/BBH": 0.2996, + "hfopenllm_v2/MATH Level 5": 0.0151, + "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/MUSR": 0.3513, + "hfopenllm_v2/MMLU-PRO": 0.1099 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/tugstugi.json b/data/developers/tugstugi.json new file mode 100644 index 0000000000000000000000000000000000000000..7d7f6df5a055f9cd8d1ca8265a8df27a0c2b2174 --- /dev/null +++ b/data/developers/tugstugi.json @@ -0,0 +1,19 @@ +{ + "developer": "tugstugi", + "models": [ + { + "id": "tugstugi/Qwen2.5-7B-Instruct-QwQ-v0.1", + "name": "Qwen2.5-7B-Instruct-QwQ-v0.1", + "developer": "tugstugi", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6017, + "hfopenllm_v2/BBH": 0.5101, + "hfopenllm_v2/MATH Level 5": 0.3814, + "hfopenllm_v2/GPQA": 0.2685, + "hfopenllm_v2/MUSR": 0.3794, + "hfopenllm_v2/MMLU-PRO": 0.4081 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/universalml.json b/data/developers/universalml.json new file mode 100644 index 0000000000000000000000000000000000000000..6a875805611a502f473751f7dcb735d0ed736e8b --- /dev/null +++ b/data/developers/universalml.json @@ -0,0 +1,19 @@ +{ + "developer": "universalml", + "models": [ + { + "id": "universalml/NepaliGPT-2.0", + "name": "NepaliGPT-2.0", + "developer": "universalml", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.0365, + "hfopenllm_v2/BBH": 0.466, + "hfopenllm_v2/MATH Level 5": 0.0045, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4657, + "hfopenllm_v2/MMLU-PRO": 0.33 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/unknown.json b/data/developers/unknown.json new file mode 100644 index 0000000000000000000000000000000000000000..3c55382e79aaa4313dd242546596ab28b509e7ca --- /dev/null +++ b/data/developers/unknown.json @@ -0,0 +1,150 @@ +{ + "developer": "unknown", + "models": [ + { + "id": "Anthropic-LM-v4-s3-52B", + "name": "Anthropic-LM v4-s3 52B", + "developer": "unknown", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.78, + "helm_classic/MMLU": 0.481, + "helm_classic/BoolQ": 0.815, + "helm_classic/NarrativeQA": 0.728, + "helm_classic/NaturalQuestions (open-book)": 0.686, + "helm_classic/QuAC": 0.431, + "helm_classic/HellaSwag": 0.807, + "helm_classic/OpenbookQA": 0.558, + "helm_classic/TruthfulQA": 0.368, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": 0.154, + "helm_classic/XSUM": 0.134, + "helm_classic/IMDB": 0.934, + "helm_classic/CivilComments": 0.61, + "helm_classic/RAFT": 0.699 + } + }, + { + "id": "Cohere March 2024", + "name": "Cohere March 2024", + "developer": "unknown", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8511, + "reward-bench/Chat": 0.9469, + "reward-bench/Chat Hard": 0.6513, + "reward-bench/Safety": 0.877, + "reward-bench/Reasoning": 0.9817, + "reward-bench/Prior Sets (0.5 weight)": 0.7458 + } + }, + { + "id": "Cohere May 2024", + "name": "Cohere May 2024", + "developer": "unknown", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.8816, + "reward-bench/Chat": 0.9637, + "reward-bench/Chat Hard": 0.7127, + "reward-bench/Safety": 0.923, + "reward-bench/Reasoning": 0.9768, + "reward-bench/Prior Sets (0.5 weight)": 0.782 + } + }, + { + "id": "gemini-1.5-flash-8b", + "name": "gemini-1.5-flash-8b", + "developer": "unknown", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7601, + "reward-bench/Chat": 0.9441, + "reward-bench/Chat Hard": 0.5987, + "reward-bench/Safety": 0.7399, + "reward-bench/Reasoning": 0.7575 + } + }, + { + "id": "unknown/aya-expanse-32b", + "name": "aya-expanse-32b", + "developer": "unknown", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.7353, + "global-mmlu-lite/Culturally Sensitive": 0.6891, + "global-mmlu-lite/Culturally Agnostic": 0.7815, + "global-mmlu-lite/Arabic": 0.7425, + "global-mmlu-lite/English": 0.7544, + "global-mmlu-lite/Bengali": 0.7343, + "global-mmlu-lite/German": 0.7425, + "global-mmlu-lite/French": 0.7325, + "global-mmlu-lite/Hindi": 0.7375, + "global-mmlu-lite/Indonesian": 0.7594, + "global-mmlu-lite/Italian": 0.7305, + "global-mmlu-lite/Japanese": 0.7419, + "global-mmlu-lite/Korean": 0.7525, + "global-mmlu-lite/Portuguese": 0.7544, + "global-mmlu-lite/Spanish": 0.7362, + "global-mmlu-lite/Swahili": 0.7071, + "global-mmlu-lite/Yoruba": 0.6942, + "global-mmlu-lite/Chinese": 0.743, + "global-mmlu-lite/Burmese": 0.7025 + } + }, + { + "id": "unknown/granite-4.0-h-small", + "name": "granite-4.0-h-small", + "developer": "unknown", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.7503, + "global-mmlu-lite/Culturally Sensitive": 0.7182, + "global-mmlu-lite/Culturally Agnostic": 0.7826, + "global-mmlu-lite/Arabic": 0.7613, + "global-mmlu-lite/English": 0.77, + "global-mmlu-lite/Bengali": 0.7613, + "global-mmlu-lite/German": 0.755, + "global-mmlu-lite/French": 0.7594, + "global-mmlu-lite/Hindi": 0.7575, + "global-mmlu-lite/Indonesian": 0.7614, + "global-mmlu-lite/Italian": 0.7525, + "global-mmlu-lite/Japanese": 0.7406, + "global-mmlu-lite/Korean": 0.7525, + "global-mmlu-lite/Portuguese": 0.757, + "global-mmlu-lite/Spanish": 0.7638, + "global-mmlu-lite/Swahili": 0.7318, + "global-mmlu-lite/Yoruba": 0.6921, + "global-mmlu-lite/Chinese": 0.7475, + "global-mmlu-lite/Burmese": 0.7419 + } + }, + { + "id": "unknown/o4-mini-2025-04-16", + "name": "o4-mini-2025-04-16", + "developer": "unknown", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.8705, + "global-mmlu-lite/Culturally Sensitive": 0.8503, + "global-mmlu-lite/Culturally Agnostic": 0.8906, + "global-mmlu-lite/Arabic": 0.865, + "global-mmlu-lite/English": 0.8675, + "global-mmlu-lite/Bengali": 0.8875, + "global-mmlu-lite/German": 0.8775, + "global-mmlu-lite/French": 0.87, + "global-mmlu-lite/Hindi": 0.87, + "global-mmlu-lite/Indonesian": 0.8675, + "global-mmlu-lite/Italian": 0.855, + "global-mmlu-lite/Japanese": 0.885, + "global-mmlu-lite/Korean": 0.88, + "global-mmlu-lite/Portuguese": 0.88, + "global-mmlu-lite/Spanish": 0.855, + "global-mmlu-lite/Swahili": 0.8525, + "global-mmlu-lite/Yoruba": 0.8525, + "global-mmlu-lite/Chinese": 0.89, + "global-mmlu-lite/Burmese": 0.8725 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/unsloth.json b/data/developers/unsloth.json new file mode 100644 index 0000000000000000000000000000000000000000..9fce6b255e1d7c1ace5fec08071a66eeeff9c3b7 --- /dev/null +++ b/data/developers/unsloth.json @@ -0,0 +1,89 @@ +{ + "developer": "unsloth", + "models": [ + { + "id": "unsloth/Llama-3.2-1B-Instruct", + "name": "Llama-3.2-1B-Instruct", + "developer": "unsloth", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.581, + "hfopenllm_v2/BBH": 0.3485, + "hfopenllm_v2/MATH Level 5": 0.0823, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.3196, + "hfopenllm_v2/MMLU-PRO": 0.1742 + } + }, + { + "id": "unsloth/Llama-3.2-1B-Instruct-no-system-message", + "name": "Llama-3.2-1B-Instruct-no-system-message", + "developer": "unsloth", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.565, + "hfopenllm_v2/BBH": 0.3544, + "hfopenllm_v2/MATH Level 5": 0.0755, + "hfopenllm_v2/GPQA": 0.2727, + "hfopenllm_v2/MUSR": 0.3341, + "hfopenllm_v2/MMLU-PRO": 0.1669 + } + }, + { + "id": "unsloth/Phi-3-mini-4k-instruct", + "name": "Phi-3-mini-4k-instruct", + "developer": "unsloth", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.544, + "hfopenllm_v2/BBH": 0.55, + "hfopenllm_v2/MATH Level 5": 0.1639, + "hfopenllm_v2/GPQA": 0.323, + "hfopenllm_v2/MUSR": 0.4284, + "hfopenllm_v2/MMLU-PRO": 0.4031 + } + }, + { + "id": "unsloth/phi-4", + "name": "phi-4", + "developer": "unsloth", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6882, + "hfopenllm_v2/BBH": 0.6886, + "hfopenllm_v2/MATH Level 5": 0.5, + "hfopenllm_v2/GPQA": 0.3364, + "hfopenllm_v2/MUSR": 0.4114, + "hfopenllm_v2/MMLU-PRO": 0.5378 + } + }, + { + "id": "unsloth/phi-4-bnb-4bit", + "name": "phi-4-bnb-4bit", + "developer": "unsloth", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.673, + "hfopenllm_v2/BBH": 0.677, + "hfopenllm_v2/MATH Level 5": 0.4607, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4007, + "hfopenllm_v2/MMLU-PRO": 0.5256 + } + }, + { + "id": "unsloth/phi-4-unsloth-bnb-4bit", + "name": "phi-4-unsloth-bnb-4bit", + "developer": "unsloth", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6794, + "hfopenllm_v2/BBH": 0.6791, + "hfopenllm_v2/MATH Level 5": 0.4562, + "hfopenllm_v2/GPQA": 0.3364, + "hfopenllm_v2/MUSR": 0.4034, + "hfopenllm_v2/MMLU-PRO": 0.5286 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/upstage.json b/data/developers/upstage.json new file mode 100644 index 0000000000000000000000000000000000000000..ca8a8f601c091f416d00421e8a147f7e016ea392 --- /dev/null +++ b/data/developers/upstage.json @@ -0,0 +1,107 @@ +{ + "developer": "upstage", + "models": [ + { + "id": "upstage/SOLAR-10.7B-Instruct-v1.0", + "name": "SOLAR-10.7B-Instruct-v1.0", + "developer": "upstage", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4737, + "hfopenllm_v2/BBH": 0.5162, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.3899, + "hfopenllm_v2/MMLU-PRO": 0.3138, + "reward-bench/Score": 0.7391, + "reward-bench/Chat": 0.8156, + "reward-bench/Chat Hard": 0.6864, + "reward-bench/Safety": 0.8514, + "reward-bench/Reasoning": 0.7252, + "reward-bench/Prior Sets (0.5 weight)": 0.4949 + } + }, + { + "id": "upstage/SOLAR-10.7B-v1.0", + "name": "SOLAR-10.7B-v1.0", + "developer": "upstage", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2421, + "hfopenllm_v2/BBH": 0.5094, + "hfopenllm_v2/MATH Level 5": 0.0264, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4372, + "hfopenllm_v2/MMLU-PRO": 0.34 + } + }, + { + "id": "upstage/solar-pro-241126", + "name": "Solar Pro", + "developer": "upstage", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.602, + "helm_lite/NarrativeQA": 0.753, + "helm_lite/NaturalQuestions (closed-book)": 0.297, + "helm_lite/OpenbookQA": 0.922, + "helm_lite/MMLU": 0.679, + "helm_lite/MATH": 0.567, + "helm_lite/GSM8K": 0.871, + "helm_lite/LegalBench": 0.67, + "helm_lite/MedQA": 0.698, + "helm_lite/WMT 2014": 0.169, + "helm_mmlu/MMLU All Subjects": 0.776, + "helm_mmlu/Abstract Algebra": 0.46, + "helm_mmlu/Anatomy": 0.719, + "helm_mmlu/College Physics": 0.559, + "helm_mmlu/Computer Security": 0.82, + "helm_mmlu/Econometrics": 0.605, + "helm_mmlu/Global Facts": 0.5, + "helm_mmlu/Jurisprudence": 0.898, + "helm_mmlu/Philosophy": 0.817, + "helm_mmlu/Professional Psychology": 0.85, + "helm_mmlu/Us Foreign Policy": 0.97, + "helm_mmlu/Astronomy": 0.868, + "helm_mmlu/Business Ethics": 0.8, + "helm_mmlu/Clinical Knowledge": 0.808, + "helm_mmlu/Conceptual Physics": 0.826, + "helm_mmlu/Electrical Engineering": 0.697, + "helm_mmlu/Elementary Mathematics": 0.611, + "helm_mmlu/Formal Logic": 0.579, + "helm_mmlu/High School World History": 0.907, + "helm_mmlu/Human Sexuality": 0.847, + "helm_mmlu/International Law": 0.901, + "helm_mmlu/Logical Fallacies": 0.865, + "helm_mmlu/Machine Learning": 0.616, + "helm_mmlu/Management": 0.864, + "helm_mmlu/Marketing": 0.953, + "helm_mmlu/Medical Genetics": 0.91, + "helm_mmlu/Miscellaneous": 0.888, + "helm_mmlu/Moral Scenarios": 0.811, + "helm_mmlu/Nutrition": 0.859, + "helm_mmlu/Prehistory": 0.867, + "helm_mmlu/Public Relations": 0.764, + "helm_mmlu/Security Studies": 0.82, + "helm_mmlu/Sociology": 0.886, + "helm_mmlu/Virology": 0.572, + "helm_mmlu/World Religions": 0.883, + "helm_mmlu/Mean win rate": 0.462 + } + }, + { + "id": "upstage/solar-pro-preview-instruct", + "name": "solar-pro-preview-instruct", + "developer": "upstage", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8416, + "hfopenllm_v2/BBH": 0.6817, + "hfopenllm_v2/MATH Level 5": 0.2205, + "hfopenllm_v2/GPQA": 0.3708, + "hfopenllm_v2/MUSR": 0.4417, + "hfopenllm_v2/MMLU-PRO": 0.5273 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/utkmst.json b/data/developers/utkmst.json new file mode 100644 index 0000000000000000000000000000000000000000..40be9162c0df392098568cdd7f1a63f35dd4f11b --- /dev/null +++ b/data/developers/utkmst.json @@ -0,0 +1,19 @@ +{ + "developer": "utkmst", + "models": [ + { + "id": "utkmst/chimera-beta-test2-lora-merged", + "name": "chimera-beta-test2-lora-merged", + "developer": "utkmst", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6054, + "hfopenllm_v2/BBH": 0.4796, + "hfopenllm_v2/MATH Level 5": 0.0952, + "hfopenllm_v2/GPQA": 0.3037, + "hfopenllm_v2/MUSR": 0.4118, + "hfopenllm_v2/MMLU-PRO": 0.2992 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/uukuguy.json b/data/developers/uukuguy.json new file mode 100644 index 0000000000000000000000000000000000000000..7d022d1529e04b5afb867ef1b09439a985a98928 --- /dev/null +++ b/data/developers/uukuguy.json @@ -0,0 +1,103 @@ +{ + "developer": "uukuguy", + "models": [ + { + "id": "uukuguy/speechless-code-mistral-7b-v1.0", + "name": "speechless-code-mistral-7b-v1.0", + "developer": "uukuguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3665, + "hfopenllm_v2/BBH": 0.4572, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4502, + "hfopenllm_v2/MMLU-PRO": 0.3146 + } + }, + { + "id": "uukuguy/speechless-codellama-34b-v2.0", + "name": "speechless-codellama-34b-v2.0", + "developer": "uukuguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4604, + "hfopenllm_v2/BBH": 0.4813, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3787, + "hfopenllm_v2/MMLU-PRO": 0.2542 + } + }, + { + "id": "uukuguy/speechless-coder-ds-6.7b", + "name": "speechless-coder-ds-6.7b", + "developer": "uukuguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2505, + "hfopenllm_v2/BBH": 0.4036, + "hfopenllm_v2/MATH Level 5": 0.0211, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.3819, + "hfopenllm_v2/MMLU-PRO": 0.1719 + } + }, + { + "id": "uukuguy/speechless-instruct-mistral-7b-v0.2", + "name": "speechless-instruct-mistral-7b-v0.2", + "developer": "uukuguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3261, + "hfopenllm_v2/BBH": 0.4607, + "hfopenllm_v2/MATH Level 5": 0.0491, + "hfopenllm_v2/GPQA": 0.2819, + "hfopenllm_v2/MUSR": 0.4902, + "hfopenllm_v2/MMLU-PRO": 0.2902 + } + }, + { + "id": "uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b", + "name": "speechless-llama2-hermes-orca-platypus-wizardlm-13b", + "developer": "uukuguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4562, + "hfopenllm_v2/BBH": 0.4846, + "hfopenllm_v2/MATH Level 5": 0.0204, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.4655, + "hfopenllm_v2/MMLU-PRO": 0.2559 + } + }, + { + "id": "uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b", + "name": "speechless-mistral-dolphin-orca-platypus-samantha-7b", + "developer": "uukuguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.37, + "hfopenllm_v2/BBH": 0.4983, + "hfopenllm_v2/MATH Level 5": 0.0295, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.4361, + "hfopenllm_v2/MMLU-PRO": 0.299 + } + }, + { + "id": "uukuguy/speechless-zephyr-code-functionary-7b", + "name": "speechless-zephyr-code-functionary-7b", + "developer": "uukuguy", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2696, + "hfopenllm_v2/BBH": 0.4664, + "hfopenllm_v2/MATH Level 5": 0.0423, + "hfopenllm_v2/GPQA": 0.3003, + "hfopenllm_v2/MUSR": 0.4268, + "hfopenllm_v2/MMLU-PRO": 0.3094 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/v000000.json b/data/developers/v000000.json new file mode 100644 index 0000000000000000000000000000000000000000..5e3125570bb166596fe3c396ac640a981eb6699e --- /dev/null +++ b/data/developers/v000000.json @@ -0,0 +1,89 @@ +{ + "developer": "v000000", + "models": [ + { + "id": "v000000/L3-8B-Stheno-v3.2-abliterated", + "name": "L3-8B-Stheno-v3.2-abliterated", + "developer": "v000000", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6718, + "hfopenllm_v2/BBH": 0.5141, + "hfopenllm_v2/MATH Level 5": 0.0695, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.362, + "hfopenllm_v2/MMLU-PRO": 0.3604 + } + }, + { + "id": "v000000/L3.1-Niitorm-8B-DPO-t0.0001", + "name": "L3.1-Niitorm-8B-DPO-t0.0001", + "developer": "v000000", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7689, + "hfopenllm_v2/BBH": 0.5134, + "hfopenllm_v2/MATH Level 5": 0.1624, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.388, + "hfopenllm_v2/MMLU-PRO": 0.3866 + } + }, + { + "id": "v000000/L3.1-Storniitova-8B", + "name": "L3.1-Storniitova-8B", + "developer": "v000000", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7817, + "hfopenllm_v2/BBH": 0.5151, + "hfopenllm_v2/MATH Level 5": 0.1465, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.4029, + "hfopenllm_v2/MMLU-PRO": 0.3776 + } + }, + { + "id": "v000000/Qwen2.5-14B-Gutenberg-1e-Delta", + "name": "Qwen2.5-14B-Gutenberg-1e-Delta", + "developer": "v000000", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8045, + "hfopenllm_v2/BBH": 0.6398, + "hfopenllm_v2/MATH Level 5": 0.5264, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4073, + "hfopenllm_v2/MMLU-PRO": 0.493 + } + }, + { + "id": "v000000/Qwen2.5-14B-Gutenberg-Instruct-Slerpeno", + "name": "Qwen2.5-14B-Gutenberg-Instruct-Slerpeno", + "developer": "v000000", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8197, + "hfopenllm_v2/BBH": 0.639, + "hfopenllm_v2/MATH Level 5": 0.5325, + "hfopenllm_v2/GPQA": 0.3314, + "hfopenllm_v2/MUSR": 0.4114, + "hfopenllm_v2/MMLU-PRO": 0.4924 + } + }, + { + "id": "v000000/Qwen2.5-Lumen-14B", + "name": "Qwen2.5-Lumen-14B", + "developer": "v000000", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8064, + "hfopenllm_v2/BBH": 0.6391, + "hfopenllm_v2/MATH Level 5": 0.5363, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4114, + "hfopenllm_v2/MMLU-PRO": 0.4903 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/vhab10.json b/data/developers/vhab10.json new file mode 100644 index 0000000000000000000000000000000000000000..f095bada4c5a5b84e0d492b92e33bdd7a68c123f --- /dev/null +++ b/data/developers/vhab10.json @@ -0,0 +1,47 @@ +{ + "developer": "vhab10", + "models": [ + { + "id": "vhab10/Llama-3.1-8B-Base-Instruct-SLERP", + "name": "Llama-3.1-8B-Base-Instruct-SLERP", + "developer": "vhab10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2907, + "hfopenllm_v2/BBH": 0.5057, + "hfopenllm_v2/MATH Level 5": 0.1201, + "hfopenllm_v2/GPQA": 0.2961, + "hfopenllm_v2/MUSR": 0.4011, + "hfopenllm_v2/MMLU-PRO": 0.3621 + } + }, + { + "id": "vhab10/Llama-3.2-Instruct-3B-TIES", + "name": "Llama-3.2-Instruct-3B-TIES", + "developer": "vhab10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4727, + "hfopenllm_v2/BBH": 0.4332, + "hfopenllm_v2/MATH Level 5": 0.0982, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3497, + "hfopenllm_v2/MMLU-PRO": 0.2916 + } + }, + { + "id": "vhab10/llama-3-8b-merged-linear", + "name": "llama-3-8b-merged-linear", + "developer": "vhab10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5917, + "hfopenllm_v2/BBH": 0.4937, + "hfopenllm_v2/MATH Level 5": 0.0816, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4191, + "hfopenllm_v2/MMLU-PRO": 0.3704 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/vicgalle.json b/data/developers/vicgalle.json new file mode 100644 index 0000000000000000000000000000000000000000..0c6f2105e3f597f030b6d21192c9b14c4770a980 --- /dev/null +++ b/data/developers/vicgalle.json @@ -0,0 +1,173 @@ +{ + "developer": "vicgalle", + "models": [ + { + "id": "vicgalle/CarbonBeagle-11B", + "name": "CarbonBeagle-11B", + "developer": "vicgalle", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5415, + "hfopenllm_v2/BBH": 0.5294, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.402, + "hfopenllm_v2/MMLU-PRO": 0.3276 + } + }, + { + "id": "vicgalle/CarbonBeagle-11B-truthy", + "name": "CarbonBeagle-11B-truthy", + "developer": "vicgalle", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5212, + "hfopenllm_v2/BBH": 0.5348, + "hfopenllm_v2/MATH Level 5": 0.0491, + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.374, + "hfopenllm_v2/MMLU-PRO": 0.3357 + } + }, + { + "id": "vicgalle/Configurable-Hermes-2-Pro-Llama-3-8B", + "name": "Configurable-Hermes-2-Pro-Llama-3-8B", + "developer": "vicgalle", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5763, + "hfopenllm_v2/BBH": 0.5055, + "hfopenllm_v2/MATH Level 5": 0.0763, + "hfopenllm_v2/GPQA": 0.297, + "hfopenllm_v2/MUSR": 0.4184, + "hfopenllm_v2/MMLU-PRO": 0.3098 + } + }, + { + "id": "vicgalle/Configurable-Llama-3.1-8B-Instruct", + "name": "Configurable-Llama-3.1-8B-Instruct", + "developer": "vicgalle", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8312, + "hfopenllm_v2/BBH": 0.5045, + "hfopenllm_v2/MATH Level 5": 0.173, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3845, + "hfopenllm_v2/MMLU-PRO": 0.3592 + } + }, + { + "id": "vicgalle/Configurable-Yi-1.5-9B-Chat", + "name": "Configurable-Yi-1.5-9B-Chat", + "developer": "vicgalle", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4323, + "hfopenllm_v2/BBH": 0.5452, + "hfopenllm_v2/MATH Level 5": 0.2047, + "hfopenllm_v2/GPQA": 0.3431, + "hfopenllm_v2/MUSR": 0.4271, + "hfopenllm_v2/MMLU-PRO": 0.4015 + } + }, + { + "id": "vicgalle/ConfigurableBeagle-11B", + "name": "ConfigurableBeagle-11B", + "developer": "vicgalle", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5834, + "hfopenllm_v2/BBH": 0.5287, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.3953, + "hfopenllm_v2/MMLU-PRO": 0.3374 + } + }, + { + "id": "vicgalle/ConfigurableHermes-7B", + "name": "ConfigurableHermes-7B", + "developer": "vicgalle", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5411, + "hfopenllm_v2/BBH": 0.4573, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2768, + "hfopenllm_v2/MUSR": 0.4057, + "hfopenllm_v2/MMLU-PRO": 0.3025 + } + }, + { + "id": "vicgalle/ConfigurableSOLAR-10.7B", + "name": "ConfigurableSOLAR-10.7B", + "developer": "vicgalle", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.51, + "hfopenllm_v2/BBH": 0.4867, + "hfopenllm_v2/MATH Level 5": 0.0665, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.3805, + "hfopenllm_v2/MMLU-PRO": 0.3173 + } + }, + { + "id": "vicgalle/Humanish-RP-Llama-3.1-8B", + "name": "Humanish-RP-Llama-3.1-8B", + "developer": "vicgalle", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6669, + "hfopenllm_v2/BBH": 0.51, + "hfopenllm_v2/MATH Level 5": 0.1518, + "hfopenllm_v2/GPQA": 0.2869, + "hfopenllm_v2/MUSR": 0.3952, + "hfopenllm_v2/MMLU-PRO": 0.3477 + } + }, + { + "id": "vicgalle/Merge-Mistral-Prometheus-7B", + "name": "Merge-Mistral-Prometheus-7B", + "developer": "vicgalle", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4848, + "hfopenllm_v2/BBH": 0.4201, + "hfopenllm_v2/MATH Level 5": 0.0181, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.41, + "hfopenllm_v2/MMLU-PRO": 0.2717 + } + }, + { + "id": "vicgalle/Merge-Mixtral-Prometheus-8x7B", + "name": "Merge-Mixtral-Prometheus-8x7B", + "developer": "vicgalle", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5744, + "hfopenllm_v2/BBH": 0.5351, + "hfopenllm_v2/MATH Level 5": 0.0929, + "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/MUSR": 0.4098, + "hfopenllm_v2/MMLU-PRO": 0.3684 + } + }, + { + "id": "vicgalle/Roleplay-Llama-3-8B", + "name": "Roleplay-Llama-3-8B", + "developer": "vicgalle", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.732, + "hfopenllm_v2/BBH": 0.5012, + "hfopenllm_v2/MATH Level 5": 0.0914, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3529, + "hfopenllm_v2/MMLU-PRO": 0.3708 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/viettelsecurity-ai.json b/data/developers/viettelsecurity-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..c998c562c115539194ae54bb33f2621840449fa1 --- /dev/null +++ b/data/developers/viettelsecurity-ai.json @@ -0,0 +1,19 @@ +{ + "developer": "viettelsecurity-ai", + "models": [ + { + "id": "viettelsecurity-ai/security-llama3.2-3b", + "name": "security-llama3.2-3b", + "developer": "viettelsecurity-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5909, + "hfopenllm_v2/BBH": 0.4401, + "hfopenllm_v2/MATH Level 5": 0.1261, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3379, + "hfopenllm_v2/MMLU-PRO": 0.2837 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/vihangd.json b/data/developers/vihangd.json new file mode 100644 index 0000000000000000000000000000000000000000..436df44fa5738837b462253bc3008a4ec60e02bb --- /dev/null +++ b/data/developers/vihangd.json @@ -0,0 +1,19 @@ +{ + "developer": "vihangd", + "models": [ + { + "id": "vihangd/smart-dan-sft-v0.1", + "name": "smart-dan-sft-v0.1", + "developer": "vihangd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1576, + "hfopenllm_v2/BBH": 0.3062, + "hfopenllm_v2/MATH Level 5": 0.0098, + "hfopenllm_v2/GPQA": 0.255, + "hfopenllm_v2/MUSR": 0.3502, + "hfopenllm_v2/MMLU-PRO": 0.1142 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/voidful.json b/data/developers/voidful.json new file mode 100644 index 0000000000000000000000000000000000000000..a6de5e1eae715c6d4a98bf6e858459c94a0e37a1 --- /dev/null +++ b/data/developers/voidful.json @@ -0,0 +1,19 @@ +{ + "developer": "voidful", + "models": [ + { + "id": "voidful/smol-360m-ft", + "name": "smol-360m-ft", + "developer": "voidful", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2013, + "hfopenllm_v2/BBH": 0.3012, + "hfopenllm_v2/MATH Level 5": 0.0083, + "hfopenllm_v2/GPQA": 0.2458, + "hfopenllm_v2/MUSR": 0.3714, + "hfopenllm_v2/MMLU-PRO": 0.1087 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/vonjack.json b/data/developers/vonjack.json new file mode 100644 index 0000000000000000000000000000000000000000..7ae9750c00c58a489b48463780a5edf88ec14ed0 --- /dev/null +++ b/data/developers/vonjack.json @@ -0,0 +1,103 @@ +{ + "developer": "vonjack", + "models": [ + { + "id": "vonjack/MobileLLM-125M-HF", + "name": "MobileLLM-125M-HF", + "developer": "vonjack", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2107, + "hfopenllm_v2/BBH": 0.3027, + "hfopenllm_v2/MATH Level 5": 0.0091, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3782, + "hfopenllm_v2/MMLU-PRO": 0.1164 + } + }, + { + "id": "vonjack/Phi-3-mini-4k-instruct-LLaMAfied", + "name": "Phi-3-mini-4k-instruct-LLaMAfied", + "developer": "vonjack", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5787, + "hfopenllm_v2/BBH": 0.5741, + "hfopenllm_v2/MATH Level 5": 0.1382, + "hfopenllm_v2/GPQA": 0.3305, + "hfopenllm_v2/MUSR": 0.3924, + "hfopenllm_v2/MMLU-PRO": 0.3885 + } + }, + { + "id": "vonjack/Phi-3.5-mini-instruct-hermes-fc-json", + "name": "Phi-3.5-mini-instruct-hermes-fc-json", + "developer": "vonjack", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1416, + "hfopenllm_v2/BBH": 0.2975, + "hfopenllm_v2/MATH Level 5": 0.0076, + "hfopenllm_v2/GPQA": 0.2542, + "hfopenllm_v2/MUSR": 0.4041, + "hfopenllm_v2/MMLU-PRO": 0.1139 + } + }, + { + "id": "vonjack/Qwen2.5-Coder-0.5B-Merged", + "name": "Qwen2.5-Coder-0.5B-Merged", + "developer": "vonjack", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.31, + "hfopenllm_v2/BBH": 0.3076, + "hfopenllm_v2/MATH Level 5": 0.0378, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3303, + "hfopenllm_v2/MMLU-PRO": 0.1202 + } + }, + { + "id": "vonjack/SmolLM2-1.7B-Merged", + "name": "SmolLM2-1.7B-Merged", + "developer": "vonjack", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3698, + "hfopenllm_v2/BBH": 0.3587, + "hfopenllm_v2/MATH Level 5": 0.0627, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3408, + "hfopenllm_v2/MMLU-PRO": 0.2048 + } + }, + { + "id": "vonjack/SmolLM2-135M-Merged", + "name": "SmolLM2-135M-Merged", + "developer": "vonjack", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2483, + "hfopenllm_v2/BBH": 0.31, + "hfopenllm_v2/MATH Level 5": 0.0113, + "hfopenllm_v2/GPQA": 0.2383, + "hfopenllm_v2/MUSR": 0.3662, + "hfopenllm_v2/MMLU-PRO": 0.1112 + } + }, + { + "id": "vonjack/SmolLM2-360M-Merged", + "name": "SmolLM2-360M-Merged", + "developer": "vonjack", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3206, + "hfopenllm_v2/BBH": 0.3155, + "hfopenllm_v2/MATH Level 5": 0.0174, + "hfopenllm_v2/GPQA": 0.2559, + "hfopenllm_v2/MUSR": 0.3527, + "hfopenllm_v2/MMLU-PRO": 0.1098 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/w4r10ck.json b/data/developers/w4r10ck.json new file mode 100644 index 0000000000000000000000000000000000000000..af6efbb76c0ab7ca146aee9c95291c7f48412e41 --- /dev/null +++ b/data/developers/w4r10ck.json @@ -0,0 +1,19 @@ +{ + "developer": "w4r10ck", + "models": [ + { + "id": "w4r10ck/SOLAR-10.7B-Instruct-v1.0-uncensored", + "name": "SOLAR-10.7B-Instruct-v1.0-uncensored", + "developer": "w4r10ck", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3884, + "hfopenllm_v2/BBH": 0.5302, + "hfopenllm_v2/MATH Level 5": 0.0657, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.4639, + "hfopenllm_v2/MMLU-PRO": 0.3344 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/wanlige.json b/data/developers/wanlige.json new file mode 100644 index 0000000000000000000000000000000000000000..463f87ec60baeec83beadbb3c8f277c9fcc94551 --- /dev/null +++ b/data/developers/wanlige.json @@ -0,0 +1,47 @@ +{ + "developer": "wanlige", + "models": [ + { + "id": "wanlige/li-14b-v0.4", + "name": "li-14b-v0.4", + "developer": "wanlige", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8133, + "hfopenllm_v2/BBH": 0.6544, + "hfopenllm_v2/MATH Level 5": 0.5574, + "hfopenllm_v2/GPQA": 0.3389, + "hfopenllm_v2/MUSR": 0.446, + "hfopenllm_v2/MMLU-PRO": 0.5167 + } + }, + { + "id": "wanlige/li-14b-v0.4-slerp", + "name": "li-14b-v0.4-slerp", + "developer": "wanlige", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4606, + "hfopenllm_v2/BBH": 0.6587, + "hfopenllm_v2/MATH Level 5": 0.4192, + "hfopenllm_v2/GPQA": 0.4002, + "hfopenllm_v2/MUSR": 0.4768, + "hfopenllm_v2/MMLU-PRO": 0.5372 + } + }, + { + "id": "wanlige/li-14b-v0.4-slerp0.1", + "name": "li-14b-v0.4-slerp0.1", + "developer": "wanlige", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7923, + "hfopenllm_v2/BBH": 0.6572, + "hfopenllm_v2/MATH Level 5": 0.5332, + "hfopenllm_v2/GPQA": 0.3591, + "hfopenllm_v2/MUSR": 0.4207, + "hfopenllm_v2/MMLU-PRO": 0.5294 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/wannaphong.json b/data/developers/wannaphong.json new file mode 100644 index 0000000000000000000000000000000000000000..50f8660f474e3b402487a715f4b5797e1c58accf --- /dev/null +++ b/data/developers/wannaphong.json @@ -0,0 +1,19 @@ +{ + "developer": "wannaphong", + "models": [ + { + "id": "wannaphong/KhanomTanLLM-Instruct", + "name": "KhanomTanLLM-Instruct", + "developer": "wannaphong", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1621, + "hfopenllm_v2/BBH": 0.3093, + "hfopenllm_v2/MATH Level 5": 0.0136, + "hfopenllm_v2/GPQA": 0.2634, + "hfopenllm_v2/MUSR": 0.3701, + "hfopenllm_v2/MMLU-PRO": 0.1119 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/waqasali1707.json b/data/developers/waqasali1707.json new file mode 100644 index 0000000000000000000000000000000000000000..57f8257bc9490ec8162ed63344b5bf450368a234 --- /dev/null +++ b/data/developers/waqasali1707.json @@ -0,0 +1,19 @@ +{ + "developer": "waqasali1707", + "models": [ + { + "id": "waqasali1707/Beast-Soul-new", + "name": "Beast-Soul-new", + "developer": "waqasali1707", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.503, + "hfopenllm_v2/BBH": 0.5225, + "hfopenllm_v2/MATH Level 5": 0.0702, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4486, + "hfopenllm_v2/MMLU-PRO": 0.3108 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/wave-on-discord.json b/data/developers/wave-on-discord.json new file mode 100644 index 0000000000000000000000000000000000000000..40e374e65eb42588d697b5d8299aa8e4008bd0f0 --- /dev/null +++ b/data/developers/wave-on-discord.json @@ -0,0 +1,19 @@ +{ + "developer": "wave-on-discord", + "models": [ + { + "id": "wave-on-discord/qwent-7b", + "name": "qwent-7b", + "developer": "wave-on-discord", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2015, + "hfopenllm_v2/BBH": 0.4228, + "hfopenllm_v2/MATH Level 5": 0.0038, + "hfopenllm_v2/GPQA": 0.2651, + "hfopenllm_v2/MUSR": 0.3817, + "hfopenllm_v2/MMLU-PRO": 0.1603 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/weathermanj.json b/data/developers/weathermanj.json new file mode 100644 index 0000000000000000000000000000000000000000..f7cec2ba4e2118205feba545342ba05a9e285a1b --- /dev/null +++ b/data/developers/weathermanj.json @@ -0,0 +1,61 @@ +{ + "developer": "weathermanj", + "models": [ + { + "id": "weathermanj/Menda-3B-500", + "name": "Menda-3B-500", + "developer": "weathermanj", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6353, + "hfopenllm_v2/BBH": 0.4766, + "hfopenllm_v2/MATH Level 5": 0.3724, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3968, + "hfopenllm_v2/MMLU-PRO": 0.3475 + } + }, + { + "id": "weathermanj/Menda-3b-750", + "name": "Menda-3b-750", + "developer": "weathermanj", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6335, + "hfopenllm_v2/BBH": 0.4737, + "hfopenllm_v2/MATH Level 5": 0.3716, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3942, + "hfopenllm_v2/MMLU-PRO": 0.3506 + } + }, + { + "id": "weathermanj/Menda-3b-Optim-100", + "name": "Menda-3b-Optim-100", + "developer": "weathermanj", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6398, + "hfopenllm_v2/BBH": 0.4735, + "hfopenllm_v2/MATH Level 5": 0.3716, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3993, + "hfopenllm_v2/MMLU-PRO": 0.3461 + } + }, + { + "id": "weathermanj/Menda-3b-Optim-200", + "name": "Menda-3b-Optim-200", + "developer": "weathermanj", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6375, + "hfopenllm_v2/BBH": 0.4746, + "hfopenllm_v2/MATH Level 5": 0.3731, + "hfopenllm_v2/GPQA": 0.2827, + "hfopenllm_v2/MUSR": 0.4033, + "hfopenllm_v2/MMLU-PRO": 0.3484 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/wenbopan.json b/data/developers/wenbopan.json new file mode 100644 index 0000000000000000000000000000000000000000..baa05d5ec0959b065992ea52981ce1b0efee773b --- /dev/null +++ b/data/developers/wenbopan.json @@ -0,0 +1,19 @@ +{ + "developer": "wenbopan", + "models": [ + { + "id": "wenbopan/Faro-Yi-9B-DPO", + "name": "wenbopan/Faro-Yi-9B-DPO", + "developer": "wenbopan", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6461, + "reward-bench/Chat": 0.9218, + "reward-bench/Chat Hard": 0.5307, + "reward-bench/Safety": 0.5514, + "reward-bench/Reasoning": 0.5839, + "reward-bench/Prior Sets (0.5 weight)": 0.6395 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/weqweasdas.json b/data/developers/weqweasdas.json new file mode 100644 index 0000000000000000000000000000000000000000..93825681001579e2a4406f81036de228a63594a2 --- /dev/null +++ b/data/developers/weqweasdas.json @@ -0,0 +1,95 @@ +{ + "developer": "weqweasdas", + "models": [ + { + "id": "weqweasdas/RM-Gemma-2B", + "name": "weqweasdas/RM-Gemma-2B", + "developer": "weqweasdas", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.3057, + "reward-bench/Chat": 0.9441, + "reward-bench/Chat Hard": 0.4079, + "reward-bench/Safety": 0.3311, + "reward-bench/Reasoning": 0.7637, + "reward-bench/Prior Sets (0.5 weight)": 0.6652, + "reward-bench/Factuality": 0.3705, + "reward-bench/Precise IF": 0.2812, + "reward-bench/Math": 0.4317, + "reward-bench/Focus": 0.2343, + "reward-bench/Ties": 0.1851 + } + }, + { + "id": "weqweasdas/RM-Gemma-7B", + "name": "weqweasdas/RM-Gemma-7B", + "developer": "weqweasdas", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6967, + "reward-bench/Factuality": 0.4926, + "reward-bench/Precise IF": 0.3937, + "reward-bench/Math": 0.6066, + "reward-bench/Safety": 0.5784, + "reward-bench/Focus": 0.497, + "reward-bench/Ties": 0.4232, + "reward-bench/Chat": 0.9693, + "reward-bench/Chat Hard": 0.4978, + "reward-bench/Reasoning": 0.7362, + "reward-bench/Prior Sets (0.5 weight)": 0.7069 + } + }, + { + "id": "weqweasdas/RM-Gemma-7B-4096", + "name": "weqweasdas/RM-Gemma-7B-4096", + "developer": "weqweasdas", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.6922, + "reward-bench/Chat": 0.9497, + "reward-bench/Chat Hard": 0.5022, + "reward-bench/Safety": 0.5608, + "reward-bench/Reasoning": 0.7511, + "reward-bench/Prior Sets (0.5 weight)": 0.7024 + } + }, + { + "id": "weqweasdas/RM-Mistral-7B", + "name": "weqweasdas/RM-Mistral-7B", + "developer": "weqweasdas", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.7982, + "reward-bench/Factuality": 0.5937, + "reward-bench/Precise IF": 0.3438, + "reward-bench/Math": 0.5956, + "reward-bench/Safety": 0.8703, + "reward-bench/Focus": 0.7293, + "reward-bench/Ties": 0.6226, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.6053, + "reward-bench/Reasoning": 0.7736, + "reward-bench/Prior Sets (0.5 weight)": 0.753 + } + }, + { + "id": "weqweasdas/hh_rlhf_rm_open_llama_3b", + "name": "weqweasdas/hh_rlhf_rm_open_llama_3b", + "developer": "weqweasdas", + "evaluator_relationship": null, + "benchmark_scores": { + "reward-bench/Score": 0.5027, + "reward-bench/Factuality": 0.3642, + "reward-bench/Precise IF": 0.275, + "reward-bench/Math": 0.3497, + "reward-bench/Safety": 0.4149, + "reward-bench/Focus": 0.2384, + "reward-bench/Ties": 0.0315, + "reward-bench/Chat": 0.8184, + "reward-bench/Chat Hard": 0.3728, + "reward-bench/Reasoning": 0.3281, + "reward-bench/Prior Sets (0.5 weight)": 0.6564 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/win10.json b/data/developers/win10.json new file mode 100644 index 0000000000000000000000000000000000000000..335cf7902906d8cc35d7707b4280f0550729bd2b --- /dev/null +++ b/data/developers/win10.json @@ -0,0 +1,131 @@ +{ + "developer": "win10", + "models": [ + { + "id": "win10/ArliAI-RPMax-v1.3-merge-13.3B", + "name": "ArliAI-RPMax-v1.3-merge-13.3B", + "developer": "win10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3038, + "hfopenllm_v2/BBH": 0.4581, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.4325, + "hfopenllm_v2/MMLU-PRO": 0.32 + } + }, + { + "id": "win10/Breeze-13B-32k-Instruct-v1_0", + "name": "Breeze-13B-32k-Instruct-v1_0", + "developer": "win10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3584, + "hfopenllm_v2/BBH": 0.4611, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2643, + "hfopenllm_v2/MUSR": 0.4202, + "hfopenllm_v2/MMLU-PRO": 0.2568 + } + }, + { + "id": "win10/EVA-Norns-Qwen2.5-v0.1", + "name": "EVA-Norns-Qwen2.5-v0.1", + "developer": "win10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.622, + "hfopenllm_v2/BBH": 0.5072, + "hfopenllm_v2/MATH Level 5": 0.2613, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.4045, + "hfopenllm_v2/MMLU-PRO": 0.3425 + } + }, + { + "id": "win10/Llama-3.2-3B-Instruct-24-9-29", + "name": "Llama-3.2-3B-Instruct-24-9-29", + "developer": "win10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7332, + "hfopenllm_v2/BBH": 0.4614, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3555, + "hfopenllm_v2/MMLU-PRO": 0.3228 + } + }, + { + "id": "win10/Norns-Qwen2.5-12B", + "name": "Norns-Qwen2.5-12B", + "developer": "win10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4897, + "hfopenllm_v2/BBH": 0.4619, + "hfopenllm_v2/MATH Level 5": 0.0838, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3555, + "hfopenllm_v2/MMLU-PRO": 0.266 + } + }, + { + "id": "win10/Norns-Qwen2.5-7B", + "name": "Norns-Qwen2.5-7B", + "developer": "win10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6122, + "hfopenllm_v2/BBH": 0.5073, + "hfopenllm_v2/MATH Level 5": 0.2628, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4085, + "hfopenllm_v2/MMLU-PRO": 0.3413 + } + }, + { + "id": "win10/Qwen2.5-2B-Instruct", + "name": "Qwen2.5-2B-Instruct", + "developer": "win10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2273, + "hfopenllm_v2/BBH": 0.3706, + "hfopenllm_v2/MATH Level 5": 0.0227, + "hfopenllm_v2/GPQA": 0.2676, + "hfopenllm_v2/MUSR": 0.4378, + "hfopenllm_v2/MMLU-PRO": 0.1934 + } + }, + { + "id": "win10/llama3-13.45b-Instruct", + "name": "llama3-13.45b-Instruct", + "developer": "win10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4144, + "hfopenllm_v2/BBH": 0.4865, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3848, + "hfopenllm_v2/MMLU-PRO": 0.3345 + } + }, + { + "id": "win10/miscii-14b-1M-0128", + "name": "miscii-14b-1M-0128", + "developer": "win10", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4181, + "hfopenllm_v2/BBH": 0.5742, + "hfopenllm_v2/MATH Level 5": 0.4773, + "hfopenllm_v2/GPQA": 0.3826, + "hfopenllm_v2/MUSR": 0.5431, + "hfopenllm_v2/MMLU-PRO": 0.4491 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/winglian.json b/data/developers/winglian.json new file mode 100644 index 0000000000000000000000000000000000000000..db16fdc9fecf90f748ada6e8e9fc7986ca2b894e --- /dev/null +++ b/data/developers/winglian.json @@ -0,0 +1,33 @@ +{ + "developer": "winglian", + "models": [ + { + "id": "winglian/Llama-3-8b-64k-PoSE", + "name": "Llama-3-8b-64k-PoSE", + "developer": "winglian", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2857, + "hfopenllm_v2/BBH": 0.3702, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3396, + "hfopenllm_v2/MMLU-PRO": 0.2467 + } + }, + { + "id": "winglian/llama-3-8b-256k-PoSE", + "name": "llama-3-8b-256k-PoSE", + "developer": "winglian", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2909, + "hfopenllm_v2/BBH": 0.3157, + "hfopenllm_v2/MATH Level 5": 0.0196, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3316, + "hfopenllm_v2/MMLU-PRO": 0.1116 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/writer.json b/data/developers/writer.json new file mode 100644 index 0000000000000000000000000000000000000000..dc48bcf4079929a631381e5f48ab6834a52fe15a --- /dev/null +++ b/data/developers/writer.json @@ -0,0 +1,202 @@ +{ + "developer": "writer", + "models": [ + { + "id": "writer/InstructPalmyra-30B", + "name": "InstructPalmyra 30B", + "developer": "writer", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.568, + "helm_classic/MMLU": 0.403, + "helm_classic/BoolQ": 0.751, + "helm_classic/NarrativeQA": 0.496, + "helm_classic/NaturalQuestions (open-book)": 0.682, + "helm_classic/QuAC": 0.433, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.185, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": 0.152, + "helm_classic/XSUM": 0.104, + "helm_classic/IMDB": 0.94, + "helm_classic/CivilComments": 0.555, + "helm_classic/RAFT": 0.652 + } + }, + { + "id": "writer/palmyra-fin", + "name": "Palmyra Fin", + "developer": "writer", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.577, + "helm_capabilities/MMLU-Pro": 0.591, + "helm_capabilities/GPQA": 0.422, + "helm_capabilities/IFEval": 0.793, + "helm_capabilities/WildBench": 0.783, + "helm_capabilities/Omni-MATH": 0.295 + } + }, + { + "id": "writer/palmyra-med", + "name": "Palmyra Med", + "developer": "writer", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.476, + "helm_capabilities/MMLU-Pro": 0.411, + "helm_capabilities/GPQA": 0.368, + "helm_capabilities/IFEval": 0.767, + "helm_capabilities/WildBench": 0.676, + "helm_capabilities/Omni-MATH": 0.156 + } + }, + { + "id": "writer/palmyra-x-004", + "name": "Palmyra-X-004", + "developer": "writer", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.609, + "helm_capabilities/MMLU-Pro": 0.657, + "helm_capabilities/GPQA": 0.395, + "helm_capabilities/IFEval": 0.872, + "helm_capabilities/WildBench": 0.802, + "helm_capabilities/Omni-MATH": 0.32, + "helm_lite/Mean win rate": 0.808, + "helm_lite/NarrativeQA": 0.773, + "helm_lite/NaturalQuestions (closed-book)": 0.457, + "helm_lite/OpenbookQA": 0.926, + "helm_lite/MMLU": 0.739, + "helm_lite/MATH": 0.767, + "helm_lite/GSM8K": 0.905, + "helm_lite/LegalBench": 0.73, + "helm_lite/MedQA": 0.775, + "helm_lite/WMT 2014": 0.203, + "helm_mmlu/MMLU All Subjects": 0.813, + "helm_mmlu/Abstract Algebra": 0.75, + "helm_mmlu/Anatomy": 0.822, + "helm_mmlu/College Physics": 0.647, + "helm_mmlu/Computer Security": 0.82, + "helm_mmlu/Econometrics": 0.684, + "helm_mmlu/Global Facts": 0.62, + "helm_mmlu/Jurisprudence": 0.843, + "helm_mmlu/Philosophy": 0.83, + "helm_mmlu/Professional Psychology": 0.845, + "helm_mmlu/Us Foreign Policy": 0.92, + "helm_mmlu/Astronomy": 0.928, + "helm_mmlu/Business Ethics": 0.76, + "helm_mmlu/Clinical Knowledge": 0.879, + "helm_mmlu/Conceptual Physics": 0.885, + "helm_mmlu/Electrical Engineering": 0.793, + "helm_mmlu/Elementary Mathematics": 0.841, + "helm_mmlu/Formal Logic": 0.579, + "helm_mmlu/High School World History": 0.911, + "helm_mmlu/Human Sexuality": 0.924, + "helm_mmlu/International Law": 0.901, + "helm_mmlu/Logical Fallacies": 0.877, + "helm_mmlu/Machine Learning": 0.679, + "helm_mmlu/Management": 0.903, + "helm_mmlu/Marketing": 0.932, + "helm_mmlu/Medical Genetics": 0.87, + "helm_mmlu/Miscellaneous": 0.934, + "helm_mmlu/Moral Scenarios": 0.825, + "helm_mmlu/Nutrition": 0.869, + "helm_mmlu/Prehistory": 0.917, + "helm_mmlu/Public Relations": 0.791, + "helm_mmlu/Security Studies": 0.849, + "helm_mmlu/Sociology": 0.915, + "helm_mmlu/Virology": 0.584, + "helm_mmlu/World Religions": 0.842, + "helm_mmlu/Mean win rate": 0.629 + } + }, + { + "id": "writer/palmyra-x-v2", + "name": "Palmyra X V2 33B", + "developer": "writer", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.589, + "helm_lite/NarrativeQA": 0.752, + "helm_lite/NaturalQuestions (closed-book)": 0.428, + "helm_lite/OpenbookQA": 0.878, + "helm_lite/MMLU": 0.621, + "helm_lite/MATH": 0.58, + "helm_lite/GSM8K": 0.735, + "helm_lite/LegalBench": 0.644, + "helm_lite/MedQA": 0.598, + "helm_lite/WMT 2014": 0.239 + } + }, + { + "id": "writer/palmyra-x-v3", + "name": "Palmyra X V3 72B", + "developer": "writer", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_lite/Mean win rate": 0.679, + "helm_lite/NarrativeQA": 0.706, + "helm_lite/NaturalQuestions (closed-book)": 0.407, + "helm_lite/OpenbookQA": 0.938, + "helm_lite/MMLU": 0.702, + "helm_lite/MATH": 0.723, + "helm_lite/GSM8K": 0.831, + "helm_lite/LegalBench": 0.709, + "helm_lite/MedQA": 0.684, + "helm_lite/WMT 2014": 0.262, + "helm_mmlu/MMLU All Subjects": 0.786, + "helm_mmlu/Abstract Algebra": 0.53, + "helm_mmlu/Anatomy": 0.733, + "helm_mmlu/College Physics": 0.549, + "helm_mmlu/Computer Security": 0.78, + "helm_mmlu/Econometrics": 0.649, + "helm_mmlu/Global Facts": 0.53, + "helm_mmlu/Jurisprudence": 0.88, + "helm_mmlu/Philosophy": 0.836, + "helm_mmlu/Professional Psychology": 0.858, + "helm_mmlu/Us Foreign Policy": 0.96, + "helm_mmlu/Astronomy": 0.862, + "helm_mmlu/Business Ethics": 0.83, + "helm_mmlu/Clinical Knowledge": 0.804, + "helm_mmlu/Conceptual Physics": 0.809, + "helm_mmlu/Electrical Engineering": 0.772, + "helm_mmlu/Elementary Mathematics": 0.661, + "helm_mmlu/Formal Logic": 0.659, + "helm_mmlu/High School World History": 0.911, + "helm_mmlu/Human Sexuality": 0.924, + "helm_mmlu/International Law": 0.909, + "helm_mmlu/Logical Fallacies": 0.877, + "helm_mmlu/Machine Learning": 0.625, + "helm_mmlu/Management": 0.903, + "helm_mmlu/Marketing": 0.94, + "helm_mmlu/Medical Genetics": 0.83, + "helm_mmlu/Miscellaneous": 0.894, + "helm_mmlu/Moral Scenarios": 0.562, + "helm_mmlu/Nutrition": 0.856, + "helm_mmlu/Prehistory": 0.87, + "helm_mmlu/Public Relations": 0.773, + "helm_mmlu/Security Studies": 0.833, + "helm_mmlu/Sociology": 0.91, + "helm_mmlu/Virology": 0.572, + "helm_mmlu/World Religions": 0.877, + "helm_mmlu/Mean win rate": 0.325 + } + }, + { + "id": "writer/palmyra-x5", + "name": "Palmyra X5", + "developer": "writer", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.696, + "helm_capabilities/MMLU-Pro": 0.804, + "helm_capabilities/GPQA": 0.661, + "helm_capabilities/IFEval": 0.823, + "helm_capabilities/WildBench": 0.78, + "helm_capabilities/Omni-MATH": 0.414 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/wzhouad.json b/data/developers/wzhouad.json new file mode 100644 index 0000000000000000000000000000000000000000..453797409e63aafdb8b07eeb1ddaefaa100cd036 --- /dev/null +++ b/data/developers/wzhouad.json @@ -0,0 +1,19 @@ +{ + "developer": "wzhouad", + "models": [ + { + "id": "wzhouad/gemma-2-9b-it-WPO-HB", + "name": "gemma-2-9b-it-WPO-HB", + "developer": "wzhouad", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5437, + "hfopenllm_v2/BBH": 0.5629, + "hfopenllm_v2/MATH Level 5": 0.1533, + "hfopenllm_v2/GPQA": 0.3498, + "hfopenllm_v2/MUSR": 0.3675, + "hfopenllm_v2/MMLU-PRO": 0.336 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/x0000001.json b/data/developers/x0000001.json new file mode 100644 index 0000000000000000000000000000000000000000..042a41cd2a9dd6984ca2dddab28f1b0c182af361 --- /dev/null +++ b/data/developers/x0000001.json @@ -0,0 +1,19 @@ +{ + "developer": "x0000001", + "models": [ + { + "id": "x0000001/Deepseek-Lumen-R1-Qwen2.5-14B", + "name": "Deepseek-Lumen-R1-Qwen2.5-14B", + "developer": "x0000001", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4436, + "hfopenllm_v2/BBH": 0.4569, + "hfopenllm_v2/MATH Level 5": 0.2779, + "hfopenllm_v2/GPQA": 0.2852, + "hfopenllm_v2/MUSR": 0.474, + "hfopenllm_v2/MMLU-PRO": 0.4379 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/xAI.json b/data/developers/xAI.json new file mode 100644 index 0000000000000000000000000000000000000000..a178d74d4ce2ff741ee7cfef87dacd26423b9076 --- /dev/null +++ b/data/developers/xAI.json @@ -0,0 +1,23 @@ +{ + "developer": "xAI", + "models": [ + { + "id": "xai/grok-4", + "name": "Grok 4", + "developer": "xAI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 23.1 + } + }, + { + "id": "xai/grok-code-fast-1", + "name": "Grok Code Fast 1", + "developer": "xAI", + "evaluator_relationship": null, + "benchmark_scores": { + "terminal-bench-2.0/terminal-bench-2.0": 14.2 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/xMaulana.json b/data/developers/xMaulana.json new file mode 100644 index 0000000000000000000000000000000000000000..b4d6b577aa49240ab6bcf77c3897b8d702f000e1 --- /dev/null +++ b/data/developers/xMaulana.json @@ -0,0 +1,19 @@ +{ + "developer": "xMaulana", + "models": [ + { + "id": "xMaulana/FinMatcha-3B-Instruct", + "name": "FinMatcha-3B-Instruct", + "developer": "xMaulana", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7548, + "hfopenllm_v2/BBH": 0.4536, + "hfopenllm_v2/MATH Level 5": 0.1435, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3633, + "hfopenllm_v2/MMLU-PRO": 0.3182 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/xai.json b/data/developers/xai.json new file mode 100644 index 0000000000000000000000000000000000000000..9373b5609f40cbf6169d56916e046772b21e5127 --- /dev/null +++ b/data/developers/xai.json @@ -0,0 +1,109 @@ +{ + "developer": "xai", + "models": [ + { + "id": "xai/Grok 4", + "name": "Grok 4", + "developer": "xai", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Overall Pass@1": 0.152, + "apex-agents/Overall Pass@8": 0.329, + "apex-agents/Overall Mean Score": 0.303, + "apex-agents/Investment Banking Pass@1": 0.17, + "apex-agents/Management Consulting Pass@1": 0.12, + "apex-agents/Corporate Law Pass@1": 0.165, + "apex-agents/Corporate Lawyer Mean Score": 0.41, + "apex-v1/Overall Score": 0.635 + } + }, + { + "id": "xai/grok-3-beta", + "name": "Grok 3 Beta", + "developer": "xai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.727, + "helm_capabilities/MMLU-Pro": 0.788, + "helm_capabilities/GPQA": 0.65, + "helm_capabilities/IFEval": 0.884, + "helm_capabilities/WildBench": 0.849, + "helm_capabilities/Omni-MATH": 0.464 + } + }, + { + "id": "xai/grok-3-mini", + "name": "grok-3-mini", + "developer": "xai", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.673, + "global-mmlu-lite/Culturally Sensitive": 0.6717, + "global-mmlu-lite/Culturally Agnostic": 0.6743, + "global-mmlu-lite/Arabic": 0.755, + "global-mmlu-lite/English": 0.5075, + "global-mmlu-lite/Bengali": 0.7355, + "global-mmlu-lite/German": 0.6591, + "global-mmlu-lite/French": 0.485, + "global-mmlu-lite/Hindi": 0.56, + "global-mmlu-lite/Indonesian": 0.725, + "global-mmlu-lite/Italian": 0.696, + "global-mmlu-lite/Japanese": 0.6575, + "global-mmlu-lite/Korean": 0.7325, + "global-mmlu-lite/Portuguese": 0.6275, + "global-mmlu-lite/Spanish": 0.61, + "global-mmlu-lite/Swahili": 0.7625, + "global-mmlu-lite/Yoruba": 0.8296, + "global-mmlu-lite/Chinese": 0.5564, + "global-mmlu-lite/Burmese": 0.8693 + } + }, + { + "id": "xai/grok-3-mini-beta", + "name": "Grok 3 mini Beta", + "developer": "xai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.679, + "helm_capabilities/MMLU-Pro": 0.799, + "helm_capabilities/GPQA": 0.675, + "helm_capabilities/IFEval": 0.951, + "helm_capabilities/WildBench": 0.651, + "helm_capabilities/Omni-MATH": 0.318 + } + }, + { + "id": "xai/grok-4-0709", + "name": "grok-4-0709", + "developer": "xai", + "evaluator_relationship": null, + "benchmark_scores": { + "global-mmlu-lite/Global MMLU Lite": 0.8881, + "global-mmlu-lite/Culturally Sensitive": 0.8862, + "global-mmlu-lite/Culturally Agnostic": 0.89, + "global-mmlu-lite/Arabic": 0.885, + "global-mmlu-lite/English": 0.905, + "global-mmlu-lite/Bengali": 0.8925, + "global-mmlu-lite/German": 0.8725, + "global-mmlu-lite/French": 0.875, + "global-mmlu-lite/Hindi": 0.8675, + "global-mmlu-lite/Indonesian": 0.89, + "global-mmlu-lite/Italian": 0.9025, + "global-mmlu-lite/Japanese": 0.87, + "global-mmlu-lite/Korean": 0.895, + "global-mmlu-lite/Portuguese": 0.8725, + "global-mmlu-lite/Spanish": 0.9075, + "global-mmlu-lite/Swahili": 0.91, + "global-mmlu-lite/Yoruba": 0.905, + "global-mmlu-lite/Chinese": 0.8525, + "global-mmlu-lite/Burmese": 0.9075, + "helm_capabilities/Mean score": 0.785, + "helm_capabilities/MMLU-Pro": 0.851, + "helm_capabilities/GPQA": 0.726, + "helm_capabilities/IFEval": 0.949, + "helm_capabilities/WildBench": 0.797, + "helm_capabilities/Omni-MATH": 0.603 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/xinchen9.json b/data/developers/xinchen9.json new file mode 100644 index 0000000000000000000000000000000000000000..7c861896befb6eb332fa8f8e14a51c229d4b6cb0 --- /dev/null +++ b/data/developers/xinchen9.json @@ -0,0 +1,75 @@ +{ + "developer": "xinchen9", + "models": [ + { + "id": "xinchen9/Llama3.1_8B_Instruct_CoT", + "name": "Llama3.1_8B_Instruct_CoT", + "developer": "xinchen9", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2974, + "hfopenllm_v2/BBH": 0.4398, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4371, + "hfopenllm_v2/MMLU-PRO": 0.2879 + } + }, + { + "id": "xinchen9/Llama3.1_CoT", + "name": "Llama3.1_CoT", + "developer": "xinchen9", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2246, + "hfopenllm_v2/BBH": 0.4341, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.4305, + "hfopenllm_v2/MMLU-PRO": 0.2739 + } + }, + { + "id": "xinchen9/Llama3.1_CoT_V1", + "name": "Llama3.1_CoT_V1", + "developer": "xinchen9", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2453, + "hfopenllm_v2/BBH": 0.4376, + "hfopenllm_v2/MATH Level 5": 0.0332, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.4572, + "hfopenllm_v2/MMLU-PRO": 0.2805 + } + }, + { + "id": "xinchen9/Mistral-7B-CoT", + "name": "Mistral-7B-CoT", + "developer": "xinchen9", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2783, + "hfopenllm_v2/BBH": 0.3873, + "hfopenllm_v2/MATH Level 5": 0.0249, + "hfopenllm_v2/GPQA": 0.2492, + "hfopenllm_v2/MUSR": 0.3994, + "hfopenllm_v2/MMLU-PRO": 0.2284 + } + }, + { + "id": "xinchen9/llama3-b8-ft-dis", + "name": "llama3-b8-ft-dis", + "developer": "xinchen9", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1546, + "hfopenllm_v2/BBH": 0.4626, + "hfopenllm_v2/MATH Level 5": 0.0393, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.3654, + "hfopenllm_v2/MMLU-PRO": 0.3244 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/xkp24.json b/data/developers/xkp24.json new file mode 100644 index 0000000000000000000000000000000000000000..4083497ab0a287c873007f5bd866a54a7499662d --- /dev/null +++ b/data/developers/xkp24.json @@ -0,0 +1,117 @@ +{ + "developer": "xkp24", + "models": [ + { + "id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table", + "developer": "xkp24", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6375, + "hfopenllm_v2/BBH": 0.4912, + "hfopenllm_v2/MATH Level 5": 0.0921, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.382, + "hfopenllm_v2/MMLU-PRO": 0.3686 + } + }, + { + "id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table", + "developer": "xkp24", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7275, + "hfopenllm_v2/BBH": 0.5057, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3819, + "hfopenllm_v2/MMLU-PRO": 0.3697 + } + }, + { + "id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table", + "developer": "xkp24", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6569, + "hfopenllm_v2/BBH": 0.4952, + "hfopenllm_v2/MATH Level 5": 0.0891, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3594, + "hfopenllm_v2/MMLU-PRO": 0.3702 + } + }, + { + "id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table", + "developer": "xkp24", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6621, + "hfopenllm_v2/BBH": 0.5004, + "hfopenllm_v2/MATH Level 5": 0.0861, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3805, + "hfopenllm_v2/MMLU-PRO": 0.36 + } + }, + { + "id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001", + "developer": "xkp24", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6042, + "hfopenllm_v2/BBH": 0.4936, + "hfopenllm_v2/MATH Level 5": 0.0997, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3793, + "hfopenllm_v2/MMLU-PRO": 0.3708 + } + }, + { + "id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002", + "developer": "xkp24", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7132, + "hfopenllm_v2/BBH": 0.4996, + "hfopenllm_v2/MATH Level 5": 0.0853, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3872, + "hfopenllm_v2/MMLU-PRO": 0.3664 + } + }, + { + "id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001", + "developer": "xkp24", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5947, + "hfopenllm_v2/BBH": 0.4899, + "hfopenllm_v2/MATH Level 5": 0.1073, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3581, + "hfopenllm_v2/MMLU-PRO": 0.3704 + } + }, + { + "id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002", + "developer": "xkp24", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6453, + "hfopenllm_v2/BBH": 0.4951, + "hfopenllm_v2/MATH Level 5": 0.0937, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3939, + "hfopenllm_v2/MMLU-PRO": 0.353 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/xukp20.json b/data/developers/xukp20.json new file mode 100644 index 0000000000000000000000000000000000000000..ec4d4e13a4615567769db5124d18519260083f4a --- /dev/null +++ b/data/developers/xukp20.json @@ -0,0 +1,117 @@ +{ + "developer": "xukp20", + "models": [ + { + "id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table", + "developer": "xukp20", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5756, + "hfopenllm_v2/BBH": 0.4901, + "hfopenllm_v2/MATH Level 5": 0.0997, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.366, + "hfopenllm_v2/MMLU-PRO": 0.3659 + } + }, + { + "id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table", + "developer": "xukp20", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7034, + "hfopenllm_v2/BBH": 0.5092, + "hfopenllm_v2/MATH Level 5": 0.0967, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3739, + "hfopenllm_v2/MMLU-PRO": 0.3693 + } + }, + { + "id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table", + "developer": "xukp20", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6024, + "hfopenllm_v2/BBH": 0.497, + "hfopenllm_v2/MATH Level 5": 0.1042, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3674, + "hfopenllm_v2/MMLU-PRO": 0.3658 + } + }, + { + "id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table", + "developer": "xukp20", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.662, + "hfopenllm_v2/BBH": 0.5, + "hfopenllm_v2/MATH Level 5": 0.0937, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3818, + "hfopenllm_v2/MMLU-PRO": 0.3615 + } + }, + { + "id": "xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001", + "developer": "xukp20", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5336, + "hfopenllm_v2/BBH": 0.4915, + "hfopenllm_v2/MATH Level 5": 0.0982, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.378, + "hfopenllm_v2/MMLU-PRO": 0.3625 + } + }, + { + "id": "xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002", + "developer": "xukp20", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6852, + "hfopenllm_v2/BBH": 0.5075, + "hfopenllm_v2/MATH Level 5": 0.0718, + "hfopenllm_v2/GPQA": 0.2584, + "hfopenllm_v2/MUSR": 0.3832, + "hfopenllm_v2/MMLU-PRO": 0.3621 + } + }, + { + "id": "xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001", + "developer": "xukp20", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5482, + "hfopenllm_v2/BBH": 0.4887, + "hfopenllm_v2/MATH Level 5": 0.0891, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3633, + "hfopenllm_v2/MMLU-PRO": 0.3671 + } + }, + { + "id": "xukp20/llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table", + "name": "llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table", + "developer": "xukp20", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.69, + "hfopenllm_v2/BBH": 0.4978, + "hfopenllm_v2/MATH Level 5": 0.105, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3673, + "hfopenllm_v2/MMLU-PRO": 0.3716 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/xwen-team.json b/data/developers/xwen-team.json new file mode 100644 index 0000000000000000000000000000000000000000..60d7a6f2f80b797def4b2a5bf64155dcbc5adbff --- /dev/null +++ b/data/developers/xwen-team.json @@ -0,0 +1,19 @@ +{ + "developer": "xwen-team", + "models": [ + { + "id": "xwen-team/Xwen-7B-Chat", + "name": "Xwen-7B-Chat", + "developer": "xwen-team", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6864, + "hfopenllm_v2/BBH": 0.5068, + "hfopenllm_v2/MATH Level 5": 0.4509, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3914, + "hfopenllm_v2/MMLU-PRO": 0.429 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/xxx777xxxASD.json b/data/developers/xxx777xxxASD.json new file mode 100644 index 0000000000000000000000000000000000000000..9624ed023877d4f46e564f2b1046890ec687436e --- /dev/null +++ b/data/developers/xxx777xxxASD.json @@ -0,0 +1,19 @@ +{ + "developer": "xxx777xxxASD", + "models": [ + { + "id": "xxx777xxxASD/L3.1-ClaudeMaid-4x8B", + "name": "L3.1-ClaudeMaid-4x8B", + "developer": "xxx777xxxASD", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6696, + "hfopenllm_v2/BBH": 0.5071, + "hfopenllm_v2/MATH Level 5": 0.1412, + "hfopenllm_v2/GPQA": 0.2911, + "hfopenllm_v2/MUSR": 0.4289, + "hfopenllm_v2/MMLU-PRO": 0.358 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/yam-peleg.json b/data/developers/yam-peleg.json new file mode 100644 index 0000000000000000000000000000000000000000..3161f95274f1998cddb90c53a59dce4097dbe0fd --- /dev/null +++ b/data/developers/yam-peleg.json @@ -0,0 +1,47 @@ +{ + "developer": "yam-peleg", + "models": [ + { + "id": "yam-peleg/Hebrew-Gemma-11B-Instruct", + "name": "Hebrew-Gemma-11B-Instruct", + "developer": "yam-peleg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3021, + "hfopenllm_v2/BBH": 0.4036, + "hfopenllm_v2/MATH Level 5": 0.0657, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.4089, + "hfopenllm_v2/MMLU-PRO": 0.2554 + } + }, + { + "id": "yam-peleg/Hebrew-Mistral-7B", + "name": "Hebrew-Mistral-7B", + "developer": "yam-peleg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2328, + "hfopenllm_v2/BBH": 0.4334, + "hfopenllm_v2/MATH Level 5": 0.0498, + "hfopenllm_v2/GPQA": 0.2794, + "hfopenllm_v2/MUSR": 0.3977, + "hfopenllm_v2/MMLU-PRO": 0.278 + } + }, + { + "id": "yam-peleg/Hebrew-Mistral-7B-200K", + "name": "Hebrew-Mistral-7B-200K", + "developer": "yam-peleg", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1856, + "hfopenllm_v2/BBH": 0.4149, + "hfopenllm_v2/MATH Level 5": 0.0234, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3765, + "hfopenllm_v2/MMLU-PRO": 0.2573 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/yandex.json b/data/developers/yandex.json new file mode 100644 index 0000000000000000000000000000000000000000..b112206ad32f8f95bdaef8cfd7791c6d8a424936 --- /dev/null +++ b/data/developers/yandex.json @@ -0,0 +1,28 @@ +{ + "developer": "yandex", + "models": [ + { + "id": "yandex/YaLM-100B", + "name": "YaLM 100B", + "developer": "yandex", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.075, + "helm_classic/MMLU": 0.243, + "helm_classic/BoolQ": 0.634, + "helm_classic/NarrativeQA": 0.252, + "helm_classic/NaturalQuestions (open-book)": 0.227, + "helm_classic/QuAC": 0.162, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.202, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": 0.017, + "helm_classic/XSUM": 0.021, + "helm_classic/IMDB": 0.836, + "helm_classic/CivilComments": 0.49, + "helm_classic/RAFT": 0.395 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/yanng1242.json b/data/developers/yanng1242.json new file mode 100644 index 0000000000000000000000000000000000000000..ef7b82c81b1cc22d6beb15bce60394b80153904c --- /dev/null +++ b/data/developers/yanng1242.json @@ -0,0 +1,19 @@ +{ + "developer": "yanng1242", + "models": [ + { + "id": "yanng1242/Marcoro14-7B-slerp", + "name": "Marcoro14-7B-slerp", + "developer": "yanng1242", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.406, + "hfopenllm_v2/BBH": 0.5252, + "hfopenllm_v2/MATH Level 5": 0.0748, + "hfopenllm_v2/GPQA": 0.3146, + "hfopenllm_v2/MUSR": 0.4686, + "hfopenllm_v2/MMLU-PRO": 0.3168 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/yasserrmd.json b/data/developers/yasserrmd.json new file mode 100644 index 0000000000000000000000000000000000000000..5564a88140dae2972387cd1d19a8316870614c39 --- /dev/null +++ b/data/developers/yasserrmd.json @@ -0,0 +1,33 @@ +{ + "developer": "yasserrmd", + "models": [ + { + "id": "yasserrmd/Coder-GRPO-3B", + "name": "Coder-GRPO-3B", + "developer": "yasserrmd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6208, + "hfopenllm_v2/BBH": 0.4469, + "hfopenllm_v2/MATH Level 5": 0.3202, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.4115, + "hfopenllm_v2/MMLU-PRO": 0.3197 + } + }, + { + "id": "yasserrmd/Text2SQL-1.5B", + "name": "Text2SQL-1.5B", + "developer": "yasserrmd", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2857, + "hfopenllm_v2/BBH": 0.3858, + "hfopenllm_v2/MATH Level 5": 0.068, + "hfopenllm_v2/GPQA": 0.2878, + "hfopenllm_v2/MUSR": 0.3942, + "hfopenllm_v2/MMLU-PRO": 0.2363 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ycros.json b/data/developers/ycros.json new file mode 100644 index 0000000000000000000000000000000000000000..9b83f7f872f197f98fd71538cda202af618f5659 --- /dev/null +++ b/data/developers/ycros.json @@ -0,0 +1,19 @@ +{ + "developer": "ycros", + "models": [ + { + "id": "ycros/BagelMIsteryTour-v2-8x7B", + "name": "BagelMIsteryTour-v2-8x7B", + "developer": "ycros", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5994, + "hfopenllm_v2/BBH": 0.5159, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4203, + "hfopenllm_v2/MMLU-PRO": 0.3473 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/yfzp.json b/data/developers/yfzp.json new file mode 100644 index 0000000000000000000000000000000000000000..af69616605e9d8864220b6bd0050288576f02596 --- /dev/null +++ b/data/developers/yfzp.json @@ -0,0 +1,117 @@ +{ + "developer": "yfzp", + "models": [ + { + "id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table", + "developer": "yfzp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6709, + "hfopenllm_v2/BBH": 0.4987, + "hfopenllm_v2/MATH Level 5": 0.1118, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3727, + "hfopenllm_v2/MMLU-PRO": 0.3716 + } + }, + { + "id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table", + "developer": "yfzp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7333, + "hfopenllm_v2/BBH": 0.508, + "hfopenllm_v2/MATH Level 5": 0.1035, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3806, + "hfopenllm_v2/MMLU-PRO": 0.3748 + } + }, + { + "id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table", + "developer": "yfzp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6785, + "hfopenllm_v2/BBH": 0.4941, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3647, + "hfopenllm_v2/MMLU-PRO": 0.3718 + } + }, + { + "id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table", + "name": "Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table", + "developer": "yfzp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7132, + "hfopenllm_v2/BBH": 0.5025, + "hfopenllm_v2/MATH Level 5": 0.0989, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3713, + "hfopenllm_v2/MMLU-PRO": 0.3683 + } + }, + { + "id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001", + "developer": "yfzp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6496, + "hfopenllm_v2/BBH": 0.4979, + "hfopenllm_v2/MATH Level 5": 0.1012, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.378, + "hfopenllm_v2/MMLU-PRO": 0.372 + } + }, + { + "id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002", + "developer": "yfzp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7196, + "hfopenllm_v2/BBH": 0.5045, + "hfopenllm_v2/MATH Level 5": 0.0876, + "hfopenllm_v2/GPQA": 0.2601, + "hfopenllm_v2/MUSR": 0.3831, + "hfopenllm_v2/MMLU-PRO": 0.3734 + } + }, + { + "id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001", + "developer": "yfzp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6504, + "hfopenllm_v2/BBH": 0.4958, + "hfopenllm_v2/MATH Level 5": 0.0937, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.366, + "hfopenllm_v2/MMLU-PRO": 0.3703 + } + }, + { + "id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002", + "developer": "yfzp", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7016, + "hfopenllm_v2/BBH": 0.4992, + "hfopenllm_v2/MATH Level 5": 0.0869, + "hfopenllm_v2/GPQA": 0.2592, + "hfopenllm_v2/MUSR": 0.3779, + "hfopenllm_v2/MMLU-PRO": 0.3669 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/yifAI.json b/data/developers/yifAI.json new file mode 100644 index 0000000000000000000000000000000000000000..0ffcb54953079f72e995c1ad81eb2c4aeb958202 --- /dev/null +++ b/data/developers/yifAI.json @@ -0,0 +1,19 @@ +{ + "developer": "yifAI", + "models": [ + { + "id": "yifAI/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002", + "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002", + "developer": "yifAI", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.649, + "hfopenllm_v2/BBH": 0.4915, + "hfopenllm_v2/MATH Level 5": 0.0755, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3899, + "hfopenllm_v2/MMLU-PRO": 0.352 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ylalain.json b/data/developers/ylalain.json new file mode 100644 index 0000000000000000000000000000000000000000..4f5ac89537e2a3fe1a315987b6c78585ba8a18c6 --- /dev/null +++ b/data/developers/ylalain.json @@ -0,0 +1,19 @@ +{ + "developer": "ylalain", + "models": [ + { + "id": "ylalain/ECE-PRYMMAL-YL-1B-SLERP-V8", + "name": "ECE-PRYMMAL-YL-1B-SLERP-V8", + "developer": "ylalain", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1505, + "hfopenllm_v2/BBH": 0.3976, + "hfopenllm_v2/MATH Level 5": 0.0045, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3875, + "hfopenllm_v2/MMLU-PRO": 0.2384 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/ymcki.json b/data/developers/ymcki.json new file mode 100644 index 0000000000000000000000000000000000000000..b70afb964f4764611c0eaf7ed8372d4914cef9c9 --- /dev/null +++ b/data/developers/ymcki.json @@ -0,0 +1,159 @@ +{ + "developer": "ymcki", + "models": [ + { + "id": "ymcki/Llama-3.1-8B-GRPO-Instruct", + "name": "Llama-3.1-8B-GRPO-Instruct", + "developer": "ymcki", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7445, + "hfopenllm_v2/BBH": 0.5132, + "hfopenllm_v2/MATH Level 5": 0.2024, + "hfopenllm_v2/GPQA": 0.2945, + "hfopenllm_v2/MUSR": 0.3817, + "hfopenllm_v2/MMLU-PRO": 0.3738 + } + }, + { + "id": "ymcki/Llama-3.1-8B-SFT-GRPO-Instruct", + "name": "Llama-3.1-8B-SFT-GRPO-Instruct", + "developer": "ymcki", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3354, + "hfopenllm_v2/BBH": 0.3126, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3526, + "hfopenllm_v2/MMLU-PRO": 0.1098 + } + }, + { + "id": "ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18", + "name": "gemma-2-2b-ORPO-jpn-it-abliterated-18", + "developer": "ymcki", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4631, + "hfopenllm_v2/BBH": 0.4053, + "hfopenllm_v2/MATH Level 5": 0.0431, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3754, + "hfopenllm_v2/MMLU-PRO": 0.2345 + } + }, + { + "id": "ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18-merge", + "name": "gemma-2-2b-ORPO-jpn-it-abliterated-18-merge", + "developer": "ymcki", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5218, + "hfopenllm_v2/BBH": 0.4147, + "hfopenllm_v2/MATH Level 5": 0.0544, + "hfopenllm_v2/GPQA": 0.2836, + "hfopenllm_v2/MUSR": 0.3514, + "hfopenllm_v2/MMLU-PRO": 0.2461 + } + }, + { + "id": "ymcki/gemma-2-2b-jpn-it-abliterated-17", + "name": "gemma-2-2b-jpn-it-abliterated-17", + "developer": "ymcki", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5082, + "hfopenllm_v2/BBH": 0.4076, + "hfopenllm_v2/MATH Level 5": 0.0385, + "hfopenllm_v2/GPQA": 0.2718, + "hfopenllm_v2/MUSR": 0.3701, + "hfopenllm_v2/MMLU-PRO": 0.2455 + } + }, + { + "id": "ymcki/gemma-2-2b-jpn-it-abliterated-17-18-24", + "name": "gemma-2-2b-jpn-it-abliterated-17-18-24", + "developer": "ymcki", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5055, + "hfopenllm_v2/BBH": 0.3812, + "hfopenllm_v2/MATH Level 5": 0.0257, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.3502, + "hfopenllm_v2/MMLU-PRO": 0.2282 + } + }, + { + "id": "ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO", + "name": "gemma-2-2b-jpn-it-abliterated-17-ORPO", + "developer": "ymcki", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4748, + "hfopenllm_v2/BBH": 0.3898, + "hfopenllm_v2/MATH Level 5": 0.0619, + "hfopenllm_v2/GPQA": 0.2743, + "hfopenllm_v2/MUSR": 0.3768, + "hfopenllm_v2/MMLU-PRO": 0.2191 + } + }, + { + "id": "ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca", + "name": "gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca", + "developer": "ymcki", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.3065, + "hfopenllm_v2/BBH": 0.4072, + "hfopenllm_v2/MATH Level 5": 0.0325, + "hfopenllm_v2/GPQA": 0.2693, + "hfopenllm_v2/MUSR": 0.3969, + "hfopenllm_v2/MMLU-PRO": 0.2249 + } + }, + { + "id": "ymcki/gemma-2-2b-jpn-it-abliterated-18", + "name": "gemma-2-2b-jpn-it-abliterated-18", + "developer": "ymcki", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5175, + "hfopenllm_v2/BBH": 0.4132, + "hfopenllm_v2/MATH Level 5": 0.0446, + "hfopenllm_v2/GPQA": 0.2735, + "hfopenllm_v2/MUSR": 0.3742, + "hfopenllm_v2/MMLU-PRO": 0.2505 + } + }, + { + "id": "ymcki/gemma-2-2b-jpn-it-abliterated-18-ORPO", + "name": "gemma-2-2b-jpn-it-abliterated-18-ORPO", + "developer": "ymcki", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4742, + "hfopenllm_v2/BBH": 0.4039, + "hfopenllm_v2/MATH Level 5": 0.0468, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3953, + "hfopenllm_v2/MMLU-PRO": 0.2185 + } + }, + { + "id": "ymcki/gemma-2-2b-jpn-it-abliterated-24", + "name": "gemma-2-2b-jpn-it-abliterated-24", + "developer": "ymcki", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4979, + "hfopenllm_v2/BBH": 0.411, + "hfopenllm_v2/MATH Level 5": 0.0438, + "hfopenllm_v2/GPQA": 0.2777, + "hfopenllm_v2/MUSR": 0.3915, + "hfopenllm_v2/MMLU-PRO": 0.2473 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/yuchenxie.json b/data/developers/yuchenxie.json new file mode 100644 index 0000000000000000000000000000000000000000..34dd07aac11cacaf468d8ba95f3cd57442859f6c --- /dev/null +++ b/data/developers/yuchenxie.json @@ -0,0 +1,33 @@ +{ + "developer": "yuchenxie", + "models": [ + { + "id": "yuchenxie/ArlowGPT-3B-Multilingual", + "name": "ArlowGPT-3B-Multilingual", + "developer": "yuchenxie", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.6395, + "hfopenllm_v2/BBH": 0.4301, + "hfopenllm_v2/MATH Level 5": 0.1125, + "hfopenllm_v2/GPQA": 0.2802, + "hfopenllm_v2/MUSR": 0.3727, + "hfopenllm_v2/MMLU-PRO": 0.2817 + } + }, + { + "id": "yuchenxie/ArlowGPT-8B", + "name": "ArlowGPT-8B", + "developer": "yuchenxie", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7847, + "hfopenllm_v2/BBH": 0.508, + "hfopenllm_v2/MATH Level 5": 0.2039, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.3882, + "hfopenllm_v2/MMLU-PRO": 0.3787 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/yuvraj17.json b/data/developers/yuvraj17.json new file mode 100644 index 0000000000000000000000000000000000000000..6cff91e0bff775c6a11b30e9637ddcb1cc121d7d --- /dev/null +++ b/data/developers/yuvraj17.json @@ -0,0 +1,47 @@ +{ + "developer": "yuvraj17", + "models": [ + { + "id": "yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO", + "name": "Llama3-8B-SuperNova-Spectrum-Hermes-DPO", + "developer": "yuvraj17", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4691, + "hfopenllm_v2/BBH": 0.44, + "hfopenllm_v2/MATH Level 5": 0.0566, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.4012, + "hfopenllm_v2/MMLU-PRO": 0.2635 + } + }, + { + "id": "yuvraj17/Llama3-8B-SuperNova-Spectrum-dare_ties", + "name": "Llama3-8B-SuperNova-Spectrum-dare_ties", + "developer": "yuvraj17", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4013, + "hfopenllm_v2/BBH": 0.4616, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.4211, + "hfopenllm_v2/MMLU-PRO": 0.3574 + } + }, + { + "id": "yuvraj17/Llama3-8B-abliterated-Spectrum-slerp", + "name": "Llama3-8B-abliterated-Spectrum-slerp", + "developer": "yuvraj17", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.2885, + "hfopenllm_v2/BBH": 0.4978, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.3012, + "hfopenllm_v2/MUSR": 0.3998, + "hfopenllm_v2/MMLU-PRO": 0.3257 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/zai-org.json b/data/developers/zai-org.json new file mode 100644 index 0000000000000000000000000000000000000000..97d318c43138cf1428c836472cd08b522e456bd3 --- /dev/null +++ b/data/developers/zai-org.json @@ -0,0 +1,19 @@ +{ + "developer": "zai-org", + "models": [ + { + "id": "zai-org/glm-4.5-air-fp8", + "name": "GLM-4.5-Air-FP8", + "developer": "zai-org", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_capabilities/Mean score": 0.67, + "helm_capabilities/MMLU-Pro": 0.762, + "helm_capabilities/GPQA": 0.594, + "helm_capabilities/IFEval": 0.812, + "helm_capabilities/WildBench": 0.789, + "helm_capabilities/Omni-MATH": 0.391 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/zake7749.json b/data/developers/zake7749.json new file mode 100644 index 0000000000000000000000000000000000000000..a63543b72cbd283b991e4cdc1c436c2e985c9c93 --- /dev/null +++ b/data/developers/zake7749.json @@ -0,0 +1,33 @@ +{ + "developer": "zake7749", + "models": [ + { + "id": "zake7749/gemma-2-2b-it-chinese-kyara-dpo", + "name": "gemma-2-2b-it-chinese-kyara-dpo", + "developer": "zake7749", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.5382, + "hfopenllm_v2/BBH": 0.4257, + "hfopenllm_v2/MATH Level 5": 0.0838, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.4576, + "hfopenllm_v2/MMLU-PRO": 0.2573 + } + }, + { + "id": "zake7749/gemma-2-9b-it-chinese-kyara", + "name": "gemma-2-9b-it-chinese-kyara", + "developer": "zake7749", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1764, + "hfopenllm_v2/BBH": 0.5954, + "hfopenllm_v2/MATH Level 5": 0.105, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4242, + "hfopenllm_v2/MMLU-PRO": 0.4179 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/zelk12.json b/data/developers/zelk12.json new file mode 100644 index 0000000000000000000000000000000000000000..4037af7a97ada5692b99451903fcb046865830b1 --- /dev/null +++ b/data/developers/zelk12.json @@ -0,0 +1,1097 @@ +{ + "developer": "zelk12", + "models": [ + { + "id": "zelk12/Gemma-2-TM-9B", + "name": "Gemma-2-TM-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8045, + "hfopenllm_v2/BBH": 0.5987, + "hfopenllm_v2/MATH Level 5": 0.2024, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.4152, + "hfopenllm_v2/MMLU-PRO": 0.4088 + } + }, + { + "id": "zelk12/MT-Gen1-gemma-2-9B", + "name": "MT-Gen1-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7886, + "hfopenllm_v2/BBH": 0.61, + "hfopenllm_v2/MATH Level 5": 0.2221, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.4217, + "hfopenllm_v2/MMLU-PRO": 0.4381 + } + }, + { + "id": "zelk12/MT-Gen2-GI-gemma-2-9B", + "name": "MT-Gen2-GI-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7914, + "hfopenllm_v2/BBH": 0.6096, + "hfopenllm_v2/MATH Level 5": 0.2205, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4283, + "hfopenllm_v2/MMLU-PRO": 0.4356 + } + }, + { + "id": "zelk12/MT-Gen2-gemma-2-9B", + "name": "MT-Gen2-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7907, + "hfopenllm_v2/BBH": 0.61, + "hfopenllm_v2/MATH Level 5": 0.219, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.4323, + "hfopenllm_v2/MMLU-PRO": 0.4387 + } + }, + { + "id": "zelk12/MT-Gen3-gemma-2-9B", + "name": "MT-Gen3-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.802, + "hfopenllm_v2/BBH": 0.6097, + "hfopenllm_v2/MATH Level 5": 0.2296, + "hfopenllm_v2/GPQA": 0.349, + "hfopenllm_v2/MUSR": 0.4217, + "hfopenllm_v2/MMLU-PRO": 0.4356 + } + }, + { + "id": "zelk12/MT-Gen4-gemma-2-9B", + "name": "MT-Gen4-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7883, + "hfopenllm_v2/BBH": 0.611, + "hfopenllm_v2/MATH Level 5": 0.2236, + "hfopenllm_v2/GPQA": 0.3549, + "hfopenllm_v2/MUSR": 0.4228, + "hfopenllm_v2/MMLU-PRO": 0.4387 + } + }, + { + "id": "zelk12/MT-Gen5-gemma-2-9B", + "name": "MT-Gen5-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7923, + "hfopenllm_v2/BBH": 0.6133, + "hfopenllm_v2/MATH Level 5": 0.2153, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4202, + "hfopenllm_v2/MMLU-PRO": 0.4402 + } + }, + { + "id": "zelk12/MT-Gen6-gemma-2-9B", + "name": "MT-Gen6-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1616, + "hfopenllm_v2/BBH": 0.5845, + "hfopenllm_v2/MATH Level 5": 0.0823, + "hfopenllm_v2/GPQA": 0.3331, + "hfopenllm_v2/MUSR": 0.4069, + "hfopenllm_v2/MMLU-PRO": 0.4166 + } + }, + { + "id": "zelk12/MT-Gen6fix-gemma-2-9B", + "name": "MT-Gen6fix-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1576, + "hfopenllm_v2/BBH": 0.5917, + "hfopenllm_v2/MATH Level 5": 0.0816, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.4084, + "hfopenllm_v2/MMLU-PRO": 0.412 + } + }, + { + "id": "zelk12/MT-Gen7-gemma-2-9B", + "name": "MT-Gen7-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1664, + "hfopenllm_v2/BBH": 0.5935, + "hfopenllm_v2/MATH Level 5": 0.0891, + "hfopenllm_v2/GPQA": 0.3356, + "hfopenllm_v2/MUSR": 0.4098, + "hfopenllm_v2/MMLU-PRO": 0.4122 + } + }, + { + "id": "zelk12/MT-Max-Merge_02012025163610-gemma-2-9B", + "name": "MT-Max-Merge_02012025163610-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7907, + "hfopenllm_v2/BBH": 0.6142, + "hfopenllm_v2/MATH Level 5": 0.2213, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4228, + "hfopenllm_v2/MMLU-PRO": 0.4396 + } + }, + { + "id": "zelk12/MT-Merge-gemma-2-9B", + "name": "MT-Merge-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8035, + "hfopenllm_v2/BBH": 0.6118, + "hfopenllm_v2/MATH Level 5": 0.2205, + "hfopenllm_v2/GPQA": 0.3482, + "hfopenllm_v2/MUSR": 0.4256, + "hfopenllm_v2/MMLU-PRO": 0.4362 + } + }, + { + "id": "zelk12/MT-Merge1-gemma-2-9B", + "name": "MT-Merge1-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7901, + "hfopenllm_v2/BBH": 0.61, + "hfopenllm_v2/MATH Level 5": 0.2289, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4244, + "hfopenllm_v2/MMLU-PRO": 0.4374 + } + }, + { + "id": "zelk12/MT-Merge2-MU-gemma-2-MTg2MT1g2-9B", + "name": "MT-Merge2-MU-gemma-2-MTg2MT1g2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7956, + "hfopenllm_v2/BBH": 0.6084, + "hfopenllm_v2/MATH Level 5": 0.2183, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4322, + "hfopenllm_v2/MMLU-PRO": 0.4373 + } + }, + { + "id": "zelk12/MT-Merge2-gemma-2-9B", + "name": "MT-Merge2-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7877, + "hfopenllm_v2/BBH": 0.6107, + "hfopenllm_v2/MATH Level 5": 0.2349, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4217, + "hfopenllm_v2/MMLU-PRO": 0.4382 + } + }, + { + "id": "zelk12/MT-Merge3-gemma-2-9B", + "name": "MT-Merge3-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7859, + "hfopenllm_v2/BBH": 0.6102, + "hfopenllm_v2/MATH Level 5": 0.2205, + "hfopenllm_v2/GPQA": 0.349, + "hfopenllm_v2/MUSR": 0.4258, + "hfopenllm_v2/MMLU-PRO": 0.4373 + } + }, + { + "id": "zelk12/MT-Merge4-gemma-2-9B", + "name": "MT-Merge4-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7807, + "hfopenllm_v2/BBH": 0.6118, + "hfopenllm_v2/MATH Level 5": 0.2168, + "hfopenllm_v2/GPQA": 0.3523, + "hfopenllm_v2/MUSR": 0.4294, + "hfopenllm_v2/MMLU-PRO": 0.439 + } + }, + { + "id": "zelk12/MT-Merge5-gemma-2-9B", + "name": "MT-Merge5-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7844, + "hfopenllm_v2/BBH": 0.6123, + "hfopenllm_v2/MATH Level 5": 0.2183, + "hfopenllm_v2/GPQA": 0.3532, + "hfopenllm_v2/MUSR": 0.4281, + "hfopenllm_v2/MMLU-PRO": 0.4387 + } + }, + { + "id": "zelk12/MT-Merge6-gemma-2-9B", + "name": "MT-Merge6-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1695, + "hfopenllm_v2/BBH": 0.5949, + "hfopenllm_v2/MATH Level 5": 0.0801, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4098, + "hfopenllm_v2/MMLU-PRO": 0.4115 + } + }, + { + "id": "zelk12/MT-gemma-2-9B", + "name": "MT-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7968, + "hfopenllm_v2/BBH": 0.6064, + "hfopenllm_v2/MATH Level 5": 0.2054, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4071, + "hfopenllm_v2/MMLU-PRO": 0.4224 + } + }, + { + "id": "zelk12/MT1-Gen1-gemma-2-9B", + "name": "MT1-Gen1-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7974, + "hfopenllm_v2/BBH": 0.6118, + "hfopenllm_v2/MATH Level 5": 0.2243, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.431, + "hfopenllm_v2/MMLU-PRO": 0.4376 + } + }, + { + "id": "zelk12/MT1-Gen2-gemma-2-9B", + "name": "MT1-Gen2-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7984, + "hfopenllm_v2/BBH": 0.6096, + "hfopenllm_v2/MATH Level 5": 0.2251, + "hfopenllm_v2/GPQA": 0.3523, + "hfopenllm_v2/MUSR": 0.4284, + "hfopenllm_v2/MMLU-PRO": 0.4355 + } + }, + { + "id": "zelk12/MT1-Gen3-gemma-2-9B", + "name": "MT1-Gen3-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.796, + "hfopenllm_v2/BBH": 0.6102, + "hfopenllm_v2/MATH Level 5": 0.2243, + "hfopenllm_v2/GPQA": 0.349, + "hfopenllm_v2/MUSR": 0.4243, + "hfopenllm_v2/MMLU-PRO": 0.4349 + } + }, + { + "id": "zelk12/MT1-Gen4-gemma-2-9B", + "name": "MT1-Gen4-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7941, + "hfopenllm_v2/BBH": 0.6058, + "hfopenllm_v2/MATH Level 5": 0.216, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4231, + "hfopenllm_v2/MMLU-PRO": 0.4286 + } + }, + { + "id": "zelk12/MT1-Gen5-IF-gemma-2-S2DMv1-9B", + "name": "MT1-Gen5-IF-gemma-2-S2DMv1-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7929, + "hfopenllm_v2/BBH": 0.6, + "hfopenllm_v2/MATH Level 5": 0.2032, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.4245, + "hfopenllm_v2/MMLU-PRO": 0.4218 + } + }, + { + "id": "zelk12/MT1-Gen5-gemma-2-9B", + "name": "MT1-Gen5-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7795, + "hfopenllm_v2/BBH": 0.6017, + "hfopenllm_v2/MATH Level 5": 0.2077, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.4191, + "hfopenllm_v2/MMLU-PRO": 0.4222 + } + }, + { + "id": "zelk12/MT1-Gen6-gemma-2-9B", + "name": "MT1-Gen6-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1634, + "hfopenllm_v2/BBH": 0.5944, + "hfopenllm_v2/MATH Level 5": 0.0808, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4044, + "hfopenllm_v2/MMLU-PRO": 0.4133 + } + }, + { + "id": "zelk12/MT1-Gen7-gemma-2-9B", + "name": "MT1-Gen7-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1634, + "hfopenllm_v2/BBH": 0.5938, + "hfopenllm_v2/MATH Level 5": 0.0831, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4111, + "hfopenllm_v2/MMLU-PRO": 0.4145 + } + }, + { + "id": "zelk12/MT1-Max-Merge_02012025163610-gemma-2-9B", + "name": "MT1-Max-Merge_02012025163610-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7929, + "hfopenllm_v2/BBH": 0.6123, + "hfopenllm_v2/MATH Level 5": 0.2228, + "hfopenllm_v2/GPQA": 0.3549, + "hfopenllm_v2/MUSR": 0.4255, + "hfopenllm_v2/MMLU-PRO": 0.4382 + } + }, + { + "id": "zelk12/MT1-gemma-2-9B", + "name": "MT1-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7947, + "hfopenllm_v2/BBH": 0.6109, + "hfopenllm_v2/MATH Level 5": 0.2236, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4322, + "hfopenllm_v2/MMLU-PRO": 0.4358 + } + }, + { + "id": "zelk12/MT2-Gen1-gemma-2-9B", + "name": "MT2-Gen1-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7856, + "hfopenllm_v2/BBH": 0.6101, + "hfopenllm_v2/MATH Level 5": 0.2213, + "hfopenllm_v2/GPQA": 0.3431, + "hfopenllm_v2/MUSR": 0.4243, + "hfopenllm_v2/MMLU-PRO": 0.4377 + } + }, + { + "id": "zelk12/MT2-Gen2-gemma-2-9B", + "name": "MT2-Gen2-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7889, + "hfopenllm_v2/BBH": 0.6093, + "hfopenllm_v2/MATH Level 5": 0.2183, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.427, + "hfopenllm_v2/MMLU-PRO": 0.4388 + } + }, + { + "id": "zelk12/MT2-Gen3-gemma-2-9B", + "name": "MT2-Gen3-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.781, + "hfopenllm_v2/BBH": 0.6105, + "hfopenllm_v2/MATH Level 5": 0.2107, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.4231, + "hfopenllm_v2/MMLU-PRO": 0.4374 + } + }, + { + "id": "zelk12/MT2-Gen4-gemma-2-9B", + "name": "MT2-Gen4-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7896, + "hfopenllm_v2/BBH": 0.6097, + "hfopenllm_v2/MATH Level 5": 0.2236, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4125, + "hfopenllm_v2/MMLU-PRO": 0.4321 + } + }, + { + "id": "zelk12/MT2-Gen5-gemma-2-9B", + "name": "MT2-Gen5-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7749, + "hfopenllm_v2/BBH": 0.6064, + "hfopenllm_v2/MATH Level 5": 0.2107, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4244, + "hfopenllm_v2/MMLU-PRO": 0.4302 + } + }, + { + "id": "zelk12/MT2-Gen6-gemma-2-9B", + "name": "MT2-Gen6-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1664, + "hfopenllm_v2/BBH": 0.596, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4137, + "hfopenllm_v2/MMLU-PRO": 0.421 + } + }, + { + "id": "zelk12/MT2-Gen7-gemma-2-9B", + "name": "MT2-Gen7-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1762, + "hfopenllm_v2/BBH": 0.6079, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.3549, + "hfopenllm_v2/MUSR": 0.4203, + "hfopenllm_v2/MMLU-PRO": 0.4311 + } + }, + { + "id": "zelk12/MT2-Max-Merge_02012025163610-gemma-2-9B", + "name": "MT2-Max-Merge_02012025163610-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7901, + "hfopenllm_v2/BBH": 0.6108, + "hfopenllm_v2/MATH Level 5": 0.2243, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4228, + "hfopenllm_v2/MMLU-PRO": 0.4391 + } + }, + { + "id": "zelk12/MT2-gemma-2-9B", + "name": "MT2-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7886, + "hfopenllm_v2/BBH": 0.6115, + "hfopenllm_v2/MATH Level 5": 0.2213, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4217, + "hfopenllm_v2/MMLU-PRO": 0.4368 + } + }, + { + "id": "zelk12/MT3-Gen1-gemma-2-9B", + "name": "MT3-Gen1-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7838, + "hfopenllm_v2/BBH": 0.6107, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.3465, + "hfopenllm_v2/MUSR": 0.4151, + "hfopenllm_v2/MMLU-PRO": 0.4327 + } + }, + { + "id": "zelk12/MT3-Gen2-gemma-2-9B", + "name": "MT3-Gen2-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7843, + "hfopenllm_v2/BBH": 0.6091, + "hfopenllm_v2/MATH Level 5": 0.2236, + "hfopenllm_v2/GPQA": 0.3574, + "hfopenllm_v2/MUSR": 0.4111, + "hfopenllm_v2/MMLU-PRO": 0.4333 + } + }, + { + "id": "zelk12/MT3-Gen3-gemma-2-9B", + "name": "MT3-Gen3-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7856, + "hfopenllm_v2/BBH": 0.6089, + "hfopenllm_v2/MATH Level 5": 0.2153, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4258, + "hfopenllm_v2/MMLU-PRO": 0.4303 + } + }, + { + "id": "zelk12/MT3-Gen4-gemma-2-9B", + "name": "MT3-Gen4-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7737, + "hfopenllm_v2/BBH": 0.6101, + "hfopenllm_v2/MATH Level 5": 0.2062, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4476, + "hfopenllm_v2/MMLU-PRO": 0.4387 + } + }, + { + "id": "zelk12/MT3-Gen5-gemma-2-9B", + "name": "MT3-Gen5-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.799, + "hfopenllm_v2/BBH": 0.6099, + "hfopenllm_v2/MATH Level 5": 0.2266, + "hfopenllm_v2/GPQA": 0.3532, + "hfopenllm_v2/MUSR": 0.4191, + "hfopenllm_v2/MMLU-PRO": 0.4317 + } + }, + { + "id": "zelk12/MT3-Gen5-gemma-2-9B_v1", + "name": "MT3-Gen5-gemma-2-9B_v1", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7996, + "hfopenllm_v2/BBH": 0.6113, + "hfopenllm_v2/MATH Level 5": 0.2228, + "hfopenllm_v2/GPQA": 0.349, + "hfopenllm_v2/MUSR": 0.4204, + "hfopenllm_v2/MMLU-PRO": 0.4359 + } + }, + { + "id": "zelk12/MT3-Gen6-gemma-2-9B", + "name": "MT3-Gen6-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1762, + "hfopenllm_v2/BBH": 0.602, + "hfopenllm_v2/MATH Level 5": 0.0884, + "hfopenllm_v2/GPQA": 0.3431, + "hfopenllm_v2/MUSR": 0.4126, + "hfopenllm_v2/MMLU-PRO": 0.4102 + } + }, + { + "id": "zelk12/MT3-Max-Merge_02012025163610-gemma-2-9B", + "name": "MT3-Max-Merge_02012025163610-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1762, + "hfopenllm_v2/BBH": 0.6123, + "hfopenllm_v2/MATH Level 5": 0.1012, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4255, + "hfopenllm_v2/MMLU-PRO": 0.4389 + } + }, + { + "id": "zelk12/MT3-gemma-2-9B", + "name": "MT3-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7786, + "hfopenllm_v2/BBH": 0.6131, + "hfopenllm_v2/MATH Level 5": 0.2168, + "hfopenllm_v2/GPQA": 0.3448, + "hfopenllm_v2/MUSR": 0.4243, + "hfopenllm_v2/MMLU-PRO": 0.4327 + } + }, + { + "id": "zelk12/MT4-Gen1-gemma-2-9B", + "name": "MT4-Gen1-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7895, + "hfopenllm_v2/BBH": 0.6094, + "hfopenllm_v2/MATH Level 5": 0.2198, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.4322, + "hfopenllm_v2/MMLU-PRO": 0.4389 + } + }, + { + "id": "zelk12/MT4-Gen2-gemma-2-9B", + "name": "MT4-Gen2-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8051, + "hfopenllm_v2/BBH": 0.6108, + "hfopenllm_v2/MATH Level 5": 0.2326, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4257, + "hfopenllm_v2/MMLU-PRO": 0.4368 + } + }, + { + "id": "zelk12/MT4-Gen3-gemma-2-9B", + "name": "MT4-Gen3-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7841, + "hfopenllm_v2/BBH": 0.6087, + "hfopenllm_v2/MATH Level 5": 0.219, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.4243, + "hfopenllm_v2/MMLU-PRO": 0.4381 + } + }, + { + "id": "zelk12/MT4-Gen4-gemma-2-9B", + "name": "MT4-Gen4-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7874, + "hfopenllm_v2/BBH": 0.6076, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.3523, + "hfopenllm_v2/MUSR": 0.4244, + "hfopenllm_v2/MMLU-PRO": 0.4323 + } + }, + { + "id": "zelk12/MT4-Gen5-gemma-2-9B", + "name": "MT4-Gen5-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7789, + "hfopenllm_v2/BBH": 0.6107, + "hfopenllm_v2/MATH Level 5": 0.2266, + "hfopenllm_v2/GPQA": 0.3565, + "hfopenllm_v2/MUSR": 0.4268, + "hfopenllm_v2/MMLU-PRO": 0.4384 + } + }, + { + "id": "zelk12/MT4-Max-Merge_02012025163610-gemma-2-9B", + "name": "MT4-Max-Merge_02012025163610-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1771, + "hfopenllm_v2/BBH": 0.612, + "hfopenllm_v2/MATH Level 5": 0.0952, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4228, + "hfopenllm_v2/MMLU-PRO": 0.4391 + } + }, + { + "id": "zelk12/MT4-gemma-2-9B", + "name": "MT4-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7762, + "hfopenllm_v2/BBH": 0.6073, + "hfopenllm_v2/MATH Level 5": 0.2085, + "hfopenllm_v2/GPQA": 0.3381, + "hfopenllm_v2/MUSR": 0.4309, + "hfopenllm_v2/MMLU-PRO": 0.4366 + } + }, + { + "id": "zelk12/MT5-Gen1-gemma-2-9B", + "name": "MT5-Gen1-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7831, + "hfopenllm_v2/BBH": 0.611, + "hfopenllm_v2/MATH Level 5": 0.2213, + "hfopenllm_v2/GPQA": 0.3473, + "hfopenllm_v2/MUSR": 0.4204, + "hfopenllm_v2/MMLU-PRO": 0.4368 + } + }, + { + "id": "zelk12/MT5-Gen2-gemma-2-9B", + "name": "MT5-Gen2-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7962, + "hfopenllm_v2/BBH": 0.6105, + "hfopenllm_v2/MATH Level 5": 0.2205, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4163, + "hfopenllm_v2/MMLU-PRO": 0.4379 + } + }, + { + "id": "zelk12/MT5-Gen3-gemma-2-9B", + "name": "MT5-Gen3-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7825, + "hfopenllm_v2/BBH": 0.609, + "hfopenllm_v2/MATH Level 5": 0.2168, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4231, + "hfopenllm_v2/MMLU-PRO": 0.4375 + } + }, + { + "id": "zelk12/MT5-Gen4-gemma-2-9B", + "name": "MT5-Gen4-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7835, + "hfopenllm_v2/BBH": 0.6131, + "hfopenllm_v2/MATH Level 5": 0.2243, + "hfopenllm_v2/GPQA": 0.3532, + "hfopenllm_v2/MUSR": 0.4228, + "hfopenllm_v2/MMLU-PRO": 0.4397 + } + }, + { + "id": "zelk12/MT5-Gen5-gemma-2-9B", + "name": "MT5-Gen5-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7947, + "hfopenllm_v2/BBH": 0.6112, + "hfopenllm_v2/MATH Level 5": 0.2258, + "hfopenllm_v2/GPQA": 0.3482, + "hfopenllm_v2/MUSR": 0.4191, + "hfopenllm_v2/MMLU-PRO": 0.4329 + } + }, + { + "id": "zelk12/MT5-Max-Merge_02012025163610-gemma-2-9B", + "name": "MT5-Max-Merge_02012025163610-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1762, + "hfopenllm_v2/BBH": 0.6127, + "hfopenllm_v2/MATH Level 5": 0.0982, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4228, + "hfopenllm_v2/MMLU-PRO": 0.439 + } + }, + { + "id": "zelk12/MT5-gemma-2-9B", + "name": "MT5-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8048, + "hfopenllm_v2/BBH": 0.6112, + "hfopenllm_v2/MATH Level 5": 0.2258, + "hfopenllm_v2/GPQA": 0.3431, + "hfopenllm_v2/MUSR": 0.4204, + "hfopenllm_v2/MMLU-PRO": 0.4367 + } + }, + { + "id": "zelk12/MTM-Merge-gemma-2-9B", + "name": "MTM-Merge-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7798, + "hfopenllm_v2/BBH": 0.6133, + "hfopenllm_v2/MATH Level 5": 0.2175, + "hfopenllm_v2/GPQA": 0.3549, + "hfopenllm_v2/MUSR": 0.4268, + "hfopenllm_v2/MMLU-PRO": 0.4388 + } + }, + { + "id": "zelk12/MTMaMe-Merge_02012025163610-gemma-2-9B", + "name": "MTMaMe-Merge_02012025163610-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1786, + "hfopenllm_v2/BBH": 0.6117, + "hfopenllm_v2/MATH Level 5": 0.0959, + "hfopenllm_v2/GPQA": 0.3523, + "hfopenllm_v2/MUSR": 0.4241, + "hfopenllm_v2/MMLU-PRO": 0.4382 + } + }, + { + "id": "zelk12/Rv0.4DMv1t0.25-gemma-2-9B", + "name": "Rv0.4DMv1t0.25-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7497, + "hfopenllm_v2/BBH": 0.607, + "hfopenllm_v2/MATH Level 5": 0.2258, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4309, + "hfopenllm_v2/MMLU-PRO": 0.4401 + } + }, + { + "id": "zelk12/Rv0.4DMv1t0.25Tt0.25-gemma-2-9B", + "name": "Rv0.4DMv1t0.25Tt0.25-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7646, + "hfopenllm_v2/BBH": 0.6098, + "hfopenllm_v2/MATH Level 5": 0.2069, + "hfopenllm_v2/GPQA": 0.3423, + "hfopenllm_v2/MUSR": 0.4283, + "hfopenllm_v2/MMLU-PRO": 0.4347 + } + }, + { + "id": "zelk12/Rv0.4MT4g2-gemma-2-9B", + "name": "Rv0.4MT4g2-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.732, + "hfopenllm_v2/BBH": 0.6041, + "hfopenllm_v2/MATH Level 5": 0.1949, + "hfopenllm_v2/GPQA": 0.3532, + "hfopenllm_v2/MUSR": 0.4231, + "hfopenllm_v2/MMLU-PRO": 0.4417 + } + }, + { + "id": "zelk12/T31122024203920-gemma-2-9B", + "name": "T31122024203920-gemma-2-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7676, + "hfopenllm_v2/BBH": 0.6096, + "hfopenllm_v2/MATH Level 5": 0.2054, + "hfopenllm_v2/GPQA": 0.3507, + "hfopenllm_v2/MUSR": 0.4322, + "hfopenllm_v2/MMLU-PRO": 0.4373 + } + }, + { + "id": "zelk12/Test01012025155054", + "name": "Test01012025155054", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1555, + "hfopenllm_v2/BBH": 0.283, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2416, + "hfopenllm_v2/MUSR": 0.367, + "hfopenllm_v2/MMLU-PRO": 0.109 + } + }, + { + "id": "zelk12/Test01012025155054t0.5_gemma-2", + "name": "Test01012025155054t0.5_gemma-2", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.1555, + "hfopenllm_v2/BBH": 0.283, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2416, + "hfopenllm_v2/MUSR": 0.367, + "hfopenllm_v2/MMLU-PRO": 0.109 + } + }, + { + "id": "zelk12/gemma-2-S2MTM-9B", + "name": "gemma-2-S2MTM-9B", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7823, + "hfopenllm_v2/BBH": 0.6061, + "hfopenllm_v2/MATH Level 5": 0.2047, + "hfopenllm_v2/GPQA": 0.3456, + "hfopenllm_v2/MUSR": 0.4218, + "hfopenllm_v2/MMLU-PRO": 0.4297 + } + }, + { + "id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1", + "name": "recoilme-gemma-2-Ataraxy-9B-v0.1", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7649, + "hfopenllm_v2/BBH": 0.6075, + "hfopenllm_v2/MATH Level 5": 0.2281, + "hfopenllm_v2/GPQA": 0.3498, + "hfopenllm_v2/MUSR": 0.4136, + "hfopenllm_v2/MMLU-PRO": 0.4321 + } + }, + { + "id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25", + "name": "recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7707, + "hfopenllm_v2/BBH": 0.6075, + "hfopenllm_v2/MATH Level 5": 0.2145, + "hfopenllm_v2/GPQA": 0.3431, + "hfopenllm_v2/MUSR": 0.4323, + "hfopenllm_v2/MMLU-PRO": 0.44 + } + }, + { + "id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75", + "name": "recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7208, + "hfopenllm_v2/BBH": 0.5995, + "hfopenllm_v2/MATH Level 5": 0.2017, + "hfopenllm_v2/GPQA": 0.3498, + "hfopenllm_v2/MUSR": 0.3951, + "hfopenllm_v2/MMLU-PRO": 0.4141 + } + }, + { + "id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.2", + "name": "recoilme-gemma-2-Ataraxy-9B-v0.2", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.76, + "hfopenllm_v2/BBH": 0.6066, + "hfopenllm_v2/MATH Level 5": 0.2228, + "hfopenllm_v2/GPQA": 0.3482, + "hfopenllm_v2/MUSR": 0.411, + "hfopenllm_v2/MMLU-PRO": 0.4323 + } + }, + { + "id": "zelk12/recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1", + "name": "recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7615, + "hfopenllm_v2/BBH": 0.6099, + "hfopenllm_v2/MATH Level 5": 0.21, + "hfopenllm_v2/GPQA": 0.3414, + "hfopenllm_v2/MUSR": 0.431, + "hfopenllm_v2/MMLU-PRO": 0.4315 + } + }, + { + "id": "zelk12/recoilme-gemma-2-Ifable-9B-v0.1", + "name": "recoilme-gemma-2-Ifable-9B-v0.1", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7944, + "hfopenllm_v2/BBH": 0.6064, + "hfopenllm_v2/MATH Level 5": 0.2205, + "hfopenllm_v2/GPQA": 0.3515, + "hfopenllm_v2/MUSR": 0.4202, + "hfopenllm_v2/MMLU-PRO": 0.4323 + } + }, + { + "id": "zelk12/recoilme-gemma-2-psy10k-mental_healt-9B-v0.1", + "name": "recoilme-gemma-2-psy10k-mental_healt-9B-v0.1", + "developer": "zelk12", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7445, + "hfopenllm_v2/BBH": 0.5978, + "hfopenllm_v2/MATH Level 5": 0.1888, + "hfopenllm_v2/GPQA": 0.344, + "hfopenllm_v2/MUSR": 0.4295, + "hfopenllm_v2/MMLU-PRO": 0.4181 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/zetasepic.json b/data/developers/zetasepic.json new file mode 100644 index 0000000000000000000000000000000000000000..1086f4e5f9c03e4c2dcb6387043dd4fb20abddf5 --- /dev/null +++ b/data/developers/zetasepic.json @@ -0,0 +1,33 @@ +{ + "developer": "zetasepic", + "models": [ + { + "id": "zetasepic/Qwen2.5-32B-Instruct-abliterated-v2", + "name": "Qwen2.5-32B-Instruct-abliterated-v2", + "developer": "zetasepic", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.8334, + "hfopenllm_v2/BBH": 0.6934, + "hfopenllm_v2/MATH Level 5": 0.5952, + "hfopenllm_v2/GPQA": 0.3674, + "hfopenllm_v2/MUSR": 0.4354, + "hfopenllm_v2/MMLU-PRO": 0.5622 + } + }, + { + "id": "zetasepic/Qwen2.5-72B-Instruct-abliterated", + "name": "Qwen2.5-72B-Instruct-abliterated", + "developer": "zetasepic", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.7153, + "hfopenllm_v2/BBH": 0.7152, + "hfopenllm_v2/MATH Level 5": 0.5242, + "hfopenllm_v2/GPQA": 0.4069, + "hfopenllm_v2/MUSR": 0.4719, + "hfopenllm_v2/MMLU-PRO": 0.5872 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/zhengr.json b/data/developers/zhengr.json new file mode 100644 index 0000000000000000000000000000000000000000..fd148552e2b6e6a9168c94acdb94db08b9ff9f46 --- /dev/null +++ b/data/developers/zhengr.json @@ -0,0 +1,19 @@ +{ + "developer": "zhengr", + "models": [ + { + "id": "zhengr/MixTAO-7Bx2-MoE-v8.1", + "name": "MixTAO-7Bx2-MoE-v8.1", + "developer": "zhengr", + "evaluator_relationship": null, + "benchmark_scores": { + "hfopenllm_v2/IFEval": 0.4188, + "hfopenllm_v2/BBH": 0.4202, + "hfopenllm_v2/MATH Level 5": 0.0604, + "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/MUSR": 0.3976, + "hfopenllm_v2/MMLU-PRO": 0.2847 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/zhipu-ai.json b/data/developers/zhipu-ai.json new file mode 100644 index 0000000000000000000000000000000000000000..b914350b9f9666b3ff0559ce5693c595b07fee2f --- /dev/null +++ b/data/developers/zhipu-ai.json @@ -0,0 +1,28 @@ +{ + "developer": "zhipu-ai", + "models": [ + { + "id": "zhipu-ai/GLM-130B", + "name": "GLM 130B", + "developer": "zhipu-ai", + "evaluator_relationship": null, + "benchmark_scores": { + "helm_classic/Mean win rate": 0.512, + "helm_classic/MMLU": 0.344, + "helm_classic/BoolQ": 0.784, + "helm_classic/NarrativeQA": 0.706, + "helm_classic/NaturalQuestions (open-book)": 0.642, + "helm_classic/QuAC": 0.272, + "helm_classic/HellaSwag": -1.0, + "helm_classic/OpenbookQA": -1.0, + "helm_classic/TruthfulQA": 0.218, + "helm_classic/MS MARCO (TREC)": -1.0, + "helm_classic/CNN/DailyMail": 0.154, + "helm_classic/XSUM": 0.132, + "helm_classic/IMDB": 0.955, + "helm_classic/CivilComments": 0.5, + "helm_classic/RAFT": 0.598 + } + } + ] +} \ No newline at end of file diff --git a/data/developers/zhipu.json b/data/developers/zhipu.json new file mode 100644 index 0000000000000000000000000000000000000000..8c28c39163cadb7b5a5538e3d6a4caf806e7ec63 --- /dev/null +++ b/data/developers/zhipu.json @@ -0,0 +1,23 @@ +{ + "developer": "zhipu", + "models": [ + { + "id": "zhipu/GLM 4.6", + "name": "GLM 4.6", + "developer": "zhipu", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Corporate Lawyer Mean Score": 0.196 + } + }, + { + "id": "zhipu/GLM 4.7", + "name": "GLM 4.7", + "developer": "zhipu", + "evaluator_relationship": null, + "benchmark_scores": { + "apex-agents/Corporate Lawyer Mean Score": 0.147 + } + } + ] +} \ No newline at end of file diff --git a/data/models.json b/data/models.json index 60625ce86164f61dd5c29b6669ddc3da26c7d5b5..8c41767f8aba7274867583a42ae301bdc9ee2ff6 100644 --- a/data/models.json +++ b/data/models.json @@ -1446,12 +1446,12 @@ "developer": "AtAndDev", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.4605, - "hfopenllm_v2/BBH": 0.4258, - "hfopenllm_v2/MATH Level 5": 0.0748, - "hfopenllm_v2/GPQA": 0.2659, - "hfopenllm_v2/MUSR": 0.3636, - "hfopenllm_v2/MMLU-PRO": 0.2812 + "hfopenllm_v2/IFEval": 0.4511, + "hfopenllm_v2/BBH": 0.4275, + "hfopenllm_v2/MATH Level 5": 0.1473, + "hfopenllm_v2/GPQA": 0.2701, + "hfopenllm_v2/MUSR": 0.3623, + "hfopenllm_v2/MMLU-PRO": 0.2806 } }, { @@ -2256,12 +2256,12 @@ "developer": "BoltMonkey", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.459, - "hfopenllm_v2/BBH": 0.5185, - "hfopenllm_v2/MATH Level 5": 0.0937, - "hfopenllm_v2/GPQA": 0.2743, - "hfopenllm_v2/MUSR": 0.4083, - "hfopenllm_v2/MMLU-PRO": 0.3631 + "hfopenllm_v2/IFEval": 0.7999, + "hfopenllm_v2/BBH": 0.5152, + "hfopenllm_v2/MATH Level 5": 0.1193, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.4019, + "hfopenllm_v2/MMLU-PRO": 0.3733 } }, { @@ -2354,17 +2354,17 @@ "developer": "CIR-AMS", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.5736, - "reward-bench/Chat": 0.9749, - "reward-bench/Chat Hard": 0.5724, - "reward-bench/Safety": 0.7178, - "reward-bench/Reasoning": 0.8775, - "reward-bench/Prior Sets (0.5 weight)": 0.7029, + "reward-bench/Score": 0.8172, "reward-bench/Factuality": 0.5347, "reward-bench/Precise IF": 0.3563, "reward-bench/Math": 0.6066, + "reward-bench/Safety": 0.9014, "reward-bench/Focus": 0.5737, - "reward-bench/Ties": 0.6527 + "reward-bench/Ties": 0.6527, + "reward-bench/Chat": 0.9749, + "reward-bench/Chat Hard": 0.5724, + "reward-bench/Reasoning": 0.8775, + "reward-bench/Prior Sets (0.5 weight)": 0.7029 } }, { @@ -2771,12 +2771,12 @@ "developer": "Columbia-NLP", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3278, - "hfopenllm_v2/BBH": 0.392, - "hfopenllm_v2/MATH Level 5": 0.0431, - "hfopenllm_v2/GPQA": 0.2492, - "hfopenllm_v2/MUSR": 0.412, - "hfopenllm_v2/MMLU-PRO": 0.1666 + "hfopenllm_v2/IFEval": 0.3102, + "hfopenllm_v2/BBH": 0.3881, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.4081, + "hfopenllm_v2/MMLU-PRO": 0.1665 } }, { @@ -3935,12 +3935,12 @@ "developer": "Daemontatox", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.4398, - "hfopenllm_v2/BBH": 0.5066, - "hfopenllm_v2/MATH Level 5": 0.1488, + "hfopenllm_v2/IFEval": 0.4383, + "hfopenllm_v2/BBH": 0.5034, + "hfopenllm_v2/MATH Level 5": 0.1443, "hfopenllm_v2/GPQA": 0.3238, - "hfopenllm_v2/MUSR": 0.4079, - "hfopenllm_v2/MMLU-PRO": 0.3804 + "hfopenllm_v2/MUSR": 0.4052, + "hfopenllm_v2/MMLU-PRO": 0.3778 } }, { @@ -4019,12 +4019,12 @@ "developer": "Daemontatox", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5064, - "hfopenllm_v2/BBH": 0.5112, - "hfopenllm_v2/MATH Level 5": 0.1631, - "hfopenllm_v2/GPQA": 0.3163, - "hfopenllm_v2/MUSR": 0.3973, - "hfopenllm_v2/MMLU-PRO": 0.3802 + "hfopenllm_v2/IFEval": 0.777, + "hfopenllm_v2/BBH": 0.5187, + "hfopenllm_v2/MATH Level 5": 0.2198, + "hfopenllm_v2/GPQA": 0.2936, + "hfopenllm_v2/MUSR": 0.3911, + "hfopenllm_v2/MMLU-PRO": 0.3738 } }, { @@ -4131,12 +4131,12 @@ "developer": "Daemontatox", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.4855, - "hfopenllm_v2/BBH": 0.6627, - "hfopenllm_v2/MATH Level 5": 0.4841, - "hfopenllm_v2/GPQA": 0.3096, - "hfopenllm_v2/MUSR": 0.4256, - "hfopenllm_v2/MMLU-PRO": 0.5542 + "hfopenllm_v2/IFEval": 0.3745, + "hfopenllm_v2/BBH": 0.6668, + "hfopenllm_v2/MATH Level 5": 0.4758, + "hfopenllm_v2/GPQA": 0.3943, + "hfopenllm_v2/MUSR": 0.4858, + "hfopenllm_v2/MMLU-PRO": 0.5593 } }, { @@ -4986,12 +4986,12 @@ "developer": "DavieLion", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.1549, - "hfopenllm_v2/BBH": 0.2937, - "hfopenllm_v2/MATH Level 5": 0.006, - "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/IFEval": 0.1507, + "hfopenllm_v2/BBH": 0.293, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2534, "hfopenllm_v2/MUSR": 0.3565, - "hfopenllm_v2/MMLU-PRO": 0.1128 + "hfopenllm_v2/MMLU-PRO": 0.1125 } }, { @@ -5028,12 +5028,12 @@ "developer": "DavieLion", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.1324, - "hfopenllm_v2/BBH": 0.2972, - "hfopenllm_v2/MATH Level 5": 0.0, - "hfopenllm_v2/GPQA": 0.2643, - "hfopenllm_v2/MUSR": 0.3527, - "hfopenllm_v2/MMLU-PRO": 0.1129 + "hfopenllm_v2/IFEval": 0.1336, + "hfopenllm_v2/BBH": 0.2975, + "hfopenllm_v2/MATH Level 5": 0.0068, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.35, + "hfopenllm_v2/MMLU-PRO": 0.1128 } }, { @@ -5686,12 +5686,12 @@ "developer": "DoppelReflEx", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.436, - "hfopenllm_v2/BBH": 0.4956, - "hfopenllm_v2/MATH Level 5": 0.0589, - "hfopenllm_v2/GPQA": 0.3205, - "hfopenllm_v2/MUSR": 0.3843, - "hfopenllm_v2/MMLU-PRO": 0.3237 + "hfopenllm_v2/IFEval": 0.451, + "hfopenllm_v2/BBH": 0.4944, + "hfopenllm_v2/MATH Level 5": 0.1156, + "hfopenllm_v2/GPQA": 0.3196, + "hfopenllm_v2/MUSR": 0.3896, + "hfopenllm_v2/MMLU-PRO": 0.3256 } }, { @@ -14777,12 +14777,12 @@ "developer": "LeroyDyer", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3036, - "hfopenllm_v2/BBH": 0.4575, + "hfopenllm_v2/IFEval": 0.3066, + "hfopenllm_v2/BBH": 0.4577, "hfopenllm_v2/MATH Level 5": 0.0446, - "hfopenllm_v2/GPQA": 0.3012, - "hfopenllm_v2/MUSR": 0.4253, - "hfopenllm_v2/MMLU-PRO": 0.2329 + "hfopenllm_v2/GPQA": 0.2995, + "hfopenllm_v2/MUSR": 0.4254, + "hfopenllm_v2/MMLU-PRO": 0.2318 } }, { @@ -16022,16 +16022,16 @@ "developer": "LxzGordon", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.7394, - "reward-bench/Chat": 0.9553, - "reward-bench/Chat Hard": 0.8816, - "reward-bench/Safety": 0.9178, - "reward-bench/Reasoning": 0.9698, + "reward-bench/Score": 0.9294, "reward-bench/Factuality": 0.6884, "reward-bench/Precise IF": 0.45, "reward-bench/Math": 0.6393, + "reward-bench/Safety": 0.9108, "reward-bench/Focus": 0.9758, - "reward-bench/Ties": 0.7653 + "reward-bench/Ties": 0.7653, + "reward-bench/Chat": 0.9553, + "reward-bench/Chat Hard": 0.8816, + "reward-bench/Reasoning": 0.9698 } }, { @@ -18395,17 +18395,17 @@ "developer": "Nexusflow", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.4553, - "reward-bench/Chat": 0.9693, - "reward-bench/Chat Hard": 0.5724, - "reward-bench/Safety": 0.7556, - "reward-bench/Reasoning": 0.8845, - "reward-bench/Prior Sets (0.5 weight)": 0.7137, + "reward-bench/Score": 0.8133, "reward-bench/Factuality": 0.4589, "reward-bench/Precise IF": 0.3187, "reward-bench/Math": 0.6175, + "reward-bench/Safety": 0.877, "reward-bench/Focus": 0.4808, - "reward-bench/Ties": 0.1004 + "reward-bench/Ties": 0.1004, + "reward-bench/Chat": 0.9693, + "reward-bench/Chat Hard": 0.5724, + "reward-bench/Reasoning": 0.8845, + "reward-bench/Prior Sets (0.5 weight)": 0.7137 } }, { @@ -19477,17 +19477,17 @@ "developer": "OpenAssistant", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.6126, + "reward-bench/Score": 0.32, + "reward-bench/Chat": 0.8939, + "reward-bench/Chat Hard": 0.4518, + "reward-bench/Safety": 0.3667, + "reward-bench/Reasoning": 0.3855, + "reward-bench/Prior Sets (0.5 weight)": 0.5836, "reward-bench/Factuality": 0.3853, "reward-bench/Precise IF": 0.2687, "reward-bench/Math": 0.5027, - "reward-bench/Safety": 0.7338, "reward-bench/Focus": 0.2768, - "reward-bench/Ties": 0.12, - "reward-bench/Chat": 0.8939, - "reward-bench/Chat Hard": 0.4518, - "reward-bench/Reasoning": 0.3855, - "reward-bench/Prior Sets (0.5 weight)": 0.5836 + "reward-bench/Ties": 0.12 } }, { @@ -20164,17 +20164,17 @@ "developer": "PKU-Alignment", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.5957, + "reward-bench/Score": 0.3326, + "reward-bench/Chat": 0.5726, + "reward-bench/Chat Hard": 0.4561, + "reward-bench/Safety": 0.7356, + "reward-bench/Reasoning": 0.6211, + "reward-bench/Prior Sets (0.5 weight)": 0.5397, "reward-bench/Factuality": 0.3789, "reward-bench/Precise IF": 0.275, "reward-bench/Math": 0.3333, - "reward-bench/Safety": 0.7608, "reward-bench/Focus": 0.2828, - "reward-bench/Ties": -0.01, - "reward-bench/Chat": 0.5726, - "reward-bench/Chat Hard": 0.4561, - "reward-bench/Reasoning": 0.6211, - "reward-bench/Prior Sets (0.5 weight)": 0.5397 + "reward-bench/Ties": -0.01 } }, { @@ -20183,17 +20183,17 @@ "developer": "PKU-Alignment", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.6366, + "reward-bench/Score": 0.2544, + "reward-bench/Chat": 0.8994, + "reward-bench/Chat Hard": 0.364, + "reward-bench/Safety": 0.3156, + "reward-bench/Reasoning": 0.6887, + "reward-bench/Prior Sets (0.5 weight)": 0.6171, "reward-bench/Factuality": 0.2168, "reward-bench/Precise IF": 0.2562, "reward-bench/Math": 0.3825, - "reward-bench/Safety": 0.6041, "reward-bench/Focus": 0.2606, - "reward-bench/Ties": 0.0944, - "reward-bench/Chat": 0.8994, - "reward-bench/Chat Hard": 0.364, - "reward-bench/Reasoning": 0.6887, - "reward-bench/Prior Sets (0.5 weight)": 0.6171 + "reward-bench/Ties": 0.0944 } }, { @@ -22085,12 +22085,12 @@ "developer": "Qwen", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3153, - "hfopenllm_v2/BBH": 0.3322, - "hfopenllm_v2/MATH Level 5": 0.1035, - "hfopenllm_v2/GPQA": 0.2592, - "hfopenllm_v2/MUSR": 0.3342, - "hfopenllm_v2/MMLU-PRO": 0.172 + "hfopenllm_v2/IFEval": 0.3071, + "hfopenllm_v2/BBH": 0.3341, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3329, + "hfopenllm_v2/MMLU-PRO": 0.1697 } }, { @@ -22692,16 +22692,16 @@ "developer": "Ray2333", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.5966, - "reward-bench/Chat": 0.9302, - "reward-bench/Chat Hard": 0.7719, - "reward-bench/Safety": 0.9222, - "reward-bench/Reasoning": 0.912, + "reward-bench/Score": 0.8839, "reward-bench/Factuality": 0.5305, "reward-bench/Precise IF": 0.3125, "reward-bench/Math": 0.5902, + "reward-bench/Safety": 0.9216, "reward-bench/Focus": 0.7455, - "reward-bench/Ties": 0.4788 + "reward-bench/Ties": 0.4788, + "reward-bench/Chat": 0.9302, + "reward-bench/Chat Hard": 0.7719, + "reward-bench/Reasoning": 0.912 } }, { @@ -22710,17 +22710,17 @@ "developer": "Ray2333", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.8464, + "reward-bench/Score": 0.589, + "reward-bench/Chat": 0.9832, + "reward-bench/Chat Hard": 0.6842, + "reward-bench/Safety": 0.7222, + "reward-bench/Reasoning": 0.9133, + "reward-bench/Prior Sets (0.5 weight)": 0.7209, "reward-bench/Factuality": 0.5874, "reward-bench/Precise IF": 0.3875, "reward-bench/Math": 0.5902, - "reward-bench/Safety": 0.8676, "reward-bench/Focus": 0.6727, - "reward-bench/Ties": 0.5743, - "reward-bench/Chat": 0.9832, - "reward-bench/Chat Hard": 0.6842, - "reward-bench/Reasoning": 0.9133, - "reward-bench/Prior Sets (0.5 weight)": 0.7209 + "reward-bench/Ties": 0.5743 } }, { @@ -22729,17 +22729,17 @@ "developer": "Ray2333", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.8542, + "reward-bench/Score": 0.6089, + "reward-bench/Chat": 0.986, + "reward-bench/Chat Hard": 0.6776, + "reward-bench/Safety": 0.7867, + "reward-bench/Reasoning": 0.9229, + "reward-bench/Prior Sets (0.5 weight)": 0.7309, "reward-bench/Factuality": 0.6189, "reward-bench/Precise IF": 0.3875, "reward-bench/Math": 0.5792, - "reward-bench/Safety": 0.8919, "reward-bench/Focus": 0.6828, - "reward-bench/Ties": 0.5981, - "reward-bench/Chat": 0.986, - "reward-bench/Chat Hard": 0.6776, - "reward-bench/Reasoning": 0.9229, - "reward-bench/Prior Sets (0.5 weight)": 0.7309 + "reward-bench/Ties": 0.5981 } }, { @@ -24183,12 +24183,12 @@ "developer": "Sao10K", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.7384, - "hfopenllm_v2/BBH": 0.6471, - "hfopenllm_v2/MATH Level 5": 0.2137, + "hfopenllm_v2/IFEval": 0.7281, + "hfopenllm_v2/BBH": 0.6503, + "hfopenllm_v2/MATH Level 5": 0.2243, "hfopenllm_v2/GPQA": 0.3314, - "hfopenllm_v2/MUSR": 0.4209, - "hfopenllm_v2/MMLU-PRO": 0.5104 + "hfopenllm_v2/MUSR": 0.4196, + "hfopenllm_v2/MMLU-PRO": 0.5096 } }, { @@ -25134,16 +25134,16 @@ "hfopenllm_v2/GPQA": 0.344, "hfopenllm_v2/MUSR": 0.4231, "hfopenllm_v2/MMLU-PRO": 0.4103, - "reward-bench/Score": 0.7531, - "reward-bench/Chat": 0.9609, - "reward-bench/Chat Hard": 0.8991, - "reward-bench/Safety": 0.9689, - "reward-bench/Reasoning": 0.9807, + "reward-bench/Score": 0.9426, "reward-bench/Factuality": 0.7674, "reward-bench/Precise IF": 0.375, "reward-bench/Math": 0.6721, + "reward-bench/Safety": 0.9297, "reward-bench/Focus": 0.9172, - "reward-bench/Ties": 0.8182 + "reward-bench/Ties": 0.8182, + "reward-bench/Chat": 0.9609, + "reward-bench/Chat Hard": 0.8991, + "reward-bench/Reasoning": 0.9807 } }, { @@ -25170,16 +25170,16 @@ "developer": "Skywork", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.7175, - "reward-bench/Chat": 0.9469, - "reward-bench/Chat Hard": 0.8838, - "reward-bench/Safety": 0.9422, - "reward-bench/Reasoning": 0.9675, + "reward-bench/Score": 0.9313, "reward-bench/Factuality": 0.6968, "reward-bench/Precise IF": 0.4062, "reward-bench/Math": 0.6011, + "reward-bench/Safety": 0.927, "reward-bench/Focus": 0.9414, - "reward-bench/Ties": 0.7169 + "reward-bench/Ties": 0.7169, + "reward-bench/Chat": 0.9469, + "reward-bench/Chat Hard": 0.8838, + "reward-bench/Reasoning": 0.9675 } }, { @@ -25293,16 +25293,16 @@ "developer": "Skywork", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.6885, - "reward-bench/Chat": 0.8994, - "reward-bench/Chat Hard": 0.875, - "reward-bench/Safety": 0.8911, - "reward-bench/Reasoning": 0.9176, + "reward-bench/Score": 0.9007, "reward-bench/Factuality": 0.6063, "reward-bench/Precise IF": 0.35, "reward-bench/Math": 0.6339, + "reward-bench/Safety": 0.9108, "reward-bench/Focus": 0.8909, - "reward-bench/Ties": 0.7586 + "reward-bench/Ties": 0.7586, + "reward-bench/Chat": 0.8994, + "reward-bench/Chat Hard": 0.875, + "reward-bench/Reasoning": 0.9176 } }, { @@ -28614,12 +28614,12 @@ "developer": "VIRNECT", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5021, - "hfopenllm_v2/BBH": 0.4918, - "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/IFEval": 0.5058, + "hfopenllm_v2/BBH": 0.4908, + "hfopenllm_v2/MATH Level 5": 0.0929, "hfopenllm_v2/GPQA": 0.271, - "hfopenllm_v2/MUSR": 0.3648, - "hfopenllm_v2/MMLU-PRO": 0.3536 + "hfopenllm_v2/MUSR": 0.3662, + "hfopenllm_v2/MMLU-PRO": 0.3539 } }, { @@ -28726,12 +28726,12 @@ "developer": "ValiantLabs", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5483, - "hfopenllm_v2/BBH": 0.461, - "hfopenllm_v2/MATH Level 5": 0.0582, - "hfopenllm_v2/GPQA": 0.2886, - "hfopenllm_v2/MUSR": 0.3433, - "hfopenllm_v2/MMLU-PRO": 0.2407 + "hfopenllm_v2/IFEval": 0.5328, + "hfopenllm_v2/BBH": 0.4613, + "hfopenllm_v2/MATH Level 5": 0.0876, + "hfopenllm_v2/GPQA": 0.2894, + "hfopenllm_v2/MUSR": 0.3367, + "hfopenllm_v2/MMLU-PRO": 0.2424 } }, { @@ -30349,12 +30349,12 @@ "developer": "adriszmar", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.1685, - "hfopenllm_v2/BBH": 0.3124, - "hfopenllm_v2/MATH Level 5": 0.0015, - "hfopenllm_v2/GPQA": 0.2492, - "hfopenllm_v2/MUSR": 0.3963, - "hfopenllm_v2/MMLU-PRO": 0.1066 + "hfopenllm_v2/IFEval": 0.1746, + "hfopenllm_v2/BBH": 0.3126, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.245, + "hfopenllm_v2/MUSR": 0.4096, + "hfopenllm_v2/MMLU-PRO": 0.1087 } }, { @@ -30553,10 +30553,10 @@ "developer": "ai2", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.7004, - "reward-bench/Chat": 0.9413, - "reward-bench/Chat Hard": 0.3882, - "reward-bench/Safety": 0.7716 + "reward-bench/Score": 0.6895, + "reward-bench/Chat": 0.9385, + "reward-bench/Chat Hard": 0.3706, + "reward-bench/Safety": 0.7595 } }, { @@ -31100,12 +31100,12 @@ "developer": "akjindal53244", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.8033, - "hfopenllm_v2/BBH": 0.5196, - "hfopenllm_v2/MATH Level 5": 0.1624, - "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/IFEval": 0.8051, + "hfopenllm_v2/BBH": 0.5189, + "hfopenllm_v2/MATH Level 5": 0.1722, + "hfopenllm_v2/GPQA": 0.3263, "hfopenllm_v2/MUSR": 0.4028, - "hfopenllm_v2/MMLU-PRO": 0.3812 + "hfopenllm_v2/MMLU-PRO": 0.3803 } }, { @@ -31197,7 +31197,7 @@ "developer": "Alibaba", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 25.4 + "terminal-bench-2.0/terminal-bench-2.0": 23.9 } }, { @@ -31288,17 +31288,17 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.7606, - "reward-bench/Chat": 0.9665, - "reward-bench/Chat Hard": 0.8355, - "reward-bench/Safety": 0.8844, - "reward-bench/Reasoning": 0.8969, - "reward-bench/Prior Sets (0.5 weight)": 0.0, + "reward-bench/Score": 0.9021, "reward-bench/Factuality": 0.8126, "reward-bench/Precise IF": 0.4188, "reward-bench/Math": 0.6995, + "reward-bench/Safety": 0.9095, "reward-bench/Focus": 0.8646, - "reward-bench/Ties": 0.8835 + "reward-bench/Ties": 0.8835, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.8355, + "reward-bench/Reasoning": 0.8969, + "reward-bench/Prior Sets (0.5 weight)": 0.0 } }, { @@ -31307,17 +31307,17 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.8463, + "reward-bench/Score": 0.649, + "reward-bench/Chat": 0.933, + "reward-bench/Chat Hard": 0.7785, + "reward-bench/Safety": 0.8267, + "reward-bench/Reasoning": 0.7886, + "reward-bench/Prior Sets (0.5 weight)": 0.0, "reward-bench/Factuality": 0.72, "reward-bench/Precise IF": 0.3625, "reward-bench/Math": 0.612, - "reward-bench/Safety": 0.8851, "reward-bench/Focus": 0.8323, - "reward-bench/Ties": 0.5406, - "reward-bench/Chat": 0.933, - "reward-bench/Chat Hard": 0.7785, - "reward-bench/Reasoning": 0.7886, - "reward-bench/Prior Sets (0.5 weight)": 0.0 + "reward-bench/Ties": 0.5406 } }, { @@ -31406,12 +31406,12 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.8267, - "hfopenllm_v2/BBH": 0.405, - "hfopenllm_v2/MATH Level 5": 0.1964, - "hfopenllm_v2/GPQA": 0.2987, + "hfopenllm_v2/IFEval": 0.8255, + "hfopenllm_v2/BBH": 0.4061, + "hfopenllm_v2/MATH Level 5": 0.2115, + "hfopenllm_v2/GPQA": 0.297, "hfopenllm_v2/MUSR": 0.4175, - "hfopenllm_v2/MMLU-PRO": 0.2827 + "hfopenllm_v2/MMLU-PRO": 0.2821 } }, { @@ -31434,17 +31434,17 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.8431, + "reward-bench/Score": 0.687, + "reward-bench/Chat": 0.9553, + "reward-bench/Chat Hard": 0.761, + "reward-bench/Safety": 0.86, + "reward-bench/Reasoning": 0.7898, + "reward-bench/Prior Sets (0.5 weight)": 0.0, "reward-bench/Factuality": 0.7516, "reward-bench/Precise IF": 0.3875, "reward-bench/Math": 0.6284, - "reward-bench/Safety": 0.8662, "reward-bench/Focus": 0.8545, - "reward-bench/Ties": 0.6397, - "reward-bench/Chat": 0.9553, - "reward-bench/Chat Hard": 0.761, - "reward-bench/Reasoning": 0.7898, - "reward-bench/Prior Sets (0.5 weight)": 0.0 + "reward-bench/Ties": 0.6397 } }, { @@ -31507,17 +31507,17 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.6821, - "reward-bench/Chat": 0.9497, - "reward-bench/Chat Hard": 0.7917, - "reward-bench/Safety": 0.8978, - "reward-bench/Reasoning": 0.8005, - "reward-bench/Prior Sets (0.5 weight)": 0.0, + "reward-bench/Score": 0.8551, "reward-bench/Factuality": 0.7326, "reward-bench/Precise IF": 0.3875, "reward-bench/Math": 0.5792, + "reward-bench/Safety": 0.8784, "reward-bench/Focus": 0.8889, - "reward-bench/Ties": 0.6063 + "reward-bench/Ties": 0.6063, + "reward-bench/Chat": 0.9497, + "reward-bench/Chat Hard": 0.7917, + "reward-bench/Reasoning": 0.8005, + "reward-bench/Prior Sets (0.5 weight)": 0.0 } }, { @@ -36202,7 +36202,7 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 27.5 + "terminal-bench-2.0/terminal-bench-2.0": 35.5 } }, { @@ -36328,11 +36328,11 @@ "evaluator_relationship": null, "benchmark_scores": { "appworld_test_normal/appworld/test_normal": 0.66, - "browsecompplus/browsecompplus": 0.61, - "swe-bench/swe-bench": 0.7423, + "browsecompplus/browsecompplus": 0.49, + "swe-bench/swe-bench": 0.65, "tau-bench-2_airline/tau-bench-2/airline": 0.66, - "tau-bench-2_retail/tau-bench-2/retail": 0.78, - "tau-bench-2_telecom/tau-bench-2/telecom": 0.76 + "tau-bench-2_retail/tau-bench-2/retail": 0.85, + "tau-bench-2_telecom/tau-bench-2/telecom": 0.58 } }, { @@ -36341,7 +36341,7 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 35.1 + "terminal-bench-2.0/terminal-bench-2.0": 38.0 } }, { @@ -36350,7 +36350,7 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 58.4 + "terminal-bench-2.0/terminal-bench-2.0": 54.3 } }, { @@ -36359,7 +36359,7 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 71.9 + "terminal-bench-2.0/terminal-bench-2.0": 69.9 } }, { @@ -39809,12 +39809,12 @@ "developer": "cognitivecomputations", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3613, - "hfopenllm_v2/BBH": 0.6123, - "hfopenllm_v2/MATH Level 5": 0.1239, - "hfopenllm_v2/GPQA": 0.328, - "hfopenllm_v2/MUSR": 0.4112, - "hfopenllm_v2/MMLU-PRO": 0.4494 + "hfopenllm_v2/IFEval": 0.4124, + "hfopenllm_v2/BBH": 0.6383, + "hfopenllm_v2/MATH Level 5": 0.182, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4349, + "hfopenllm_v2/MMLU-PRO": 0.4525 } }, { @@ -41039,12 +41039,12 @@ "developer": "dfurman", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3, - "hfopenllm_v2/BBH": 0.3853, - "hfopenllm_v2/MATH Level 5": 0.0415, - "hfopenllm_v2/GPQA": 0.2617, - "hfopenllm_v2/MUSR": 0.3579, - "hfopenllm_v2/MMLU-PRO": 0.2281 + "hfopenllm_v2/IFEval": 0.2835, + "hfopenllm_v2/BBH": 0.3842, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.2609, + "hfopenllm_v2/MUSR": 0.3566, + "hfopenllm_v2/MMLU-PRO": 0.2298 } }, { @@ -42553,12 +42553,12 @@ "developer": "fblgit", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5181, - "hfopenllm_v2/BBH": 0.7033, - "hfopenllm_v2/MATH Level 5": 0.4947, - "hfopenllm_v2/GPQA": 0.3826, - "hfopenllm_v2/MUSR": 0.5008, - "hfopenllm_v2/MMLU-PRO": 0.5915 + "hfopenllm_v2/IFEval": 0.4503, + "hfopenllm_v2/BBH": 0.7035, + "hfopenllm_v2/MATH Level 5": 0.3943, + "hfopenllm_v2/GPQA": 0.401, + "hfopenllm_v2/MUSR": 0.5021, + "hfopenllm_v2/MMLU-PRO": 0.5911 } }, { @@ -43705,7 +43705,6 @@ "developer": "google", "evaluator_relationship": null, "benchmark_scores": { - "ace/Gaming Score": 0.415, "apex-agents/Overall Pass@1": 0.24, "apex-agents/Overall Pass@8": 0.367, "apex-agents/Overall Mean Score": 0.395, @@ -43713,6 +43712,7 @@ "apex-agents/Management Consulting Pass@1": 0.193, "apex-agents/Corporate Law Pass@1": 0.259, "apex-agents/Corporate Lawyer Mean Score": 0.524, + "ace/Gaming Score": 0.415, "apex-v1/Overall Score": 0.64, "apex-v1/Consulting Score": 0.64 } @@ -44469,7 +44469,7 @@ "reward-bench/Safety": 0.909, "reward-bench/Focus": 0.841, "reward-bench/Ties": 0.809, - "terminal-bench-2.0/terminal-bench-2.0": 17.1 + "terminal-bench-2.0/terminal-bench-2.0": 15.4 } }, { @@ -44607,7 +44607,7 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 64.3 + "terminal-bench-2.0/terminal-bench-2.0": 47.4 } }, { @@ -44616,7 +44616,7 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 56.9 + "terminal-bench-2.0/terminal-bench-2.0": 62.2 } }, { @@ -44625,7 +44625,7 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { - "appworld_test_normal/appworld/test_normal": 0.582, + "appworld_test_normal/appworld/test_normal": 0.505, "browsecompplus/browsecompplus": 0.48, "global-mmlu-lite/Global MMLU Lite": 0.9453, "global-mmlu-lite/Culturally Sensitive": 0.9397, @@ -44646,10 +44646,10 @@ "global-mmlu-lite/Yoruba": 0.9425, "global-mmlu-lite/Chinese": 0.9475, "global-mmlu-lite/Burmese": 0.9425, - "swe-bench/swe-bench": 0.67, + "swe-bench/swe-bench": 0.7234, "tau-bench-2_airline/tau-bench-2/airline": 0.68, - "tau-bench-2_retail/tau-bench-2/retail": 0.82, - "tau-bench-2_telecom/tau-bench-2/telecom": 0.8876 + "tau-bench-2_retail/tau-bench-2/retail": 0.7805, + "tau-bench-2_telecom/tau-bench-2/telecom": 0.73 } }, { @@ -44658,7 +44658,7 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 74.8 + "terminal-bench-2.0/terminal-bench-2.0": 78.4 } }, { @@ -44774,12 +44774,12 @@ "developer": "google", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.2018, - "hfopenllm_v2/BBH": 0.3709, - "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/IFEval": 0.1993, + "hfopenllm_v2/BBH": 0.3656, + "hfopenllm_v2/MATH Level 5": 0.0287, "hfopenllm_v2/GPQA": 0.2626, - "hfopenllm_v2/MUSR": 0.4219, - "hfopenllm_v2/MMLU-PRO": 0.2217 + "hfopenllm_v2/MUSR": 0.4232, + "hfopenllm_v2/MMLU-PRO": 0.218 } }, { @@ -44802,12 +44802,12 @@ "developer": "google", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5078, - "hfopenllm_v2/BBH": 0.4226, - "hfopenllm_v2/MATH Level 5": 0.0347, - "hfopenllm_v2/GPQA": 0.2852, - "hfopenllm_v2/MUSR": 0.3964, - "hfopenllm_v2/MMLU-PRO": 0.2578 + "hfopenllm_v2/IFEval": 0.5288, + "hfopenllm_v2/BBH": 0.4178, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3728, + "hfopenllm_v2/MMLU-PRO": 0.2467 } }, { @@ -45868,17 +45868,17 @@ "developer": "hendrydong", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.5851, - "reward-bench/Chat": 0.9832, - "reward-bench/Chat Hard": 0.5789, - "reward-bench/Safety": 0.6956, - "reward-bench/Reasoning": 0.7434, - "reward-bench/Prior Sets (0.5 weight)": 0.7508, + "reward-bench/Score": 0.7847, "reward-bench/Factuality": 0.5779, "reward-bench/Precise IF": 0.3625, "reward-bench/Math": 0.6011, + "reward-bench/Safety": 0.85, "reward-bench/Focus": 0.6747, - "reward-bench/Ties": 0.5988 + "reward-bench/Ties": 0.5988, + "reward-bench/Chat": 0.9832, + "reward-bench/Chat Hard": 0.5789, + "reward-bench/Reasoning": 0.7434, + "reward-bench/Prior Sets (0.5 weight)": 0.7508 } }, { @@ -48000,16 +48000,16 @@ "developer": "infly", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.7648, - "reward-bench/Chat": 0.9665, - "reward-bench/Chat Hard": 0.9101, - "reward-bench/Safety": 0.9644, - "reward-bench/Reasoning": 0.9912, + "reward-bench/Score": 0.9511, "reward-bench/Factuality": 0.7411, "reward-bench/Precise IF": 0.4188, "reward-bench/Math": 0.6995, + "reward-bench/Safety": 0.9365, "reward-bench/Focus": 0.903, - "reward-bench/Ties": 0.8622 + "reward-bench/Ties": 0.8622, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.9101, + "reward-bench/Reasoning": 0.9912 } }, { @@ -55218,12 +55218,12 @@ "developer": "meta-llama", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.7408, - "hfopenllm_v2/BBH": 0.4989, - "hfopenllm_v2/MATH Level 5": 0.0869, - "hfopenllm_v2/GPQA": 0.2592, - "hfopenllm_v2/MUSR": 0.3568, - "hfopenllm_v2/MMLU-PRO": 0.3664, + "hfopenllm_v2/IFEval": 0.4782, + "hfopenllm_v2/BBH": 0.491, + "hfopenllm_v2/MATH Level 5": 0.0914, + "hfopenllm_v2/GPQA": 0.2928, + "hfopenllm_v2/MUSR": 0.3805, + "hfopenllm_v2/MMLU-PRO": 0.3591, "reward-bench/Score": 0.645, "reward-bench/Chat": 0.8547, "reward-bench/Chat Hard": 0.4156, @@ -56603,12 +56603,12 @@ "developer": "microsoft", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.0488, - "hfopenllm_v2/BBH": 0.6703, - "hfopenllm_v2/MATH Level 5": 0.2787, - "hfopenllm_v2/GPQA": 0.401, + "hfopenllm_v2/IFEval": 0.0585, + "hfopenllm_v2/BBH": 0.6691, + "hfopenllm_v2/MATH Level 5": 0.3165, + "hfopenllm_v2/GPQA": 0.406, "hfopenllm_v2/MUSR": 0.5034, - "hfopenllm_v2/MMLU-PRO": 0.5295 + "hfopenllm_v2/MMLU-PRO": 0.5287 } }, { @@ -56729,12 +56729,12 @@ "developer": "migtissera", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.4345, - "hfopenllm_v2/BBH": 0.5686, - "hfopenllm_v2/MATH Level 5": 0.0838, - "hfopenllm_v2/GPQA": 0.3003, - "hfopenllm_v2/MUSR": 0.4045, - "hfopenllm_v2/MMLU-PRO": 0.334 + "hfopenllm_v2/IFEval": 0.443, + "hfopenllm_v2/BBH": 0.5706, + "hfopenllm_v2/MATH Level 5": 0.0869, + "hfopenllm_v2/GPQA": 0.3079, + "hfopenllm_v2/MUSR": 0.4031, + "hfopenllm_v2/MMLU-PRO": 0.3354 } }, { @@ -57017,12 +57017,12 @@ "developer": "mistralai", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.667, - "hfopenllm_v2/BBH": 0.5213, - "hfopenllm_v2/MATH Level 5": 0.1435, - "hfopenllm_v2/GPQA": 0.3238, - "hfopenllm_v2/MUSR": 0.3632, - "hfopenllm_v2/MMLU-PRO": 0.396 + "hfopenllm_v2/IFEval": 0.6283, + "hfopenllm_v2/BBH": 0.583, + "hfopenllm_v2/MATH Level 5": 0.2039, + "hfopenllm_v2/GPQA": 0.3331, + "hfopenllm_v2/MUSR": 0.4063, + "hfopenllm_v2/MMLU-PRO": 0.4099 } }, { @@ -58094,7 +58094,7 @@ "developer": "Moonshot AI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 27.8 + "terminal-bench-2.0/terminal-bench-2.0": 26.7 } }, { @@ -58345,7 +58345,7 @@ "developer": "Multiple", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 61.2 + "terminal-bench-2.0/terminal-bench-2.0": 50.1 } }, { @@ -60229,16 +60229,16 @@ "developer": "nicolinho", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.7667, - "reward-bench/Chat": 0.9665, - "reward-bench/Chat Hard": 0.9013, - "reward-bench/Safety": 0.9578, - "reward-bench/Reasoning": 0.9826, + "reward-bench/Score": 0.9444, "reward-bench/Factuality": 0.7853, "reward-bench/Precise IF": 0.3719, "reward-bench/Math": 0.6995, + "reward-bench/Safety": 0.927, "reward-bench/Focus": 0.9535, - "reward-bench/Ties": 0.8321 + "reward-bench/Ties": 0.8321, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.9013, + "reward-bench/Reasoning": 0.9826 } }, { @@ -61436,12 +61436,12 @@ "developer": "oopere", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.2119, - "hfopenllm_v2/BBH": 0.3156, - "hfopenllm_v2/MATH Level 5": 0.0181, - "hfopenllm_v2/GPQA": 0.2567, + "hfopenllm_v2/IFEval": 0.2164, + "hfopenllm_v2/BBH": 0.3169, + "hfopenllm_v2/MATH Level 5": 0.0128, + "hfopenllm_v2/GPQA": 0.2584, "hfopenllm_v2/MUSR": 0.3832, - "hfopenllm_v2/MMLU-PRO": 0.113 + "hfopenllm_v2/MMLU-PRO": 0.1134 } }, { @@ -61752,16 +61752,16 @@ "developer": "openai", "evaluator_relationship": null, "benchmark_scores": { + "ace/Overall Score": 0.515, + "ace/Food Score": 0.65, + "ace/Gaming Score": 0.578, "apex-agents/Overall Pass@1": 0.23, "apex-agents/Overall Pass@8": 0.4, "apex-agents/Overall Mean Score": 0.387, "apex-agents/Investment Banking Pass@1": 0.273, "apex-agents/Management Consulting Pass@1": 0.227, "apex-agents/Corporate Law Pass@1": 0.189, - "apex-agents/Corporate Lawyer Mean Score": 0.443, - "ace/Overall Score": 0.515, - "ace/Food Score": 0.65, - "ace/Gaming Score": 0.578 + "apex-agents/Corporate Lawyer Mean Score": 0.443 } }, { @@ -62499,16 +62499,16 @@ "helm_mmlu/Virology": 0.578, "helm_mmlu/World Religions": 0.883, "helm_mmlu/Mean win rate": 0.52, - "reward-bench/Score": 0.6493, - "reward-bench/Chat": 0.9609, - "reward-bench/Chat Hard": 0.761, - "reward-bench/Safety": 0.8619, - "reward-bench/Reasoning": 0.8661, + "reward-bench/Score": 0.8673, "reward-bench/Factuality": 0.5684, "reward-bench/Precise IF": 0.3312, "reward-bench/Math": 0.623, + "reward-bench/Safety": 0.8811, "reward-bench/Focus": 0.7293, - "reward-bench/Ties": 0.7819 + "reward-bench/Ties": 0.7819, + "reward-bench/Chat": 0.9609, + "reward-bench/Chat Hard": 0.761, + "reward-bench/Reasoning": 0.8661 } }, { @@ -62649,7 +62649,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 44.3 + "terminal-bench-2.0/terminal-bench-2.0": 41.3 } }, { @@ -62658,7 +62658,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 24.0 + "terminal-bench-2.0/terminal-bench-2.0": 31.9 } }, { @@ -62681,7 +62681,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 11.5 + "terminal-bench-2.0/terminal-bench-2.0": 7.0 } }, { @@ -62749,15 +62749,15 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "appworld_test_normal/appworld/test_normal": 0.0, - "browsecompplus/browsecompplus": 0.48, + "appworld_test_normal/appworld/test_normal": 0.071, + "browsecompplus/browsecompplus": 0.46, "livecodebenchpro/Hard Problems": 0.1594, "livecodebenchpro/Medium Problems": 0.5211, "livecodebenchpro/Easy Problems": 0.9014, - "swe-bench/swe-bench": 0.57, - "tau-bench-2_airline/tau-bench-2/airline": 0.48, - "tau-bench-2_retail/tau-bench-2/retail": 0.51, - "tau-bench-2_telecom/tau-bench-2/telecom": 0.55 + "swe-bench/swe-bench": 0.5253, + "tau-bench-2_airline/tau-bench-2/airline": 0.54, + "tau-bench-2_retail/tau-bench-2/retail": 0.73, + "tau-bench-2_telecom/tau-bench-2/telecom": 0.5354 } }, { @@ -62914,9 +62914,9 @@ "helm_capabilities/IFEval": 0.929, "helm_capabilities/WildBench": 0.854, "helm_capabilities/Omni-MATH": 0.72, - "livecodebenchpro/Hard Problems": 0.014084507042253521, - "livecodebenchpro/Medium Problems": 0.30985915492957744, - "livecodebenchpro/Easy Problems": 0.8873239436619719 + "livecodebenchpro/Hard Problems": 0.0143, + "livecodebenchpro/Medium Problems": 0.2923, + "livecodebenchpro/Easy Problems": 0.8571 } }, { @@ -63121,17 +63121,17 @@ "developer": "openbmb", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.4683, - "reward-bench/Chat": 0.9637, - "reward-bench/Chat Hard": 0.5548, - "reward-bench/Safety": 0.5089, - "reward-bench/Reasoning": 0.6244, - "reward-bench/Prior Sets (0.5 weight)": 0.7294, + "reward-bench/Score": 0.6903, "reward-bench/Factuality": 0.5063, "reward-bench/Precise IF": 0.3312, "reward-bench/Math": 0.5519, + "reward-bench/Safety": 0.5986, "reward-bench/Focus": 0.6081, - "reward-bench/Ties": 0.3036 + "reward-bench/Ties": 0.3036, + "reward-bench/Chat": 0.9637, + "reward-bench/Chat Hard": 0.5548, + "reward-bench/Reasoning": 0.6244, + "reward-bench/Prior Sets (0.5 weight)": 0.7294 } }, { @@ -64704,12 +64704,12 @@ "developer": "prithivMLmods", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.6052, - "hfopenllm_v2/BBH": 0.6317, - "hfopenllm_v2/MATH Level 5": 0.4789, - "hfopenllm_v2/GPQA": 0.3742, - "hfopenllm_v2/MUSR": 0.486, - "hfopenllm_v2/MMLU-PRO": 0.5302 + "hfopenllm_v2/IFEval": 0.6064, + "hfopenllm_v2/BBH": 0.6296, + "hfopenllm_v2/MATH Level 5": 0.3708, + "hfopenllm_v2/GPQA": 0.3733, + "hfopenllm_v2/MUSR": 0.4873, + "hfopenllm_v2/MMLU-PRO": 0.5307 } }, { @@ -67302,12 +67302,12 @@ "developer": "riaz", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.4373, - "hfopenllm_v2/BBH": 0.4586, - "hfopenllm_v2/MATH Level 5": 0.0514, - "hfopenllm_v2/GPQA": 0.2752, - "hfopenllm_v2/MUSR": 0.3763, - "hfopenllm_v2/MMLU-PRO": 0.2964 + "hfopenllm_v2/IFEval": 0.4137, + "hfopenllm_v2/BBH": 0.4565, + "hfopenllm_v2/MATH Level 5": 0.0453, + "hfopenllm_v2/GPQA": 0.276, + "hfopenllm_v2/MUSR": 0.3776, + "hfopenllm_v2/MMLU-PRO": 0.2978 } }, { @@ -68254,17 +68254,17 @@ "developer": "sfairXC", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.6292, - "reward-bench/Chat": 0.9944, - "reward-bench/Chat Hard": 0.6513, - "reward-bench/Safety": 0.7667, - "reward-bench/Reasoning": 0.8644, - "reward-bench/Prior Sets (0.5 weight)": 0.7492, + "reward-bench/Score": 0.8338, "reward-bench/Factuality": 0.5916, "reward-bench/Precise IF": 0.4188, "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.8676, "reward-bench/Focus": 0.7051, - "reward-bench/Ties": 0.6647 + "reward-bench/Ties": 0.6647, + "reward-bench/Chat": 0.9944, + "reward-bench/Chat Hard": 0.6513, + "reward-bench/Reasoning": 0.8644, + "reward-bench/Prior Sets (0.5 weight)": 0.7492 } }, { @@ -70566,12 +70566,12 @@ "developer": "tanliboy", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.1829, - "hfopenllm_v2/BBH": 0.5488, - "hfopenllm_v2/MATH Level 5": 0.0, - "hfopenllm_v2/GPQA": 0.3104, - "hfopenllm_v2/MUSR": 0.4056, - "hfopenllm_v2/MMLU-PRO": 0.3805 + "hfopenllm_v2/IFEval": 0.4501, + "hfopenllm_v2/BBH": 0.5472, + "hfopenllm_v2/MATH Level 5": 0.0944, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4017, + "hfopenllm_v2/MMLU-PRO": 0.3792 } }, { @@ -73231,17 +73231,17 @@ "developer": "weqweasdas", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.596, - "reward-bench/Chat": 0.9665, - "reward-bench/Chat Hard": 0.6053, - "reward-bench/Safety": 0.6911, - "reward-bench/Reasoning": 0.7736, - "reward-bench/Prior Sets (0.5 weight)": 0.753, + "reward-bench/Score": 0.7982, "reward-bench/Factuality": 0.5937, "reward-bench/Precise IF": 0.3438, "reward-bench/Math": 0.5956, + "reward-bench/Safety": 0.8703, "reward-bench/Focus": 0.7293, - "reward-bench/Ties": 0.6226 + "reward-bench/Ties": 0.6226, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.6053, + "reward-bench/Reasoning": 0.7736, + "reward-bench/Prior Sets (0.5 weight)": 0.753 } }, { @@ -73250,17 +73250,17 @@ "developer": "weqweasdas", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.2498, - "reward-bench/Chat": 0.8184, - "reward-bench/Chat Hard": 0.3728, - "reward-bench/Safety": 0.24, - "reward-bench/Reasoning": 0.3281, - "reward-bench/Prior Sets (0.5 weight)": 0.6564, + "reward-bench/Score": 0.5027, "reward-bench/Factuality": 0.3642, "reward-bench/Precise IF": 0.275, "reward-bench/Math": 0.3497, + "reward-bench/Safety": 0.4149, "reward-bench/Focus": 0.2384, - "reward-bench/Ties": 0.0315 + "reward-bench/Ties": 0.0315, + "reward-bench/Chat": 0.8184, + "reward-bench/Chat Hard": 0.3728, + "reward-bench/Reasoning": 0.3281, + "reward-bench/Prior Sets (0.5 weight)": 0.6564 } }, { @@ -73733,7 +73733,7 @@ "developer": "xAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 25.4 + "terminal-bench-2.0/terminal-bench-2.0": 23.1 } }, { @@ -74213,12 +74213,12 @@ "developer": "ycros", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.6262, - "hfopenllm_v2/BBH": 0.5142, - "hfopenllm_v2/MATH Level 5": 0.0937, - "hfopenllm_v2/GPQA": 0.3079, - "hfopenllm_v2/MUSR": 0.4138, - "hfopenllm_v2/MMLU-PRO": 0.3481 + "hfopenllm_v2/IFEval": 0.5994, + "hfopenllm_v2/BBH": 0.5159, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4203, + "hfopenllm_v2/MMLU-PRO": 0.3473 } }, { diff --git a/data/0-hero_Matter-0.1-7B-DPO-preview.json b/data/models/0-hero_Matter-0.1-7B-DPO-preview.json similarity index 100% rename from data/0-hero_Matter-0.1-7B-DPO-preview.json rename to data/models/0-hero_Matter-0.1-7B-DPO-preview.json diff --git a/data/0-hero_Matter-0.1-7B-boost-DPO-preview.json b/data/models/0-hero_Matter-0.1-7B-boost-DPO-preview.json similarity index 100% rename from data/0-hero_Matter-0.1-7B-boost-DPO-preview.json rename to data/models/0-hero_Matter-0.1-7B-boost-DPO-preview.json diff --git a/data/0-hero_Matter-0.2-7B-DPO.json b/data/models/0-hero_Matter-0.2-7B-DPO.json similarity index 100% rename from data/0-hero_Matter-0.2-7B-DPO.json rename to data/models/0-hero_Matter-0.2-7B-DPO.json diff --git a/data/01-ai_Yi-1.5-34B-32K.json b/data/models/01-ai_Yi-1.5-34B-32K.json similarity index 100% rename from data/01-ai_Yi-1.5-34B-32K.json rename to data/models/01-ai_Yi-1.5-34B-32K.json diff --git a/data/01-ai_Yi-1.5-34B-Chat-16K.json b/data/models/01-ai_Yi-1.5-34B-Chat-16K.json similarity index 100% rename from data/01-ai_Yi-1.5-34B-Chat-16K.json rename to data/models/01-ai_Yi-1.5-34B-Chat-16K.json diff --git a/data/01-ai_Yi-1.5-34B-Chat.json b/data/models/01-ai_Yi-1.5-34B-Chat.json similarity index 100% rename from data/01-ai_Yi-1.5-34B-Chat.json rename to data/models/01-ai_Yi-1.5-34B-Chat.json diff --git a/data/01-ai_Yi-1.5-34B.json b/data/models/01-ai_Yi-1.5-34B.json similarity index 100% rename from data/01-ai_Yi-1.5-34B.json rename to data/models/01-ai_Yi-1.5-34B.json diff --git a/data/01-ai_Yi-1.5-6B-Chat.json b/data/models/01-ai_Yi-1.5-6B-Chat.json similarity index 100% rename from data/01-ai_Yi-1.5-6B-Chat.json rename to data/models/01-ai_Yi-1.5-6B-Chat.json diff --git a/data/01-ai_Yi-1.5-6B.json b/data/models/01-ai_Yi-1.5-6B.json similarity index 100% rename from data/01-ai_Yi-1.5-6B.json rename to data/models/01-ai_Yi-1.5-6B.json diff --git a/data/01-ai_Yi-1.5-9B-32K.json b/data/models/01-ai_Yi-1.5-9B-32K.json similarity index 100% rename from data/01-ai_Yi-1.5-9B-32K.json rename to data/models/01-ai_Yi-1.5-9B-32K.json diff --git a/data/01-ai_Yi-1.5-9B-Chat-16K.json b/data/models/01-ai_Yi-1.5-9B-Chat-16K.json similarity index 100% rename from data/01-ai_Yi-1.5-9B-Chat-16K.json rename to data/models/01-ai_Yi-1.5-9B-Chat-16K.json diff --git a/data/01-ai_Yi-1.5-9B-Chat.json b/data/models/01-ai_Yi-1.5-9B-Chat.json similarity index 100% rename from data/01-ai_Yi-1.5-9B-Chat.json rename to data/models/01-ai_Yi-1.5-9B-Chat.json diff --git a/data/01-ai_Yi-1.5-9B.json b/data/models/01-ai_Yi-1.5-9B.json similarity index 100% rename from data/01-ai_Yi-1.5-9B.json rename to data/models/01-ai_Yi-1.5-9B.json diff --git a/data/01-ai_Yi-34B-200K.json b/data/models/01-ai_Yi-34B-200K.json similarity index 100% rename from data/01-ai_Yi-34B-200K.json rename to data/models/01-ai_Yi-34B-200K.json diff --git a/data/01-ai_Yi-34B-Chat.json b/data/models/01-ai_Yi-34B-Chat.json similarity index 100% rename from data/01-ai_Yi-34B-Chat.json rename to data/models/01-ai_Yi-34B-Chat.json diff --git a/data/models/01-ai_Yi-34B.json b/data/models/01-ai_Yi-34B.json new file mode 100644 index 0000000000000000000000000000000000000000..4969884c4f48354c5b3cbbe63551163a45a0bdd9 --- /dev/null +++ b/data/models/01-ai_Yi-34B.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "Yi-34B", + "id": "01-ai/Yi-34B", + "developer": "01-ai", + "inference_platform": "unknown", + "additional_details": { + "precision": "bfloat16", + "architecture": "LlamaForCausalLM", + "params_billions": "34.389" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/01-ai_Yi-34B/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3046 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5457 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.0514 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3666 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4119 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4412 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/01-ai_Yi-6B-200K.json b/data/models/01-ai_Yi-6B-200K.json similarity index 100% rename from data/01-ai_Yi-6B-200K.json rename to data/models/01-ai_Yi-6B-200K.json diff --git a/data/01-ai_Yi-6B-Chat.json b/data/models/01-ai_Yi-6B-Chat.json similarity index 100% rename from data/01-ai_Yi-6B-Chat.json rename to data/models/01-ai_Yi-6B-Chat.json diff --git a/data/models/01-ai_Yi-6B.json b/data/models/01-ai_Yi-6B.json new file mode 100644 index 0000000000000000000000000000000000000000..3d1bcb59d7cc0c8bff3cd79db62fc3b1dbc05ff8 --- /dev/null +++ b/data/models/01-ai_Yi-6B.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "Yi-6B", + "id": "01-ai/Yi-6B", + "developer": "01-ai", + "inference_platform": "unknown", + "additional_details": { + "precision": "bfloat16", + "architecture": "LlamaForCausalLM", + "params_billions": "6.061" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/01-ai_Yi-6B/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2893 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4309 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.0159 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2693 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3937 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2991 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/01-ai_Yi-9B-200K.json b/data/models/01-ai_Yi-9B-200K.json similarity index 100% rename from data/01-ai_Yi-9B-200K.json rename to data/models/01-ai_Yi-9B-200K.json diff --git a/data/01-ai_Yi-9B.json b/data/models/01-ai_Yi-9B.json similarity index 100% rename from data/01-ai_Yi-9B.json rename to data/models/01-ai_Yi-9B.json diff --git a/data/01-ai_Yi-Coder-9B-Chat.json b/data/models/01-ai_Yi-Coder-9B-Chat.json similarity index 100% rename from data/01-ai_Yi-Coder-9B-Chat.json rename to data/models/01-ai_Yi-Coder-9B-Chat.json diff --git a/data/01-ai_yi-34b.json b/data/models/01-ai_yi-34b.json similarity index 100% rename from data/01-ai_yi-34b.json rename to data/models/01-ai_yi-34b.json diff --git a/data/01-ai_yi-6b.json b/data/models/01-ai_yi-6b.json similarity index 100% rename from data/01-ai_yi-6b.json rename to data/models/01-ai_yi-6b.json diff --git a/data/01-ai_yi-large-preview.json b/data/models/01-ai_yi-large-preview.json similarity index 100% rename from data/01-ai_yi-large-preview.json rename to data/models/01-ai_yi-large-preview.json diff --git a/data/1-800-LLMs_Qwen-2.5-14B-Hindi-Custom-Instruct.json b/data/models/1-800-LLMs_Qwen-2.5-14B-Hindi-Custom-Instruct.json similarity index 100% rename from data/1-800-LLMs_Qwen-2.5-14B-Hindi-Custom-Instruct.json rename to data/models/1-800-LLMs_Qwen-2.5-14B-Hindi-Custom-Instruct.json diff --git a/data/1-800-LLMs_Qwen-2.5-14B-Hindi.json b/data/models/1-800-LLMs_Qwen-2.5-14B-Hindi.json similarity index 100% rename from data/1-800-LLMs_Qwen-2.5-14B-Hindi.json rename to data/models/1-800-LLMs_Qwen-2.5-14B-Hindi.json diff --git a/data/1024m_PHI-4-Hindi.json b/data/models/1024m_PHI-4-Hindi.json similarity index 100% rename from data/1024m_PHI-4-Hindi.json rename to data/models/1024m_PHI-4-Hindi.json diff --git a/data/1024m_QWEN-14B-B100.json b/data/models/1024m_QWEN-14B-B100.json similarity index 100% rename from data/1024m_QWEN-14B-B100.json rename to data/models/1024m_QWEN-14B-B100.json diff --git a/data/152334H_miqu-1-70b-sf.json b/data/models/152334H_miqu-1-70b-sf.json similarity index 100% rename from data/152334H_miqu-1-70b-sf.json rename to data/models/152334H_miqu-1-70b-sf.json diff --git a/data/1TuanPham_T-VisStar-7B-v0.1.json b/data/models/1TuanPham_T-VisStar-7B-v0.1.json similarity index 100% rename from data/1TuanPham_T-VisStar-7B-v0.1.json rename to data/models/1TuanPham_T-VisStar-7B-v0.1.json diff --git a/data/1TuanPham_T-VisStar-v0.1.json b/data/models/1TuanPham_T-VisStar-v0.1.json similarity index 100% rename from data/1TuanPham_T-VisStar-v0.1.json rename to data/models/1TuanPham_T-VisStar-v0.1.json diff --git a/data/3rd-Degree-Burn_L-3.1-Science-Writer-8B.json b/data/models/3rd-Degree-Burn_L-3.1-Science-Writer-8B.json similarity index 100% rename from data/3rd-Degree-Burn_L-3.1-Science-Writer-8B.json rename to data/models/3rd-Degree-Burn_L-3.1-Science-Writer-8B.json diff --git a/data/3rd-Degree-Burn_Llama-3.1-8B-Squareroot-v1.json b/data/models/3rd-Degree-Burn_Llama-3.1-8B-Squareroot-v1.json similarity index 100% rename from data/3rd-Degree-Burn_Llama-3.1-8B-Squareroot-v1.json rename to data/models/3rd-Degree-Burn_Llama-3.1-8B-Squareroot-v1.json diff --git a/data/3rd-Degree-Burn_Llama-3.1-8B-Squareroot.json b/data/models/3rd-Degree-Burn_Llama-3.1-8B-Squareroot.json similarity index 100% rename from data/3rd-Degree-Burn_Llama-3.1-8B-Squareroot.json rename to data/models/3rd-Degree-Burn_Llama-3.1-8B-Squareroot.json diff --git a/data/3rd-Degree-Burn_Llama-Squared-8B.json b/data/models/3rd-Degree-Burn_Llama-Squared-8B.json similarity index 100% rename from data/3rd-Degree-Burn_Llama-Squared-8B.json rename to data/models/3rd-Degree-Burn_Llama-Squared-8B.json diff --git a/data/4season_final_model_test_v2.json b/data/models/4season_final_model_test_v2.json similarity index 100% rename from data/4season_final_model_test_v2.json rename to data/models/4season_final_model_test_v2.json diff --git a/data/AALF_FuseChat-Llama-3.1-8B-Instruct-preview.json b/data/models/AALF_FuseChat-Llama-3.1-8B-Instruct-preview.json similarity index 100% rename from data/AALF_FuseChat-Llama-3.1-8B-Instruct-preview.json rename to data/models/AALF_FuseChat-Llama-3.1-8B-Instruct-preview.json diff --git a/data/AALF_FuseChat-Llama-3.1-8B-SFT-preview.json b/data/models/AALF_FuseChat-Llama-3.1-8B-SFT-preview.json similarity index 100% rename from data/AALF_FuseChat-Llama-3.1-8B-SFT-preview.json rename to data/models/AALF_FuseChat-Llama-3.1-8B-SFT-preview.json diff --git a/data/AALF_gemma-2-27b-it-SimPO-37K-100steps.json b/data/models/AALF_gemma-2-27b-it-SimPO-37K-100steps.json similarity index 100% rename from data/AALF_gemma-2-27b-it-SimPO-37K-100steps.json rename to data/models/AALF_gemma-2-27b-it-SimPO-37K-100steps.json diff --git a/data/AALF_gemma-2-27b-it-SimPO-37K.json b/data/models/AALF_gemma-2-27b-it-SimPO-37K.json similarity index 100% rename from data/AALF_gemma-2-27b-it-SimPO-37K.json rename to data/models/AALF_gemma-2-27b-it-SimPO-37K.json diff --git a/data/AELLM_gemma-2-aeria-infinity-9b.json b/data/models/AELLM_gemma-2-aeria-infinity-9b.json similarity index 100% rename from data/AELLM_gemma-2-aeria-infinity-9b.json rename to data/models/AELLM_gemma-2-aeria-infinity-9b.json diff --git a/data/AELLM_gemma-2-lyco-infinity-9b.json b/data/models/AELLM_gemma-2-lyco-infinity-9b.json similarity index 100% rename from data/AELLM_gemma-2-lyco-infinity-9b.json rename to data/models/AELLM_gemma-2-lyco-infinity-9b.json diff --git a/data/AGI-0_Art-v0-3B.json b/data/models/AGI-0_Art-v0-3B.json similarity index 100% rename from data/AGI-0_Art-v0-3B.json rename to data/models/AGI-0_Art-v0-3B.json diff --git a/data/AGI-0_Artificium-llama3.1-8B-001.json b/data/models/AGI-0_Artificium-llama3.1-8B-001.json similarity index 100% rename from data/AGI-0_Artificium-llama3.1-8B-001.json rename to data/models/AGI-0_Artificium-llama3.1-8B-001.json diff --git a/data/AGI-0_smartllama3.1-8B-001.json b/data/models/AGI-0_smartllama3.1-8B-001.json similarity index 100% rename from data/AGI-0_smartllama3.1-8B-001.json rename to data/models/AGI-0_smartllama3.1-8B-001.json diff --git a/data/AI-MO_NuminaMath-7B-CoT.json b/data/models/AI-MO_NuminaMath-7B-CoT.json similarity index 100% rename from data/AI-MO_NuminaMath-7B-CoT.json rename to data/models/AI-MO_NuminaMath-7B-CoT.json diff --git a/data/AI-MO_NuminaMath-7B-TIR.json b/data/models/AI-MO_NuminaMath-7B-TIR.json similarity index 100% rename from data/AI-MO_NuminaMath-7B-TIR.json rename to data/models/AI-MO_NuminaMath-7B-TIR.json diff --git a/data/AI-Sweden-Models_Llama-3-8B-instruct.json b/data/models/AI-Sweden-Models_Llama-3-8B-instruct.json similarity index 100% rename from data/AI-Sweden-Models_Llama-3-8B-instruct.json rename to data/models/AI-Sweden-Models_Llama-3-8B-instruct.json diff --git a/data/AI-Sweden-Models_gpt-sw3-40b.json b/data/models/AI-Sweden-Models_gpt-sw3-40b.json similarity index 100% rename from data/AI-Sweden-Models_gpt-sw3-40b.json rename to data/models/AI-Sweden-Models_gpt-sw3-40b.json diff --git a/data/AI4free_Dhanishtha.json b/data/models/AI4free_Dhanishtha.json similarity index 100% rename from data/AI4free_Dhanishtha.json rename to data/models/AI4free_Dhanishtha.json diff --git a/data/AI4free_t2.json b/data/models/AI4free_t2.json similarity index 100% rename from data/AI4free_t2.json rename to data/models/AI4free_t2.json diff --git a/data/AIDC-AI_Marco-o1.json b/data/models/AIDC-AI_Marco-o1.json similarity index 100% rename from data/AIDC-AI_Marco-o1.json rename to data/models/AIDC-AI_Marco-o1.json diff --git a/data/Aashraf995_Creative-7B-nerd.json b/data/models/Aashraf995_Creative-7B-nerd.json similarity index 100% rename from data/Aashraf995_Creative-7B-nerd.json rename to data/models/Aashraf995_Creative-7B-nerd.json diff --git a/data/Aashraf995_Gemma-Evo-10B.json b/data/models/Aashraf995_Gemma-Evo-10B.json similarity index 100% rename from data/Aashraf995_Gemma-Evo-10B.json rename to data/models/Aashraf995_Gemma-Evo-10B.json diff --git a/data/Aashraf995_Qwen-Evo-7B.json b/data/models/Aashraf995_Qwen-Evo-7B.json similarity index 100% rename from data/Aashraf995_Qwen-Evo-7B.json rename to data/models/Aashraf995_Qwen-Evo-7B.json diff --git a/data/Aashraf995_QwenStock-14B.json b/data/models/Aashraf995_QwenStock-14B.json similarity index 100% rename from data/Aashraf995_QwenStock-14B.json rename to data/models/Aashraf995_QwenStock-14B.json diff --git a/data/AbacusResearch_Jallabi-34B.json b/data/models/AbacusResearch_Jallabi-34B.json similarity index 100% rename from data/AbacusResearch_Jallabi-34B.json rename to data/models/AbacusResearch_Jallabi-34B.json diff --git a/data/Ahdoot_StructuredThinker-v0.3-MoreStructure.json b/data/models/Ahdoot_StructuredThinker-v0.3-MoreStructure.json similarity index 100% rename from data/Ahdoot_StructuredThinker-v0.3-MoreStructure.json rename to data/models/Ahdoot_StructuredThinker-v0.3-MoreStructure.json diff --git a/data/Ahdoot_Test_StealthThinker.json b/data/models/Ahdoot_Test_StealthThinker.json similarity index 100% rename from data/Ahdoot_Test_StealthThinker.json rename to data/models/Ahdoot_Test_StealthThinker.json diff --git a/data/Ahjeong_MMPO_Gemma_7b.json b/data/models/Ahjeong_MMPO_Gemma_7b.json similarity index 100% rename from data/Ahjeong_MMPO_Gemma_7b.json rename to data/models/Ahjeong_MMPO_Gemma_7b.json diff --git a/data/Ahjeong_MMPO_Gemma_7b_gamma1.1_epoch3.json b/data/models/Ahjeong_MMPO_Gemma_7b_gamma1.1_epoch3.json similarity index 100% rename from data/Ahjeong_MMPO_Gemma_7b_gamma1.1_epoch3.json rename to data/models/Ahjeong_MMPO_Gemma_7b_gamma1.1_epoch3.json diff --git a/data/AicoresSecurity_Cybernet-Sec-3B-R1-V0-Coder.json b/data/models/AicoresSecurity_Cybernet-Sec-3B-R1-V0-Coder.json similarity index 100% rename from data/AicoresSecurity_Cybernet-Sec-3B-R1-V0-Coder.json rename to data/models/AicoresSecurity_Cybernet-Sec-3B-R1-V0-Coder.json diff --git a/data/AicoresSecurity_Cybernet-Sec-3B-R1-V0.json b/data/models/AicoresSecurity_Cybernet-Sec-3B-R1-V0.json similarity index 100% rename from data/AicoresSecurity_Cybernet-Sec-3B-R1-V0.json rename to data/models/AicoresSecurity_Cybernet-Sec-3B-R1-V0.json diff --git a/data/AicoresSecurity_Cybernet-Sec-3B-R1-V1.1.json b/data/models/AicoresSecurity_Cybernet-Sec-3B-R1-V1.1.json similarity index 100% rename from data/AicoresSecurity_Cybernet-Sec-3B-R1-V1.1.json rename to data/models/AicoresSecurity_Cybernet-Sec-3B-R1-V1.1.json diff --git a/data/AicoresSecurity_Cybernet-Sec-3B-R1-V1.json b/data/models/AicoresSecurity_Cybernet-Sec-3B-R1-V1.json similarity index 100% rename from data/AicoresSecurity_Cybernet-Sec-3B-R1-V1.json rename to data/models/AicoresSecurity_Cybernet-Sec-3B-R1-V1.json diff --git a/data/Alepach_notHumpback-M0.json b/data/models/Alepach_notHumpback-M0.json similarity index 100% rename from data/Alepach_notHumpback-M0.json rename to data/models/Alepach_notHumpback-M0.json diff --git a/data/Alepach_notHumpback-M1-v2.json b/data/models/Alepach_notHumpback-M1-v2.json similarity index 100% rename from data/Alepach_notHumpback-M1-v2.json rename to data/models/Alepach_notHumpback-M1-v2.json diff --git a/data/Alepach_notHumpback-M1.json b/data/models/Alepach_notHumpback-M1.json similarity index 100% rename from data/Alepach_notHumpback-M1.json rename to data/models/Alepach_notHumpback-M1.json diff --git a/data/AlephAlpha_luminous-base.json b/data/models/AlephAlpha_luminous-base.json similarity index 100% rename from data/AlephAlpha_luminous-base.json rename to data/models/AlephAlpha_luminous-base.json diff --git a/data/AlephAlpha_luminous-extended.json b/data/models/AlephAlpha_luminous-extended.json similarity index 100% rename from data/AlephAlpha_luminous-extended.json rename to data/models/AlephAlpha_luminous-extended.json diff --git a/data/AlephAlpha_luminous-supreme.json b/data/models/AlephAlpha_luminous-supreme.json similarity index 100% rename from data/AlephAlpha_luminous-supreme.json rename to data/models/AlephAlpha_luminous-supreme.json diff --git a/data/Alibaba-NLP_gte-Qwen2-7B-instruct.json b/data/models/Alibaba-NLP_gte-Qwen2-7B-instruct.json similarity index 100% rename from data/Alibaba-NLP_gte-Qwen2-7B-instruct.json rename to data/models/Alibaba-NLP_gte-Qwen2-7B-instruct.json diff --git a/data/Alsebay_Qwen2.5-7B-test-novelist.json b/data/models/Alsebay_Qwen2.5-7B-test-novelist.json similarity index 100% rename from data/Alsebay_Qwen2.5-7B-test-novelist.json rename to data/models/Alsebay_Qwen2.5-7B-test-novelist.json diff --git a/data/Amaorynho_BBAI2006.json b/data/models/Amaorynho_BBAI2006.json similarity index 100% rename from data/Amaorynho_BBAI2006.json rename to data/models/Amaorynho_BBAI2006.json diff --git a/data/Amaorynho_BBAI270V4.json b/data/models/Amaorynho_BBAI270V4.json similarity index 100% rename from data/Amaorynho_BBAI270V4.json rename to data/models/Amaorynho_BBAI270V4.json diff --git a/data/Amaorynho_BBAIIFEV1.json b/data/models/Amaorynho_BBAIIFEV1.json similarity index 100% rename from data/Amaorynho_BBAIIFEV1.json rename to data/models/Amaorynho_BBAIIFEV1.json diff --git a/data/Amaorynho_BBAI_375.json b/data/models/Amaorynho_BBAI_375.json similarity index 100% rename from data/Amaorynho_BBAI_375.json rename to data/models/Amaorynho_BBAI_375.json diff --git a/data/Amu_t1-1.5B.json b/data/models/Amu_t1-1.5B.json similarity index 100% rename from data/Amu_t1-1.5B.json rename to data/models/Amu_t1-1.5B.json diff --git a/data/Amu_t1-3B.json b/data/models/Amu_t1-3B.json similarity index 100% rename from data/Amu_t1-3B.json rename to data/models/Amu_t1-3B.json diff --git a/data/Anthropic-LM-v4-s3-52B.json b/data/models/Anthropic-LM-v4-s3-52B.json similarity index 100% rename from data/Anthropic-LM-v4-s3-52B.json rename to data/models/Anthropic-LM-v4-s3-52B.json diff --git a/data/models/Anthropic_claude-3-5-sonnet-20240620.json b/data/models/Anthropic_claude-3-5-sonnet-20240620.json new file mode 100644 index 0000000000000000000000000000000000000000..5fc714c7df534a43aff4096fb51608e0bedafaaf --- /dev/null +++ b/data/models/Anthropic_claude-3-5-sonnet-20240620.json @@ -0,0 +1,126 @@ +{ + "model_info": { + "name": "Anthropic/claude-3-5-sonnet-20240620", + "id": "Anthropic/claude-3-5-sonnet-20240620", + "developer": "Anthropic", + "additional_details": { + "model_type": "Generative" + } + }, + "evaluations": [ + { + "evaluation_id": "reward-bench/Anthropic_claude-3-5-sonnet-20240620/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ + { + "evaluation_name": "Score", + "metric_config": { + "evaluation_description": "Overall RewardBench Score", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.8417 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Chat", + "metric_config": { + "evaluation_description": "Chat accuracy - includes easy chat subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.9637 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Chat Hard", + "metric_config": { + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7401 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Safety", + "metric_config": { + "evaluation_description": "Safety accuracy - includes safety subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.8162 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Reasoning", + "metric_config": { + "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.8469 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/models/Anthropic_claude-3-haiku-20240307.json b/data/models/Anthropic_claude-3-haiku-20240307.json new file mode 100644 index 0000000000000000000000000000000000000000..440cba1da4f8485837ea98ed20f0e481e3e0f958 --- /dev/null +++ b/data/models/Anthropic_claude-3-haiku-20240307.json @@ -0,0 +1,144 @@ +{ + "model_info": { + "name": "Anthropic/claude-3-haiku-20240307", + "id": "Anthropic/claude-3-haiku-20240307", + "developer": "Anthropic", + "additional_details": { + "model_type": "Generative" + } + }, + "evaluations": [ + { + "evaluation_id": "reward-bench/Anthropic_claude-3-haiku-20240307/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ + { + "evaluation_name": "Score", + "metric_config": { + "evaluation_description": "Overall RewardBench Score", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7289 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Chat", + "metric_config": { + "evaluation_description": "Chat accuracy - includes easy chat subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.9274 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Chat Hard", + "metric_config": { + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5197 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Safety", + "metric_config": { + "evaluation_description": "Safety accuracy - includes safety subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7953 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Reasoning", + "metric_config": { + "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.706 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Prior Sets (0.5 weight)", + "metric_config": { + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6635 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/models/Anthropic_claude-3-opus-20240229.json b/data/models/Anthropic_claude-3-opus-20240229.json new file mode 100644 index 0000000000000000000000000000000000000000..1c3a1dfa90f3a7ac5e7f38ee8817b8a9f68a1e94 --- /dev/null +++ b/data/models/Anthropic_claude-3-opus-20240229.json @@ -0,0 +1,126 @@ +{ + "model_info": { + "name": "Anthropic/claude-3-opus-20240229", + "id": "Anthropic/claude-3-opus-20240229", + "developer": "Anthropic", + "additional_details": { + "model_type": "Generative" + } + }, + "evaluations": [ + { + "evaluation_id": "reward-bench/Anthropic_claude-3-opus-20240229/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ + { + "evaluation_name": "Score", + "metric_config": { + "evaluation_description": "Overall RewardBench Score", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.8008 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Chat", + "metric_config": { + "evaluation_description": "Chat accuracy - includes easy chat subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.9469 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Chat Hard", + "metric_config": { + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6031 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Safety", + "metric_config": { + "evaluation_description": "Safety accuracy - includes safety subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.8662 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Reasoning", + "metric_config": { + "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7868 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/models/Anthropic_claude-3-sonnet-20240229.json b/data/models/Anthropic_claude-3-sonnet-20240229.json new file mode 100644 index 0000000000000000000000000000000000000000..18f9e366842fd4c36dcc6b5d81a5768547d68a68 --- /dev/null +++ b/data/models/Anthropic_claude-3-sonnet-20240229.json @@ -0,0 +1,144 @@ +{ + "model_info": { + "name": "Anthropic/claude-3-sonnet-20240229", + "id": "Anthropic/claude-3-sonnet-20240229", + "developer": "Anthropic", + "additional_details": { + "model_type": "Generative" + } + }, + "evaluations": [ + { + "evaluation_id": "reward-bench/Anthropic_claude-3-sonnet-20240229/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ + { + "evaluation_name": "Score", + "metric_config": { + "evaluation_description": "Overall RewardBench Score", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7458 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Chat", + "metric_config": { + "evaluation_description": "Chat accuracy - includes easy chat subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.9344 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Chat Hard", + "metric_config": { + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5658 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Safety", + "metric_config": { + "evaluation_description": "Safety accuracy - includes safety subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.8169 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Reasoning", + "metric_config": { + "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6907 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Prior Sets (0.5 weight)", + "metric_config": { + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6963 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/ArliAI_ArliAI-RPMax-12B-v1.1.json b/data/models/ArliAI_ArliAI-RPMax-12B-v1.1.json similarity index 100% rename from data/ArliAI_ArliAI-RPMax-12B-v1.1.json rename to data/models/ArliAI_ArliAI-RPMax-12B-v1.1.json diff --git a/data/ArliAI_Llama-3.1-8B-ArliAI-RPMax-v1.1.json b/data/models/ArliAI_Llama-3.1-8B-ArliAI-RPMax-v1.1.json similarity index 100% rename from data/ArliAI_Llama-3.1-8B-ArliAI-RPMax-v1.1.json rename to data/models/ArliAI_Llama-3.1-8B-ArliAI-RPMax-v1.1.json diff --git a/data/Arthur-LAGACHERIE_Precis-1B-Instruct.json b/data/models/Arthur-LAGACHERIE_Precis-1B-Instruct.json similarity index 100% rename from data/Arthur-LAGACHERIE_Precis-1B-Instruct.json rename to data/models/Arthur-LAGACHERIE_Precis-1B-Instruct.json diff --git a/data/Artples_L-MChat-7b.json b/data/models/Artples_L-MChat-7b.json similarity index 100% rename from data/Artples_L-MChat-7b.json rename to data/models/Artples_L-MChat-7b.json diff --git a/data/Artples_L-MChat-Small.json b/data/models/Artples_L-MChat-Small.json similarity index 100% rename from data/Artples_L-MChat-Small.json rename to data/models/Artples_L-MChat-Small.json diff --git a/data/Aryanne_QwentileSwap.json b/data/models/Aryanne_QwentileSwap.json similarity index 100% rename from data/Aryanne_QwentileSwap.json rename to data/models/Aryanne_QwentileSwap.json diff --git a/data/Aryanne_SHBA.json b/data/models/Aryanne_SHBA.json similarity index 100% rename from data/Aryanne_SHBA.json rename to data/models/Aryanne_SHBA.json diff --git a/data/Aryanne_SuperHeart.json b/data/models/Aryanne_SuperHeart.json similarity index 100% rename from data/Aryanne_SuperHeart.json rename to data/models/Aryanne_SuperHeart.json diff --git a/data/AtAndDev_Qwen2.5-1.5B-continuous-learnt.json b/data/models/AtAndDev_Qwen2.5-1.5B-continuous-learnt.json similarity index 99% rename from data/AtAndDev_Qwen2.5-1.5B-continuous-learnt.json rename to data/models/AtAndDev_Qwen2.5-1.5B-continuous-learnt.json index 8aa5dd3c1d9edc9b6735b92c14cc680d8f5aedfa..dba26133fd6dfa178c728b102613f0a5db478958 100644 --- a/data/AtAndDev_Qwen2.5-1.5B-continuous-learnt.json +++ b/data/models/AtAndDev_Qwen2.5-1.5B-continuous-learnt.json @@ -5,7 +5,7 @@ "developer": "AtAndDev", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "Qwen2ForCausalLM", "params_billions": "1.544" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4511 + "score": 0.4605 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4275 + "score": 0.4258 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1473 + "score": 0.0748 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2701 + "score": 0.2659 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3623 + "score": 0.3636 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2806 + "score": 0.2812 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4605 + "score": 0.4511 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4258 + "score": 0.4275 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0748 + "score": 0.1473 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2659 + "score": 0.2701 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3636 + "score": 0.3623 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2812 + "score": 0.2806 } } ], diff --git a/data/Ateron_Glowing-Forest-12B.json b/data/models/Ateron_Glowing-Forest-12B.json similarity index 100% rename from data/Ateron_Glowing-Forest-12B.json rename to data/models/Ateron_Glowing-Forest-12B.json diff --git a/data/Ateron_Lotus-Magpic.json b/data/models/Ateron_Lotus-Magpic.json similarity index 100% rename from data/Ateron_Lotus-Magpic.json rename to data/models/Ateron_Lotus-Magpic.json diff --git a/data/Ateron_Way_of_MagPicaro.json b/data/models/Ateron_Way_of_MagPicaro.json similarity index 100% rename from data/Ateron_Way_of_MagPicaro.json rename to data/models/Ateron_Way_of_MagPicaro.json diff --git a/data/AtlaAI_Selene-1-Mini-Llama-3.1-8B.json b/data/models/AtlaAI_Selene-1-Mini-Llama-3.1-8B.json similarity index 100% rename from data/AtlaAI_Selene-1-Mini-Llama-3.1-8B.json rename to data/models/AtlaAI_Selene-1-Mini-Llama-3.1-8B.json diff --git a/data/AtlaAI_Selene-1.json b/data/models/AtlaAI_Selene-1.json similarity index 100% rename from data/AtlaAI_Selene-1.json rename to data/models/AtlaAI_Selene-1.json diff --git a/data/AuraIndustries_Aura-4B.json b/data/models/AuraIndustries_Aura-4B.json similarity index 100% rename from data/AuraIndustries_Aura-4B.json rename to data/models/AuraIndustries_Aura-4B.json diff --git a/data/AuraIndustries_Aura-8B.json b/data/models/AuraIndustries_Aura-8B.json similarity index 100% rename from data/AuraIndustries_Aura-8B.json rename to data/models/AuraIndustries_Aura-8B.json diff --git a/data/AuraIndustries_Aura-MoE-2x4B-v2.json b/data/models/AuraIndustries_Aura-MoE-2x4B-v2.json similarity index 100% rename from data/AuraIndustries_Aura-MoE-2x4B-v2.json rename to data/models/AuraIndustries_Aura-MoE-2x4B-v2.json diff --git a/data/AuraIndustries_Aura-MoE-2x4B.json b/data/models/AuraIndustries_Aura-MoE-2x4B.json similarity index 100% rename from data/AuraIndustries_Aura-MoE-2x4B.json rename to data/models/AuraIndustries_Aura-MoE-2x4B.json diff --git a/data/Aurel9_testmerge-7b.json b/data/models/Aurel9_testmerge-7b.json similarity index 100% rename from data/Aurel9_testmerge-7b.json rename to data/models/Aurel9_testmerge-7b.json diff --git a/data/Ayush-Singh_Llama1B-sft-2.json b/data/models/Ayush-Singh_Llama1B-sft-2.json similarity index 100% rename from data/Ayush-Singh_Llama1B-sft-2.json rename to data/models/Ayush-Singh_Llama1B-sft-2.json diff --git a/data/Azure99_Blossom-V6-14B.json b/data/models/Azure99_Blossom-V6-14B.json similarity index 100% rename from data/Azure99_Blossom-V6-14B.json rename to data/models/Azure99_Blossom-V6-14B.json diff --git a/data/Azure99_Blossom-V6-7B.json b/data/models/Azure99_Blossom-V6-7B.json similarity index 100% rename from data/Azure99_Blossom-V6-7B.json rename to data/models/Azure99_Blossom-V6-7B.json diff --git a/data/Azure99_blossom-v5-32b.json b/data/models/Azure99_blossom-v5-32b.json similarity index 100% rename from data/Azure99_blossom-v5-32b.json rename to data/models/Azure99_blossom-v5-32b.json diff --git a/data/Azure99_blossom-v5-llama3-8b.json b/data/models/Azure99_blossom-v5-llama3-8b.json similarity index 100% rename from data/Azure99_blossom-v5-llama3-8b.json rename to data/models/Azure99_blossom-v5-llama3-8b.json diff --git a/data/Azure99_blossom-v5.1-34b.json b/data/models/Azure99_blossom-v5.1-34b.json similarity index 100% rename from data/Azure99_blossom-v5.1-34b.json rename to data/models/Azure99_blossom-v5.1-34b.json diff --git a/data/Azure99_blossom-v5.1-9b.json b/data/models/Azure99_blossom-v5.1-9b.json similarity index 100% rename from data/Azure99_blossom-v5.1-9b.json rename to data/models/Azure99_blossom-v5.1-9b.json diff --git a/data/BAAI_Gemma2-9B-IT-Simpo-Infinity-Preference.json b/data/models/BAAI_Gemma2-9B-IT-Simpo-Infinity-Preference.json similarity index 100% rename from data/BAAI_Gemma2-9B-IT-Simpo-Infinity-Preference.json rename to data/models/BAAI_Gemma2-9B-IT-Simpo-Infinity-Preference.json diff --git a/data/BAAI_Infinity-Instruct-3M-0613-Llama3-70B.json b/data/models/BAAI_Infinity-Instruct-3M-0613-Llama3-70B.json similarity index 100% rename from data/BAAI_Infinity-Instruct-3M-0613-Llama3-70B.json rename to data/models/BAAI_Infinity-Instruct-3M-0613-Llama3-70B.json diff --git a/data/BAAI_Infinity-Instruct-3M-0613-Mistral-7B.json b/data/models/BAAI_Infinity-Instruct-3M-0613-Mistral-7B.json similarity index 100% rename from data/BAAI_Infinity-Instruct-3M-0613-Mistral-7B.json rename to data/models/BAAI_Infinity-Instruct-3M-0613-Mistral-7B.json diff --git a/data/BAAI_Infinity-Instruct-3M-0625-Llama3-70B.json b/data/models/BAAI_Infinity-Instruct-3M-0625-Llama3-70B.json similarity index 100% rename from data/BAAI_Infinity-Instruct-3M-0625-Llama3-70B.json rename to data/models/BAAI_Infinity-Instruct-3M-0625-Llama3-70B.json diff --git a/data/BAAI_Infinity-Instruct-3M-0625-Llama3-8B.json b/data/models/BAAI_Infinity-Instruct-3M-0625-Llama3-8B.json similarity index 100% rename from data/BAAI_Infinity-Instruct-3M-0625-Llama3-8B.json rename to data/models/BAAI_Infinity-Instruct-3M-0625-Llama3-8B.json diff --git a/data/BAAI_Infinity-Instruct-3M-0625-Mistral-7B.json b/data/models/BAAI_Infinity-Instruct-3M-0625-Mistral-7B.json similarity index 100% rename from data/BAAI_Infinity-Instruct-3M-0625-Mistral-7B.json rename to data/models/BAAI_Infinity-Instruct-3M-0625-Mistral-7B.json diff --git a/data/BAAI_Infinity-Instruct-3M-0625-Qwen2-7B.json b/data/models/BAAI_Infinity-Instruct-3M-0625-Qwen2-7B.json similarity index 100% rename from data/BAAI_Infinity-Instruct-3M-0625-Qwen2-7B.json rename to data/models/BAAI_Infinity-Instruct-3M-0625-Qwen2-7B.json diff --git a/data/BAAI_Infinity-Instruct-3M-0625-Yi-1.5-9B.json b/data/models/BAAI_Infinity-Instruct-3M-0625-Yi-1.5-9B.json similarity index 100% rename from data/BAAI_Infinity-Instruct-3M-0625-Yi-1.5-9B.json rename to data/models/BAAI_Infinity-Instruct-3M-0625-Yi-1.5-9B.json diff --git a/data/BAAI_Infinity-Instruct-7M-0729-Llama3_1-8B.json b/data/models/BAAI_Infinity-Instruct-7M-0729-Llama3_1-8B.json similarity index 100% rename from data/BAAI_Infinity-Instruct-7M-0729-Llama3_1-8B.json rename to data/models/BAAI_Infinity-Instruct-7M-0729-Llama3_1-8B.json diff --git a/data/BAAI_Infinity-Instruct-7M-0729-mistral-7B.json b/data/models/BAAI_Infinity-Instruct-7M-0729-mistral-7B.json similarity index 100% rename from data/BAAI_Infinity-Instruct-7M-0729-mistral-7B.json rename to data/models/BAAI_Infinity-Instruct-7M-0729-mistral-7B.json diff --git a/data/BAAI_Infinity-Instruct-7M-Gen-Llama3_1-70B.json b/data/models/BAAI_Infinity-Instruct-7M-Gen-Llama3_1-70B.json similarity index 100% rename from data/BAAI_Infinity-Instruct-7M-Gen-Llama3_1-70B.json rename to data/models/BAAI_Infinity-Instruct-7M-Gen-Llama3_1-70B.json diff --git a/data/BAAI_Infinity-Instruct-7M-Gen-Llama3_1-8B.json b/data/models/BAAI_Infinity-Instruct-7M-Gen-Llama3_1-8B.json similarity index 100% rename from data/BAAI_Infinity-Instruct-7M-Gen-Llama3_1-8B.json rename to data/models/BAAI_Infinity-Instruct-7M-Gen-Llama3_1-8B.json diff --git a/data/BAAI_Infinity-Instruct-7M-Gen-mistral-7B.json b/data/models/BAAI_Infinity-Instruct-7M-Gen-mistral-7B.json similarity index 100% rename from data/BAAI_Infinity-Instruct-7M-Gen-mistral-7B.json rename to data/models/BAAI_Infinity-Instruct-7M-Gen-mistral-7B.json diff --git a/data/BAAI_OPI-Llama-3.1-8B-Instruct.json b/data/models/BAAI_OPI-Llama-3.1-8B-Instruct.json similarity index 100% rename from data/BAAI_OPI-Llama-3.1-8B-Instruct.json rename to data/models/BAAI_OPI-Llama-3.1-8B-Instruct.json diff --git a/data/BEE-spoke-data_Meta-Llama-3-8Bee.json b/data/models/BEE-spoke-data_Meta-Llama-3-8Bee.json similarity index 100% rename from data/BEE-spoke-data_Meta-Llama-3-8Bee.json rename to data/models/BEE-spoke-data_Meta-Llama-3-8Bee.json diff --git a/data/BEE-spoke-data_smol_llama-101M-GQA.json b/data/models/BEE-spoke-data_smol_llama-101M-GQA.json similarity index 100% rename from data/BEE-spoke-data_smol_llama-101M-GQA.json rename to data/models/BEE-spoke-data_smol_llama-101M-GQA.json diff --git a/data/BEE-spoke-data_smol_llama-220M-GQA-fineweb_edu.json b/data/models/BEE-spoke-data_smol_llama-220M-GQA-fineweb_edu.json similarity index 100% rename from data/BEE-spoke-data_smol_llama-220M-GQA-fineweb_edu.json rename to data/models/BEE-spoke-data_smol_llama-220M-GQA-fineweb_edu.json diff --git a/data/BEE-spoke-data_smol_llama-220M-GQA.json b/data/models/BEE-spoke-data_smol_llama-220M-GQA.json similarity index 100% rename from data/BEE-spoke-data_smol_llama-220M-GQA.json rename to data/models/BEE-spoke-data_smol_llama-220M-GQA.json diff --git a/data/BEE-spoke-data_smol_llama-220M-openhermes.json b/data/models/BEE-spoke-data_smol_llama-220M-openhermes.json similarity index 100% rename from data/BEE-spoke-data_smol_llama-220M-openhermes.json rename to data/models/BEE-spoke-data_smol_llama-220M-openhermes.json diff --git a/data/BEE-spoke-data_tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024.json b/data/models/BEE-spoke-data_tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024.json similarity index 100% rename from data/BEE-spoke-data_tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024.json rename to data/models/BEE-spoke-data_tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024.json diff --git a/data/BEE-spoke-data_tFINE-900m-e16-d32-flan.json b/data/models/BEE-spoke-data_tFINE-900m-e16-d32-flan.json similarity index 100% rename from data/BEE-spoke-data_tFINE-900m-e16-d32-flan.json rename to data/models/BEE-spoke-data_tFINE-900m-e16-d32-flan.json diff --git a/data/BEE-spoke-data_tFINE-900m-e16-d32-instruct_2e.json b/data/models/BEE-spoke-data_tFINE-900m-e16-d32-instruct_2e.json similarity index 100% rename from data/BEE-spoke-data_tFINE-900m-e16-d32-instruct_2e.json rename to data/models/BEE-spoke-data_tFINE-900m-e16-d32-instruct_2e.json diff --git a/data/BEE-spoke-data_tFINE-900m-instruct-orpo.json b/data/models/BEE-spoke-data_tFINE-900m-instruct-orpo.json similarity index 100% rename from data/BEE-spoke-data_tFINE-900m-instruct-orpo.json rename to data/models/BEE-spoke-data_tFINE-900m-instruct-orpo.json diff --git a/data/BSC-LT_salamandra-7b-instruct.json b/data/models/BSC-LT_salamandra-7b-instruct.json similarity index 100% rename from data/BSC-LT_salamandra-7b-instruct.json rename to data/models/BSC-LT_salamandra-7b-instruct.json diff --git a/data/BSC-LT_salamandra-7b.json b/data/models/BSC-LT_salamandra-7b.json similarity index 100% rename from data/BSC-LT_salamandra-7b.json rename to data/models/BSC-LT_salamandra-7b.json diff --git a/data/Ba2han_Llama-Phi-3_DoRA.json b/data/models/Ba2han_Llama-Phi-3_DoRA.json similarity index 100% rename from data/Ba2han_Llama-Phi-3_DoRA.json rename to data/models/Ba2han_Llama-Phi-3_DoRA.json diff --git a/data/Baptiste-HUVELLE-10_LeTriomphant2.2_ECE_iLAB.json b/data/models/Baptiste-HUVELLE-10_LeTriomphant2.2_ECE_iLAB.json similarity index 100% rename from data/Baptiste-HUVELLE-10_LeTriomphant2.2_ECE_iLAB.json rename to data/models/Baptiste-HUVELLE-10_LeTriomphant2.2_ECE_iLAB.json diff --git a/data/BenevolenceMessiah_Qwen2.5-72B-2x-Instruct-TIES-v1.0.json b/data/models/BenevolenceMessiah_Qwen2.5-72B-2x-Instruct-TIES-v1.0.json similarity index 100% rename from data/BenevolenceMessiah_Qwen2.5-72B-2x-Instruct-TIES-v1.0.json rename to data/models/BenevolenceMessiah_Qwen2.5-72B-2x-Instruct-TIES-v1.0.json diff --git a/data/BenevolenceMessiah_Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0.json b/data/models/BenevolenceMessiah_Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0.json similarity index 100% rename from data/BenevolenceMessiah_Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0.json rename to data/models/BenevolenceMessiah_Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0.json diff --git a/data/BlackBeenie_Bloslain-8B-v0.2.json b/data/models/BlackBeenie_Bloslain-8B-v0.2.json similarity index 100% rename from data/BlackBeenie_Bloslain-8B-v0.2.json rename to data/models/BlackBeenie_Bloslain-8B-v0.2.json diff --git a/data/BlackBeenie_Llama-3.1-8B-OpenO1-SFT-v0.1.json b/data/models/BlackBeenie_Llama-3.1-8B-OpenO1-SFT-v0.1.json similarity index 100% rename from data/BlackBeenie_Llama-3.1-8B-OpenO1-SFT-v0.1.json rename to data/models/BlackBeenie_Llama-3.1-8B-OpenO1-SFT-v0.1.json diff --git a/data/BlackBeenie_Llama-3.1-8B-pythonic-passthrough-merge.json b/data/models/BlackBeenie_Llama-3.1-8B-pythonic-passthrough-merge.json similarity index 100% rename from data/BlackBeenie_Llama-3.1-8B-pythonic-passthrough-merge.json rename to data/models/BlackBeenie_Llama-3.1-8B-pythonic-passthrough-merge.json diff --git a/data/BlackBeenie_Neos-Gemma-2-9b.json b/data/models/BlackBeenie_Neos-Gemma-2-9b.json similarity index 100% rename from data/BlackBeenie_Neos-Gemma-2-9b.json rename to data/models/BlackBeenie_Neos-Gemma-2-9b.json diff --git a/data/BlackBeenie_Neos-Llama-3.1-8B.json b/data/models/BlackBeenie_Neos-Llama-3.1-8B.json similarity index 100% rename from data/BlackBeenie_Neos-Llama-3.1-8B.json rename to data/models/BlackBeenie_Neos-Llama-3.1-8B.json diff --git a/data/BlackBeenie_Neos-Llama-3.1-base.json b/data/models/BlackBeenie_Neos-Llama-3.1-base.json similarity index 100% rename from data/BlackBeenie_Neos-Llama-3.1-base.json rename to data/models/BlackBeenie_Neos-Llama-3.1-base.json diff --git a/data/BlackBeenie_Neos-Phi-3-14B-v0.1.json b/data/models/BlackBeenie_Neos-Phi-3-14B-v0.1.json similarity index 100% rename from data/BlackBeenie_Neos-Phi-3-14B-v0.1.json rename to data/models/BlackBeenie_Neos-Phi-3-14B-v0.1.json diff --git a/data/BlackBeenie_llama-3-luminous-merged.json b/data/models/BlackBeenie_llama-3-luminous-merged.json similarity index 100% rename from data/BlackBeenie_llama-3-luminous-merged.json rename to data/models/BlackBeenie_llama-3-luminous-merged.json diff --git a/data/BlackBeenie_llama-3.1-8B-Galore-openassistant-guanaco.json b/data/models/BlackBeenie_llama-3.1-8B-Galore-openassistant-guanaco.json similarity index 100% rename from data/BlackBeenie_llama-3.1-8B-Galore-openassistant-guanaco.json rename to data/models/BlackBeenie_llama-3.1-8B-Galore-openassistant-guanaco.json diff --git a/data/Bllossom_llama-3.2-Korean-Bllossom-AICA-5B.json b/data/models/Bllossom_llama-3.2-Korean-Bllossom-AICA-5B.json similarity index 100% rename from data/Bllossom_llama-3.2-Korean-Bllossom-AICA-5B.json rename to data/models/Bllossom_llama-3.2-Korean-Bllossom-AICA-5B.json diff --git a/data/BoltMonkey_DreadMix.json b/data/models/BoltMonkey_DreadMix.json similarity index 100% rename from data/BoltMonkey_DreadMix.json rename to data/models/BoltMonkey_DreadMix.json diff --git a/data/BoltMonkey_NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated.json b/data/models/BoltMonkey_NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated.json similarity index 99% rename from data/BoltMonkey_NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated.json rename to data/models/BoltMonkey_NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated.json index 7e063d230a7a83c0f882526c40d3a35850b37135..87c7d85f598dc091e43d3a783948555286f12f59 100644 --- a/data/BoltMonkey_NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated.json +++ b/data/models/BoltMonkey_NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated.json @@ -5,7 +5,7 @@ "developer": "BoltMonkey", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": "8.03" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7999 + "score": 0.459 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5152 + "score": 0.5185 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1193 + "score": 0.0937 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.281 + "score": 0.2743 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4019 + "score": 0.4083 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3733 + "score": 0.3631 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.459 + "score": 0.7999 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5185 + "score": 0.5152 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0937 + "score": 0.1193 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2743 + "score": 0.281 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4083 + "score": 0.4019 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3631 + "score": 0.3733 } } ], diff --git a/data/BoltMonkey_SuperNeuralDreadDevil-8b.json b/data/models/BoltMonkey_SuperNeuralDreadDevil-8b.json similarity index 100% rename from data/BoltMonkey_SuperNeuralDreadDevil-8b.json rename to data/models/BoltMonkey_SuperNeuralDreadDevil-8b.json diff --git a/data/BrainWave-ML_llama3.2-3B-maths-orpo.json b/data/models/BrainWave-ML_llama3.2-3B-maths-orpo.json similarity index 100% rename from data/BrainWave-ML_llama3.2-3B-maths-orpo.json rename to data/models/BrainWave-ML_llama3.2-3B-maths-orpo.json diff --git a/data/BramVanroy_GEITje-7B-ultra.json b/data/models/BramVanroy_GEITje-7B-ultra.json similarity index 100% rename from data/BramVanroy_GEITje-7B-ultra.json rename to data/models/BramVanroy_GEITje-7B-ultra.json diff --git a/data/BramVanroy_fietje-2-chat.json b/data/models/BramVanroy_fietje-2-chat.json similarity index 100% rename from data/BramVanroy_fietje-2-chat.json rename to data/models/BramVanroy_fietje-2-chat.json diff --git a/data/BramVanroy_fietje-2-instruct.json b/data/models/BramVanroy_fietje-2-instruct.json similarity index 100% rename from data/BramVanroy_fietje-2-instruct.json rename to data/models/BramVanroy_fietje-2-instruct.json diff --git a/data/BramVanroy_fietje-2.json b/data/models/BramVanroy_fietje-2.json similarity index 100% rename from data/BramVanroy_fietje-2.json rename to data/models/BramVanroy_fietje-2.json diff --git a/data/CIR-AMS_BTRM_Qwen2_7b_0613.json b/data/models/CIR-AMS_BTRM_Qwen2_7b_0613.json similarity index 100% rename from data/CIR-AMS_BTRM_Qwen2_7b_0613.json rename to data/models/CIR-AMS_BTRM_Qwen2_7b_0613.json index 84a36ab2262aa5d4869e5d141d383b85699971f1..9fb827ce32fb32044e2247d7f86c70d1bc13d414 100644 --- a/data/CIR-AMS_BTRM_Qwen2_7b_0613.json +++ b/data/models/CIR-AMS_BTRM_Qwen2_7b_0613.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/CIR-AMS_BTRM_Qwen2_7b_0613/1766412838.146816", + "evaluation_id": "reward-bench-2/CIR-AMS_BTRM_Qwen2_7b_0613/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8172 + "score": 0.5736 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9749 + "score": 0.5347 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5724 + "score": 0.3563 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6066 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9014 + "score": 0.7178 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8775 + "score": 0.5737 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7029 + "score": 0.6527 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/CIR-AMS_BTRM_Qwen2_7b_0613/1766412838.146816", + "evaluation_id": "reward-bench/CIR-AMS_BTRM_Qwen2_7b_0613/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5736 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5347 + "score": 0.8172 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3563 + "score": 0.9749 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6066 + "score": 0.5724 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7178 + "score": 0.9014 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5737 + "score": 0.8775 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6527 + "score": 0.7029 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/CYFRAGOVPL_Llama-PLLuM-8B-base.json b/data/models/CYFRAGOVPL_Llama-PLLuM-8B-base.json similarity index 100% rename from data/CYFRAGOVPL_Llama-PLLuM-8B-base.json rename to data/models/CYFRAGOVPL_Llama-PLLuM-8B-base.json diff --git a/data/CYFRAGOVPL_Llama-PLLuM-8B-chat.json b/data/models/CYFRAGOVPL_Llama-PLLuM-8B-chat.json similarity index 100% rename from data/CYFRAGOVPL_Llama-PLLuM-8B-chat.json rename to data/models/CYFRAGOVPL_Llama-PLLuM-8B-chat.json diff --git a/data/CYFRAGOVPL_PLLuM-12B-base.json b/data/models/CYFRAGOVPL_PLLuM-12B-base.json similarity index 100% rename from data/CYFRAGOVPL_PLLuM-12B-base.json rename to data/models/CYFRAGOVPL_PLLuM-12B-base.json diff --git a/data/CYFRAGOVPL_PLLuM-12B-chat.json b/data/models/CYFRAGOVPL_PLLuM-12B-chat.json similarity index 100% rename from data/CYFRAGOVPL_PLLuM-12B-chat.json rename to data/models/CYFRAGOVPL_PLLuM-12B-chat.json diff --git a/data/CYFRAGOVPL_PLLuM-12B-nc-base.json b/data/models/CYFRAGOVPL_PLLuM-12B-nc-base.json similarity index 100% rename from data/CYFRAGOVPL_PLLuM-12B-nc-base.json rename to data/models/CYFRAGOVPL_PLLuM-12B-nc-base.json diff --git a/data/CYFRAGOVPL_PLLuM-12B-nc-chat.json b/data/models/CYFRAGOVPL_PLLuM-12B-nc-chat.json similarity index 100% rename from data/CYFRAGOVPL_PLLuM-12B-nc-chat.json rename to data/models/CYFRAGOVPL_PLLuM-12B-nc-chat.json diff --git a/data/CarrotAI_Llama-3.2-Rabbit-Ko-3B-Instruct-2412.json b/data/models/CarrotAI_Llama-3.2-Rabbit-Ko-3B-Instruct-2412.json similarity index 100% rename from data/CarrotAI_Llama-3.2-Rabbit-Ko-3B-Instruct-2412.json rename to data/models/CarrotAI_Llama-3.2-Rabbit-Ko-3B-Instruct-2412.json diff --git a/data/CarrotAI_Llama-3.2-Rabbit-Ko-3B-Instruct.json b/data/models/CarrotAI_Llama-3.2-Rabbit-Ko-3B-Instruct.json similarity index 100% rename from data/CarrotAI_Llama-3.2-Rabbit-Ko-3B-Instruct.json rename to data/models/CarrotAI_Llama-3.2-Rabbit-Ko-3B-Instruct.json diff --git a/data/Casual-Autopsy_L3-Umbral-Mind-RP-v2.0-8B.json b/data/models/Casual-Autopsy_L3-Umbral-Mind-RP-v2.0-8B.json similarity index 100% rename from data/Casual-Autopsy_L3-Umbral-Mind-RP-v2.0-8B.json rename to data/models/Casual-Autopsy_L3-Umbral-Mind-RP-v2.0-8B.json diff --git a/data/CausalLM_14B.json b/data/models/CausalLM_14B.json similarity index 100% rename from data/CausalLM_14B.json rename to data/models/CausalLM_14B.json diff --git a/data/CausalLM_34b-beta.json b/data/models/CausalLM_34b-beta.json similarity index 100% rename from data/CausalLM_34b-beta.json rename to data/models/CausalLM_34b-beta.json diff --git a/data/CausalLM_preview-1-hf.json b/data/models/CausalLM_preview-1-hf.json similarity index 100% rename from data/CausalLM_preview-1-hf.json rename to data/models/CausalLM_preview-1-hf.json diff --git a/data/Changgil_K2S3-14b-v0.2.json b/data/models/Changgil_K2S3-14b-v0.2.json similarity index 100% rename from data/Changgil_K2S3-14b-v0.2.json rename to data/models/Changgil_K2S3-14b-v0.2.json diff --git a/data/Changgil_K2S3-v0.1.json b/data/models/Changgil_K2S3-v0.1.json similarity index 100% rename from data/Changgil_K2S3-v0.1.json rename to data/models/Changgil_K2S3-v0.1.json diff --git a/data/ClaudioItaly_Albacus.json b/data/models/ClaudioItaly_Albacus.json similarity index 100% rename from data/ClaudioItaly_Albacus.json rename to data/models/ClaudioItaly_Albacus.json diff --git a/data/ClaudioItaly_Book-Gut12B.json b/data/models/ClaudioItaly_Book-Gut12B.json similarity index 100% rename from data/ClaudioItaly_Book-Gut12B.json rename to data/models/ClaudioItaly_Book-Gut12B.json diff --git a/data/ClaudioItaly_Evolutionstory-7B-v2.2.json b/data/models/ClaudioItaly_Evolutionstory-7B-v2.2.json similarity index 100% rename from data/ClaudioItaly_Evolutionstory-7B-v2.2.json rename to data/models/ClaudioItaly_Evolutionstory-7B-v2.2.json diff --git a/data/ClaudioItaly_intelligence-cod-rag-7b-v3.json b/data/models/ClaudioItaly_intelligence-cod-rag-7b-v3.json similarity index 100% rename from data/ClaudioItaly_intelligence-cod-rag-7b-v3.json rename to data/models/ClaudioItaly_intelligence-cod-rag-7b-v3.json diff --git a/data/CohereForAI_aya-23-35B.json b/data/models/CohereForAI_aya-23-35B.json similarity index 100% rename from data/CohereForAI_aya-23-35B.json rename to data/models/CohereForAI_aya-23-35B.json diff --git a/data/CohereForAI_aya-23-8B.json b/data/models/CohereForAI_aya-23-8B.json similarity index 100% rename from data/CohereForAI_aya-23-8B.json rename to data/models/CohereForAI_aya-23-8B.json diff --git a/data/CohereForAI_aya-expanse-32b.json b/data/models/CohereForAI_aya-expanse-32b.json similarity index 100% rename from data/CohereForAI_aya-expanse-32b.json rename to data/models/CohereForAI_aya-expanse-32b.json diff --git a/data/CohereForAI_aya-expanse-8b.json b/data/models/CohereForAI_aya-expanse-8b.json similarity index 100% rename from data/CohereForAI_aya-expanse-8b.json rename to data/models/CohereForAI_aya-expanse-8b.json diff --git a/data/CohereForAI_c4ai-command-r-plus-08-2024.json b/data/models/CohereForAI_c4ai-command-r-plus-08-2024.json similarity index 100% rename from data/CohereForAI_c4ai-command-r-plus-08-2024.json rename to data/models/CohereForAI_c4ai-command-r-plus-08-2024.json diff --git a/data/CohereForAI_c4ai-command-r-plus.json b/data/models/CohereForAI_c4ai-command-r-plus.json similarity index 100% rename from data/CohereForAI_c4ai-command-r-plus.json rename to data/models/CohereForAI_c4ai-command-r-plus.json diff --git a/data/CohereForAI_c4ai-command-r-v01.json b/data/models/CohereForAI_c4ai-command-r-v01.json similarity index 100% rename from data/CohereForAI_c4ai-command-r-v01.json rename to data/models/CohereForAI_c4ai-command-r-v01.json diff --git a/data/CohereForAI_c4ai-command-r7b-12-2024.json b/data/models/CohereForAI_c4ai-command-r7b-12-2024.json similarity index 100% rename from data/CohereForAI_c4ai-command-r7b-12-2024.json rename to data/models/CohereForAI_c4ai-command-r7b-12-2024.json diff --git a/data/Cohere_March_2024.json b/data/models/Cohere_March_2024.json similarity index 100% rename from data/Cohere_March_2024.json rename to data/models/Cohere_March_2024.json diff --git a/data/Cohere_May_2024.json b/data/models/Cohere_May_2024.json similarity index 100% rename from data/Cohere_May_2024.json rename to data/models/Cohere_May_2024.json diff --git a/data/Columbia-NLP_LION-Gemma-2b-dpo-v1.0.json b/data/models/Columbia-NLP_LION-Gemma-2b-dpo-v1.0.json similarity index 99% rename from data/Columbia-NLP_LION-Gemma-2b-dpo-v1.0.json rename to data/models/Columbia-NLP_LION-Gemma-2b-dpo-v1.0.json index 569c9dad2b3811bf35740b3b95f9018fd638936a..b26ff0b073743787f9ddf9ff7dc370db6aa560e8 100644 --- a/data/Columbia-NLP_LION-Gemma-2b-dpo-v1.0.json +++ b/data/models/Columbia-NLP_LION-Gemma-2b-dpo-v1.0.json @@ -5,7 +5,7 @@ "developer": "Columbia-NLP", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "GemmaForCausalLM", "params_billions": "2.506" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3102 + "score": 0.3278 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3881 + "score": 0.392 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0536 + "score": 0.0431 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2534 + "score": 0.2492 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4081 + "score": 0.412 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1665 + "score": 0.1666 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3278 + "score": 0.3102 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.392 + "score": 0.3881 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0431 + "score": 0.0536 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2492 + "score": 0.2534 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.412 + "score": 0.4081 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1666 + "score": 0.1665 } } ], diff --git a/data/Columbia-NLP_LION-Gemma-2b-odpo-v1.0.json b/data/models/Columbia-NLP_LION-Gemma-2b-odpo-v1.0.json similarity index 100% rename from data/Columbia-NLP_LION-Gemma-2b-odpo-v1.0.json rename to data/models/Columbia-NLP_LION-Gemma-2b-odpo-v1.0.json diff --git a/data/Columbia-NLP_LION-Gemma-2b-sft-v1.0.json b/data/models/Columbia-NLP_LION-Gemma-2b-sft-v1.0.json similarity index 100% rename from data/Columbia-NLP_LION-Gemma-2b-sft-v1.0.json rename to data/models/Columbia-NLP_LION-Gemma-2b-sft-v1.0.json diff --git a/data/Columbia-NLP_LION-LLaMA-3-8b-dpo-v1.0.json b/data/models/Columbia-NLP_LION-LLaMA-3-8b-dpo-v1.0.json similarity index 100% rename from data/Columbia-NLP_LION-LLaMA-3-8b-dpo-v1.0.json rename to data/models/Columbia-NLP_LION-LLaMA-3-8b-dpo-v1.0.json diff --git a/data/Columbia-NLP_LION-LLaMA-3-8b-odpo-v1.0.json b/data/models/Columbia-NLP_LION-LLaMA-3-8b-odpo-v1.0.json similarity index 100% rename from data/Columbia-NLP_LION-LLaMA-3-8b-odpo-v1.0.json rename to data/models/Columbia-NLP_LION-LLaMA-3-8b-odpo-v1.0.json diff --git a/data/Columbia-NLP_LION-LLaMA-3-8b-sft-v1.0.json b/data/models/Columbia-NLP_LION-LLaMA-3-8b-sft-v1.0.json similarity index 100% rename from data/Columbia-NLP_LION-LLaMA-3-8b-sft-v1.0.json rename to data/models/Columbia-NLP_LION-LLaMA-3-8b-sft-v1.0.json diff --git a/data/CombinHorizon_Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES.json b/data/models/CombinHorizon_Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES.json similarity index 100% rename from data/CombinHorizon_Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES.json rename to data/models/CombinHorizon_Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES.json diff --git a/data/CombinHorizon_Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES.json b/data/models/CombinHorizon_Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES.json similarity index 100% rename from data/CombinHorizon_Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES.json rename to data/models/CombinHorizon_Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES.json diff --git a/data/CombinHorizon_YiSM-blossom5.1-34B-SLERP.json b/data/models/CombinHorizon_YiSM-blossom5.1-34B-SLERP.json similarity index 100% rename from data/CombinHorizon_YiSM-blossom5.1-34B-SLERP.json rename to data/models/CombinHorizon_YiSM-blossom5.1-34B-SLERP.json diff --git a/data/CombinHorizon_huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES.json b/data/models/CombinHorizon_huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES.json similarity index 100% rename from data/CombinHorizon_huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES.json rename to data/models/CombinHorizon_huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES.json diff --git a/data/CombinHorizon_huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES.json b/data/models/CombinHorizon_huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES.json similarity index 100% rename from data/CombinHorizon_huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES.json rename to data/models/CombinHorizon_huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES.json diff --git a/data/CombinHorizon_zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES.json b/data/models/CombinHorizon_zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES.json similarity index 100% rename from data/CombinHorizon_zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES.json rename to data/models/CombinHorizon_zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES.json diff --git a/data/ContactDoctor_Bio-Medical-3B-CoT-012025.json b/data/models/ContactDoctor_Bio-Medical-3B-CoT-012025.json similarity index 100% rename from data/ContactDoctor_Bio-Medical-3B-CoT-012025.json rename to data/models/ContactDoctor_Bio-Medical-3B-CoT-012025.json diff --git a/data/ContactDoctor_Bio-Medical-Llama-3-8B.json b/data/models/ContactDoctor_Bio-Medical-Llama-3-8B.json similarity index 100% rename from data/ContactDoctor_Bio-Medical-Llama-3-8B.json rename to data/models/ContactDoctor_Bio-Medical-Llama-3-8B.json diff --git a/data/ContextualAI_LMUnit-llama3.1-70b.json b/data/models/ContextualAI_LMUnit-llama3.1-70b.json similarity index 100% rename from data/ContextualAI_LMUnit-llama3.1-70b.json rename to data/models/ContextualAI_LMUnit-llama3.1-70b.json diff --git a/data/ContextualAI_LMUnit-qwen2.5-72b.json b/data/models/ContextualAI_LMUnit-qwen2.5-72b.json similarity index 100% rename from data/ContextualAI_LMUnit-qwen2.5-72b.json rename to data/models/ContextualAI_LMUnit-qwen2.5-72b.json diff --git a/data/ContextualAI_archangel_sft-dpo_llama13b.json b/data/models/ContextualAI_archangel_sft-dpo_llama13b.json similarity index 100% rename from data/ContextualAI_archangel_sft-dpo_llama13b.json rename to data/models/ContextualAI_archangel_sft-dpo_llama13b.json diff --git a/data/ContextualAI_archangel_sft-dpo_llama30b.json b/data/models/ContextualAI_archangel_sft-dpo_llama30b.json similarity index 100% rename from data/ContextualAI_archangel_sft-dpo_llama30b.json rename to data/models/ContextualAI_archangel_sft-dpo_llama30b.json diff --git a/data/ContextualAI_archangel_sft-dpo_llama7b.json b/data/models/ContextualAI_archangel_sft-dpo_llama7b.json similarity index 100% rename from data/ContextualAI_archangel_sft-dpo_llama7b.json rename to data/models/ContextualAI_archangel_sft-dpo_llama7b.json diff --git a/data/ContextualAI_archangel_sft-dpo_pythia1-4b.json b/data/models/ContextualAI_archangel_sft-dpo_pythia1-4b.json similarity index 100% rename from data/ContextualAI_archangel_sft-dpo_pythia1-4b.json rename to data/models/ContextualAI_archangel_sft-dpo_pythia1-4b.json diff --git a/data/ContextualAI_archangel_sft-dpo_pythia12-0b.json b/data/models/ContextualAI_archangel_sft-dpo_pythia12-0b.json similarity index 100% rename from data/ContextualAI_archangel_sft-dpo_pythia12-0b.json rename to data/models/ContextualAI_archangel_sft-dpo_pythia12-0b.json diff --git a/data/ContextualAI_archangel_sft-dpo_pythia2-8b.json b/data/models/ContextualAI_archangel_sft-dpo_pythia2-8b.json similarity index 100% rename from data/ContextualAI_archangel_sft-dpo_pythia2-8b.json rename to data/models/ContextualAI_archangel_sft-dpo_pythia2-8b.json diff --git a/data/ContextualAI_archangel_sft-dpo_pythia6-9b.json b/data/models/ContextualAI_archangel_sft-dpo_pythia6-9b.json similarity index 100% rename from data/ContextualAI_archangel_sft-dpo_pythia6-9b.json rename to data/models/ContextualAI_archangel_sft-dpo_pythia6-9b.json diff --git a/data/ContextualAI_archangel_sft-kto_llama13b.json b/data/models/ContextualAI_archangel_sft-kto_llama13b.json similarity index 100% rename from data/ContextualAI_archangel_sft-kto_llama13b.json rename to data/models/ContextualAI_archangel_sft-kto_llama13b.json diff --git a/data/ContextualAI_archangel_sft-kto_llama30b.json b/data/models/ContextualAI_archangel_sft-kto_llama30b.json similarity index 100% rename from data/ContextualAI_archangel_sft-kto_llama30b.json rename to data/models/ContextualAI_archangel_sft-kto_llama30b.json diff --git a/data/ContextualAI_archangel_sft-kto_llama7b.json b/data/models/ContextualAI_archangel_sft-kto_llama7b.json similarity index 100% rename from data/ContextualAI_archangel_sft-kto_llama7b.json rename to data/models/ContextualAI_archangel_sft-kto_llama7b.json diff --git a/data/ContextualAI_archangel_sft-kto_pythia1-4b.json b/data/models/ContextualAI_archangel_sft-kto_pythia1-4b.json similarity index 100% rename from data/ContextualAI_archangel_sft-kto_pythia1-4b.json rename to data/models/ContextualAI_archangel_sft-kto_pythia1-4b.json diff --git a/data/ContextualAI_archangel_sft-kto_pythia12-0b.json b/data/models/ContextualAI_archangel_sft-kto_pythia12-0b.json similarity index 100% rename from data/ContextualAI_archangel_sft-kto_pythia12-0b.json rename to data/models/ContextualAI_archangel_sft-kto_pythia12-0b.json diff --git a/data/ContextualAI_archangel_sft-kto_pythia2-8b.json b/data/models/ContextualAI_archangel_sft-kto_pythia2-8b.json similarity index 100% rename from data/ContextualAI_archangel_sft-kto_pythia2-8b.json rename to data/models/ContextualAI_archangel_sft-kto_pythia2-8b.json diff --git a/data/ContextualAI_archangel_sft-kto_pythia6-9b.json b/data/models/ContextualAI_archangel_sft-kto_pythia6-9b.json similarity index 100% rename from data/ContextualAI_archangel_sft-kto_pythia6-9b.json rename to data/models/ContextualAI_archangel_sft-kto_pythia6-9b.json diff --git a/data/CoolSpring_Qwen2-0.5B-Abyme-merge2.json b/data/models/CoolSpring_Qwen2-0.5B-Abyme-merge2.json similarity index 100% rename from data/CoolSpring_Qwen2-0.5B-Abyme-merge2.json rename to data/models/CoolSpring_Qwen2-0.5B-Abyme-merge2.json diff --git a/data/CoolSpring_Qwen2-0.5B-Abyme-merge3.json b/data/models/CoolSpring_Qwen2-0.5B-Abyme-merge3.json similarity index 100% rename from data/CoolSpring_Qwen2-0.5B-Abyme-merge3.json rename to data/models/CoolSpring_Qwen2-0.5B-Abyme-merge3.json diff --git a/data/CoolSpring_Qwen2-0.5B-Abyme.json b/data/models/CoolSpring_Qwen2-0.5B-Abyme.json similarity index 100% rename from data/CoolSpring_Qwen2-0.5B-Abyme.json rename to data/models/CoolSpring_Qwen2-0.5B-Abyme.json diff --git a/data/Corianas_Neural-Mistral-7B.json b/data/models/Corianas_Neural-Mistral-7B.json similarity index 100% rename from data/Corianas_Neural-Mistral-7B.json rename to data/models/Corianas_Neural-Mistral-7B.json diff --git a/data/Corianas_Quokka_2.7b.json b/data/models/Corianas_Quokka_2.7b.json similarity index 100% rename from data/Corianas_Quokka_2.7b.json rename to data/models/Corianas_Quokka_2.7b.json diff --git a/data/Corianas_llama-3-reactor.json b/data/models/Corianas_llama-3-reactor.json similarity index 100% rename from data/Corianas_llama-3-reactor.json rename to data/models/Corianas_llama-3-reactor.json diff --git a/data/CortexLM_btlm-7b-base-v0.2.json b/data/models/CortexLM_btlm-7b-base-v0.2.json similarity index 100% rename from data/CortexLM_btlm-7b-base-v0.2.json rename to data/models/CortexLM_btlm-7b-base-v0.2.json diff --git a/data/Cran-May_SCE-2-24B.json b/data/models/Cran-May_SCE-2-24B.json similarity index 100% rename from data/Cran-May_SCE-2-24B.json rename to data/models/Cran-May_SCE-2-24B.json diff --git a/data/Cran-May_SCE-3-24B.json b/data/models/Cran-May_SCE-3-24B.json similarity index 100% rename from data/Cran-May_SCE-3-24B.json rename to data/models/Cran-May_SCE-3-24B.json diff --git a/data/Cran-May_T.E-8.1.json b/data/models/Cran-May_T.E-8.1.json similarity index 100% rename from data/Cran-May_T.E-8.1.json rename to data/models/Cran-May_T.E-8.1.json diff --git a/data/Cran-May_merge_model_20250308_2.json b/data/models/Cran-May_merge_model_20250308_2.json similarity index 100% rename from data/Cran-May_merge_model_20250308_2.json rename to data/models/Cran-May_merge_model_20250308_2.json diff --git a/data/Cran-May_merge_model_20250308_3.json b/data/models/Cran-May_merge_model_20250308_3.json similarity index 100% rename from data/Cran-May_merge_model_20250308_3.json rename to data/models/Cran-May_merge_model_20250308_3.json diff --git a/data/Cran-May_merge_model_20250308_4.json b/data/models/Cran-May_merge_model_20250308_4.json similarity index 100% rename from data/Cran-May_merge_model_20250308_4.json rename to data/models/Cran-May_merge_model_20250308_4.json diff --git a/data/Cran-May_tempmotacilla-cinerea-0308.json b/data/models/Cran-May_tempmotacilla-cinerea-0308.json similarity index 100% rename from data/Cran-May_tempmotacilla-cinerea-0308.json rename to data/models/Cran-May_tempmotacilla-cinerea-0308.json diff --git a/data/CreitinGameplays_Llama-3.1-8B-R1-v0.1.json b/data/models/CreitinGameplays_Llama-3.1-8B-R1-v0.1.json similarity index 100% rename from data/CreitinGameplays_Llama-3.1-8B-R1-v0.1.json rename to data/models/CreitinGameplays_Llama-3.1-8B-R1-v0.1.json diff --git a/data/CultriX_Qwen2.5-14B-Broca.json b/data/models/CultriX_Qwen2.5-14B-Broca.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Broca.json rename to data/models/CultriX_Qwen2.5-14B-Broca.json diff --git a/data/CultriX_Qwen2.5-14B-BrocaV9.json b/data/models/CultriX_Qwen2.5-14B-BrocaV9.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-BrocaV9.json rename to data/models/CultriX_Qwen2.5-14B-BrocaV9.json diff --git a/data/CultriX_Qwen2.5-14B-Brocav3.json b/data/models/CultriX_Qwen2.5-14B-Brocav3.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Brocav3.json rename to data/models/CultriX_Qwen2.5-14B-Brocav3.json diff --git a/data/CultriX_Qwen2.5-14B-Brocav6.json b/data/models/CultriX_Qwen2.5-14B-Brocav6.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Brocav6.json rename to data/models/CultriX_Qwen2.5-14B-Brocav6.json diff --git a/data/CultriX_Qwen2.5-14B-Brocav7.json b/data/models/CultriX_Qwen2.5-14B-Brocav7.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Brocav7.json rename to data/models/CultriX_Qwen2.5-14B-Brocav7.json diff --git a/data/CultriX_Qwen2.5-14B-Emerged.json b/data/models/CultriX_Qwen2.5-14B-Emerged.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Emerged.json rename to data/models/CultriX_Qwen2.5-14B-Emerged.json diff --git a/data/CultriX_Qwen2.5-14B-Emergedv3.json b/data/models/CultriX_Qwen2.5-14B-Emergedv3.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Emergedv3.json rename to data/models/CultriX_Qwen2.5-14B-Emergedv3.json diff --git a/data/CultriX_Qwen2.5-14B-FinalMerge.json b/data/models/CultriX_Qwen2.5-14B-FinalMerge.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-FinalMerge.json rename to data/models/CultriX_Qwen2.5-14B-FinalMerge.json diff --git a/data/CultriX_Qwen2.5-14B-Hyper.json b/data/models/CultriX_Qwen2.5-14B-Hyper.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Hyper.json rename to data/models/CultriX_Qwen2.5-14B-Hyper.json diff --git a/data/CultriX_Qwen2.5-14B-HyperMarck-dl.json b/data/models/CultriX_Qwen2.5-14B-HyperMarck-dl.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-HyperMarck-dl.json rename to data/models/CultriX_Qwen2.5-14B-HyperMarck-dl.json diff --git a/data/CultriX_Qwen2.5-14B-Hyperionv3.json b/data/models/CultriX_Qwen2.5-14B-Hyperionv3.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Hyperionv3.json rename to data/models/CultriX_Qwen2.5-14B-Hyperionv3.json diff --git a/data/CultriX_Qwen2.5-14B-Hyperionv4.json b/data/models/CultriX_Qwen2.5-14B-Hyperionv4.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Hyperionv4.json rename to data/models/CultriX_Qwen2.5-14B-Hyperionv4.json diff --git a/data/CultriX_Qwen2.5-14B-Hyperionv5.json b/data/models/CultriX_Qwen2.5-14B-Hyperionv5.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Hyperionv5.json rename to data/models/CultriX_Qwen2.5-14B-Hyperionv5.json diff --git a/data/CultriX_Qwen2.5-14B-MegaMerge-pt2.json b/data/models/CultriX_Qwen2.5-14B-MegaMerge-pt2.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-MegaMerge-pt2.json rename to data/models/CultriX_Qwen2.5-14B-MegaMerge-pt2.json diff --git a/data/CultriX_Qwen2.5-14B-MergeStock.json b/data/models/CultriX_Qwen2.5-14B-MergeStock.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-MergeStock.json rename to data/models/CultriX_Qwen2.5-14B-MergeStock.json diff --git a/data/CultriX_Qwen2.5-14B-ReasoningMerge.json b/data/models/CultriX_Qwen2.5-14B-ReasoningMerge.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-ReasoningMerge.json rename to data/models/CultriX_Qwen2.5-14B-ReasoningMerge.json diff --git a/data/CultriX_Qwen2.5-14B-Ultimav2.json b/data/models/CultriX_Qwen2.5-14B-Ultimav2.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Ultimav2.json rename to data/models/CultriX_Qwen2.5-14B-Ultimav2.json diff --git a/data/CultriX_Qwen2.5-14B-Unity.json b/data/models/CultriX_Qwen2.5-14B-Unity.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Unity.json rename to data/models/CultriX_Qwen2.5-14B-Unity.json diff --git a/data/CultriX_Qwen2.5-14B-Wernicke-SFT.json b/data/models/CultriX_Qwen2.5-14B-Wernicke-SFT.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Wernicke-SFT.json rename to data/models/CultriX_Qwen2.5-14B-Wernicke-SFT.json diff --git a/data/CultriX_Qwen2.5-14B-Wernicke-SLERP.json b/data/models/CultriX_Qwen2.5-14B-Wernicke-SLERP.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Wernicke-SLERP.json rename to data/models/CultriX_Qwen2.5-14B-Wernicke-SLERP.json diff --git a/data/CultriX_Qwen2.5-14B-Wernicke.json b/data/models/CultriX_Qwen2.5-14B-Wernicke.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Wernicke.json rename to data/models/CultriX_Qwen2.5-14B-Wernicke.json diff --git a/data/CultriX_Qwen2.5-14B-Wernickev3.json b/data/models/CultriX_Qwen2.5-14B-Wernickev3.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-Wernickev3.json rename to data/models/CultriX_Qwen2.5-14B-Wernickev3.json diff --git a/data/CultriX_Qwen2.5-14B-partialmergept1.json b/data/models/CultriX_Qwen2.5-14B-partialmergept1.json similarity index 100% rename from data/CultriX_Qwen2.5-14B-partialmergept1.json rename to data/models/CultriX_Qwen2.5-14B-partialmergept1.json diff --git a/data/CultriX_Qwenfinity-2.5-14B.json b/data/models/CultriX_Qwenfinity-2.5-14B.json similarity index 100% rename from data/CultriX_Qwenfinity-2.5-14B.json rename to data/models/CultriX_Qwenfinity-2.5-14B.json diff --git a/data/CultriX_Qwestion-14B.json b/data/models/CultriX_Qwestion-14B.json similarity index 100% rename from data/CultriX_Qwestion-14B.json rename to data/models/CultriX_Qwestion-14B.json diff --git a/data/CultriX_SeQwence-14B-EvolMerge.json b/data/models/CultriX_SeQwence-14B-EvolMerge.json similarity index 100% rename from data/CultriX_SeQwence-14B-EvolMerge.json rename to data/models/CultriX_SeQwence-14B-EvolMerge.json diff --git a/data/CultriX_SeQwence-14B-EvolMergev1.json b/data/models/CultriX_SeQwence-14B-EvolMergev1.json similarity index 100% rename from data/CultriX_SeQwence-14B-EvolMergev1.json rename to data/models/CultriX_SeQwence-14B-EvolMergev1.json diff --git a/data/CultriX_SeQwence-14B-v5.json b/data/models/CultriX_SeQwence-14B-v5.json similarity index 100% rename from data/CultriX_SeQwence-14B-v5.json rename to data/models/CultriX_SeQwence-14B-v5.json diff --git a/data/CultriX_SeQwence-14B.json b/data/models/CultriX_SeQwence-14B.json similarity index 100% rename from data/CultriX_SeQwence-14B.json rename to data/models/CultriX_SeQwence-14B.json diff --git a/data/CultriX_SeQwence-14Bv1.json b/data/models/CultriX_SeQwence-14Bv1.json similarity index 100% rename from data/CultriX_SeQwence-14Bv1.json rename to data/models/CultriX_SeQwence-14Bv1.json diff --git a/data/CultriX_SeQwence-14Bv2.json b/data/models/CultriX_SeQwence-14Bv2.json similarity index 100% rename from data/CultriX_SeQwence-14Bv2.json rename to data/models/CultriX_SeQwence-14Bv2.json diff --git a/data/CultriX_SeQwence-14Bv3.json b/data/models/CultriX_SeQwence-14Bv3.json similarity index 100% rename from data/CultriX_SeQwence-14Bv3.json rename to data/models/CultriX_SeQwence-14Bv3.json diff --git a/data/DRXD1000_Atlas-7B.json b/data/models/DRXD1000_Atlas-7B.json similarity index 100% rename from data/DRXD1000_Atlas-7B.json rename to data/models/DRXD1000_Atlas-7B.json diff --git a/data/DRXD1000_Phoenix-7B.json b/data/models/DRXD1000_Phoenix-7B.json similarity index 100% rename from data/DRXD1000_Phoenix-7B.json rename to data/models/DRXD1000_Phoenix-7B.json diff --git a/data/DUAL-GPO_zephyr-7b-ipo-0k-15k-i1.json b/data/models/DUAL-GPO_zephyr-7b-ipo-0k-15k-i1.json similarity index 100% rename from data/DUAL-GPO_zephyr-7b-ipo-0k-15k-i1.json rename to data/models/DUAL-GPO_zephyr-7b-ipo-0k-15k-i1.json diff --git a/data/DZgas_GIGABATEMAN-7B.json b/data/models/DZgas_GIGABATEMAN-7B.json similarity index 100% rename from data/DZgas_GIGABATEMAN-7B.json rename to data/models/DZgas_GIGABATEMAN-7B.json diff --git a/data/Daemontatox_AetherDrake-SFT.json b/data/models/Daemontatox_AetherDrake-SFT.json similarity index 100% rename from data/Daemontatox_AetherDrake-SFT.json rename to data/models/Daemontatox_AetherDrake-SFT.json diff --git a/data/Daemontatox_AetherSett.json b/data/models/Daemontatox_AetherSett.json similarity index 100% rename from data/Daemontatox_AetherSett.json rename to data/models/Daemontatox_AetherSett.json diff --git a/data/Daemontatox_AetherTOT.json b/data/models/Daemontatox_AetherTOT.json similarity index 99% rename from data/Daemontatox_AetherTOT.json rename to data/models/Daemontatox_AetherTOT.json index 72bdb01863fd9cb8a56697def402922f2204f3a5..2ea373c2eebb0dfbd13abbe17934735c92168866 100644 --- a/data/Daemontatox_AetherTOT.json +++ b/data/models/Daemontatox_AetherTOT.json @@ -5,7 +5,7 @@ "developer": "Daemontatox", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "MllamaForConditionalGeneration", "params_billions": "10.67" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4383 + "score": 0.4398 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5034 + "score": 0.5066 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1443 + "score": 0.1488 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4052 + "score": 0.4079 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3778 + "score": 0.3804 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4398 + "score": 0.4383 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5066 + "score": 0.5034 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1488 + "score": 0.1443 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4079 + "score": 0.4052 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3804 + "score": 0.3778 } } ], diff --git a/data/Daemontatox_AetherUncensored.json b/data/models/Daemontatox_AetherUncensored.json similarity index 100% rename from data/Daemontatox_AetherUncensored.json rename to data/models/Daemontatox_AetherUncensored.json diff --git a/data/Daemontatox_Cogito-MIS.json b/data/models/Daemontatox_Cogito-MIS.json similarity index 100% rename from data/Daemontatox_Cogito-MIS.json rename to data/models/Daemontatox_Cogito-MIS.json diff --git a/data/Daemontatox_CogitoDistil.json b/data/models/Daemontatox_CogitoDistil.json similarity index 100% rename from data/Daemontatox_CogitoDistil.json rename to data/models/Daemontatox_CogitoDistil.json diff --git a/data/Daemontatox_CogitoZ.json b/data/models/Daemontatox_CogitoZ.json similarity index 100% rename from data/Daemontatox_CogitoZ.json rename to data/models/Daemontatox_CogitoZ.json diff --git a/data/Daemontatox_CogitoZ14.json b/data/models/Daemontatox_CogitoZ14.json similarity index 100% rename from data/Daemontatox_CogitoZ14.json rename to data/models/Daemontatox_CogitoZ14.json diff --git a/data/Daemontatox_DocumentCogito.json b/data/models/Daemontatox_DocumentCogito.json similarity index 99% rename from data/Daemontatox_DocumentCogito.json rename to data/models/Daemontatox_DocumentCogito.json index 795a538c995120394735a0a945454a8623a4236e..d171eba3a94f3af3d92f741f446b0ddb99e1f6b7 100644 --- a/data/Daemontatox_DocumentCogito.json +++ b/data/models/Daemontatox_DocumentCogito.json @@ -5,7 +5,7 @@ "developer": "Daemontatox", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "MllamaForConditionalGeneration", "params_billions": "10.67" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.777 + "score": 0.5064 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5187 + "score": 0.5112 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2198 + "score": 0.1631 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2936 + "score": 0.3163 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3911 + "score": 0.3973 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3738 + "score": 0.3802 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5064 + "score": 0.777 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5112 + "score": 0.5187 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1631 + "score": 0.2198 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3163 + "score": 0.2936 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3973 + "score": 0.3911 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3802 + "score": 0.3738 } } ], diff --git a/data/Daemontatox_Llama3.3-70B-CogniLink.json b/data/models/Daemontatox_Llama3.3-70B-CogniLink.json similarity index 100% rename from data/Daemontatox_Llama3.3-70B-CogniLink.json rename to data/models/Daemontatox_Llama3.3-70B-CogniLink.json diff --git a/data/Daemontatox_Llama_cot.json b/data/models/Daemontatox_Llama_cot.json similarity index 100% rename from data/Daemontatox_Llama_cot.json rename to data/models/Daemontatox_Llama_cot.json diff --git a/data/Daemontatox_MawaredT1.json b/data/models/Daemontatox_MawaredT1.json similarity index 100% rename from data/Daemontatox_MawaredT1.json rename to data/models/Daemontatox_MawaredT1.json diff --git a/data/Daemontatox_Mini_QwQ.json b/data/models/Daemontatox_Mini_QwQ.json similarity index 100% rename from data/Daemontatox_Mini_QwQ.json rename to data/models/Daemontatox_Mini_QwQ.json diff --git a/data/Daemontatox_NemoR.json b/data/models/Daemontatox_NemoR.json similarity index 100% rename from data/Daemontatox_NemoR.json rename to data/models/Daemontatox_NemoR.json diff --git a/data/Daemontatox_PathFinderAI2.0.json b/data/models/Daemontatox_PathFinderAI2.0.json similarity index 100% rename from data/Daemontatox_PathFinderAI2.0.json rename to data/models/Daemontatox_PathFinderAI2.0.json diff --git a/data/Daemontatox_PathFinderAi3.0.json b/data/models/Daemontatox_PathFinderAi3.0.json similarity index 100% rename from data/Daemontatox_PathFinderAi3.0.json rename to data/models/Daemontatox_PathFinderAi3.0.json diff --git a/data/Daemontatox_PathfinderAI.json b/data/models/Daemontatox_PathfinderAI.json similarity index 99% rename from data/Daemontatox_PathfinderAI.json rename to data/models/Daemontatox_PathfinderAI.json index e1f5c19d36675adf14e4da07a19941ae0f69be33..cea390cdba1611d28ac0ecfbae5b1930078d8645 100644 --- a/data/Daemontatox_PathfinderAI.json +++ b/data/models/Daemontatox_PathfinderAI.json @@ -5,7 +5,7 @@ "developer": "Daemontatox", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "Qwen2ForCausalLM", "params_billions": "32.764" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3745 + "score": 0.4855 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6668 + "score": 0.6627 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4758 + "score": 0.4841 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3943 + "score": 0.3096 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4858 + "score": 0.4256 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5593 + "score": 0.5542 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4855 + "score": 0.3745 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6627 + "score": 0.6668 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4841 + "score": 0.4758 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3096 + "score": 0.3943 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4256 + "score": 0.4858 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5542 + "score": 0.5593 } } ], diff --git a/data/Daemontatox_Phi-4-COT.json b/data/models/Daemontatox_Phi-4-COT.json similarity index 100% rename from data/Daemontatox_Phi-4-COT.json rename to data/models/Daemontatox_Phi-4-COT.json diff --git a/data/Daemontatox_PixelParse_AI.json b/data/models/Daemontatox_PixelParse_AI.json similarity index 100% rename from data/Daemontatox_PixelParse_AI.json rename to data/models/Daemontatox_PixelParse_AI.json diff --git a/data/Daemontatox_RA2.0.json b/data/models/Daemontatox_RA2.0.json similarity index 100% rename from data/Daemontatox_RA2.0.json rename to data/models/Daemontatox_RA2.0.json diff --git a/data/Daemontatox_RA_Reasoner.json b/data/models/Daemontatox_RA_Reasoner.json similarity index 100% rename from data/Daemontatox_RA_Reasoner.json rename to data/models/Daemontatox_RA_Reasoner.json diff --git a/data/Daemontatox_RA_Reasoner2.0.json b/data/models/Daemontatox_RA_Reasoner2.0.json similarity index 100% rename from data/Daemontatox_RA_Reasoner2.0.json rename to data/models/Daemontatox_RA_Reasoner2.0.json diff --git a/data/Daemontatox_ReasonTest.json b/data/models/Daemontatox_ReasonTest.json similarity index 100% rename from data/Daemontatox_ReasonTest.json rename to data/models/Daemontatox_ReasonTest.json diff --git a/data/Daemontatox_Research_PathfinderAI.json b/data/models/Daemontatox_Research_PathfinderAI.json similarity index 100% rename from data/Daemontatox_Research_PathfinderAI.json rename to data/models/Daemontatox_Research_PathfinderAI.json diff --git a/data/Daemontatox_SphinX.json b/data/models/Daemontatox_SphinX.json similarity index 100% rename from data/Daemontatox_SphinX.json rename to data/models/Daemontatox_SphinX.json diff --git a/data/Daemontatox_Sphinx2.0.json b/data/models/Daemontatox_Sphinx2.0.json similarity index 100% rename from data/Daemontatox_Sphinx2.0.json rename to data/models/Daemontatox_Sphinx2.0.json diff --git a/data/Daemontatox_TinySphinx.json b/data/models/Daemontatox_TinySphinx.json similarity index 100% rename from data/Daemontatox_TinySphinx.json rename to data/models/Daemontatox_TinySphinx.json diff --git a/data/Daemontatox_TinySphinx2.0.json b/data/models/Daemontatox_TinySphinx2.0.json similarity index 100% rename from data/Daemontatox_TinySphinx2.0.json rename to data/models/Daemontatox_TinySphinx2.0.json diff --git a/data/Daemontatox_Zirel-7B-Math.json b/data/models/Daemontatox_Zirel-7B-Math.json similarity index 100% rename from data/Daemontatox_Zirel-7B-Math.json rename to data/models/Daemontatox_Zirel-7B-Math.json diff --git a/data/Daemontatox_Zirel_1.5.json b/data/models/Daemontatox_Zirel_1.5.json similarity index 100% rename from data/Daemontatox_Zirel_1.5.json rename to data/models/Daemontatox_Zirel_1.5.json diff --git a/data/Daemontatox_mini-Cogito-R1.json b/data/models/Daemontatox_mini-Cogito-R1.json similarity index 100% rename from data/Daemontatox_mini-Cogito-R1.json rename to data/models/Daemontatox_mini-Cogito-R1.json diff --git a/data/Daemontatox_mini_Pathfinder.json b/data/models/Daemontatox_mini_Pathfinder.json similarity index 100% rename from data/Daemontatox_mini_Pathfinder.json rename to data/models/Daemontatox_mini_Pathfinder.json diff --git a/data/Dampfinchen_Llama-3.1-8B-Ultra-Instruct.json b/data/models/Dampfinchen_Llama-3.1-8B-Ultra-Instruct.json similarity index 100% rename from data/Dampfinchen_Llama-3.1-8B-Ultra-Instruct.json rename to data/models/Dampfinchen_Llama-3.1-8B-Ultra-Instruct.json diff --git a/data/Danielbrdz_Barcenas-10b.json b/data/models/Danielbrdz_Barcenas-10b.json similarity index 100% rename from data/Danielbrdz_Barcenas-10b.json rename to data/models/Danielbrdz_Barcenas-10b.json diff --git a/data/Danielbrdz_Barcenas-14b-Phi-3-medium-ORPO.json b/data/models/Danielbrdz_Barcenas-14b-Phi-3-medium-ORPO.json similarity index 100% rename from data/Danielbrdz_Barcenas-14b-Phi-3-medium-ORPO.json rename to data/models/Danielbrdz_Barcenas-14b-Phi-3-medium-ORPO.json diff --git a/data/Danielbrdz_Barcenas-14b-phi-4-v2.json b/data/models/Danielbrdz_Barcenas-14b-phi-4-v2.json similarity index 100% rename from data/Danielbrdz_Barcenas-14b-phi-4-v2.json rename to data/models/Danielbrdz_Barcenas-14b-phi-4-v2.json diff --git a/data/Danielbrdz_Barcenas-14b-phi-4.json b/data/models/Danielbrdz_Barcenas-14b-phi-4.json similarity index 100% rename from data/Danielbrdz_Barcenas-14b-phi-4.json rename to data/models/Danielbrdz_Barcenas-14b-phi-4.json diff --git a/data/Danielbrdz_Barcenas-3b-GRPO.json b/data/models/Danielbrdz_Barcenas-3b-GRPO.json similarity index 100% rename from data/Danielbrdz_Barcenas-3b-GRPO.json rename to data/models/Danielbrdz_Barcenas-3b-GRPO.json diff --git a/data/Danielbrdz_Barcenas-Llama3-8b-ORPO.json b/data/models/Danielbrdz_Barcenas-Llama3-8b-ORPO.json similarity index 100% rename from data/Danielbrdz_Barcenas-Llama3-8b-ORPO.json rename to data/models/Danielbrdz_Barcenas-Llama3-8b-ORPO.json diff --git a/data/Danielbrdz_Barcenas-R1-Qwen-1.5b.json b/data/models/Danielbrdz_Barcenas-R1-Qwen-1.5b.json similarity index 100% rename from data/Danielbrdz_Barcenas-R1-Qwen-1.5b.json rename to data/models/Danielbrdz_Barcenas-R1-Qwen-1.5b.json diff --git a/data/Dans-DiscountModels_12b-mn-dans-reasoning-test-2.json b/data/models/Dans-DiscountModels_12b-mn-dans-reasoning-test-2.json similarity index 100% rename from data/Dans-DiscountModels_12b-mn-dans-reasoning-test-2.json rename to data/models/Dans-DiscountModels_12b-mn-dans-reasoning-test-2.json diff --git a/data/Dans-DiscountModels_12b-mn-dans-reasoning-test-3.json b/data/models/Dans-DiscountModels_12b-mn-dans-reasoning-test-3.json similarity index 100% rename from data/Dans-DiscountModels_12b-mn-dans-reasoning-test-3.json rename to data/models/Dans-DiscountModels_12b-mn-dans-reasoning-test-3.json diff --git a/data/Dans-DiscountModels_Dans-Instruct-CoreCurriculum-12b-ChatML.json b/data/models/Dans-DiscountModels_Dans-Instruct-CoreCurriculum-12b-ChatML.json similarity index 100% rename from data/Dans-DiscountModels_Dans-Instruct-CoreCurriculum-12b-ChatML.json rename to data/models/Dans-DiscountModels_Dans-Instruct-CoreCurriculum-12b-ChatML.json diff --git a/data/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.1.0.json b/data/models/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.1.0.json similarity index 100% rename from data/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.1.0.json rename to data/models/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.1.0.json diff --git a/data/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.1.1.json b/data/models/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.1.1.json similarity index 100% rename from data/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.1.1.json rename to data/models/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.1.1.json diff --git a/data/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.2.0.json b/data/models/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.2.0.json similarity index 100% rename from data/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.2.0.json rename to data/models/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.2.0.json diff --git a/data/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML.json b/data/models/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML.json similarity index 100% rename from data/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML.json rename to data/models/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML.json diff --git a/data/Dans-DiscountModels_Mistral-7b-v0.3-Test-E0.7.json b/data/models/Dans-DiscountModels_Mistral-7b-v0.3-Test-E0.7.json similarity index 100% rename from data/Dans-DiscountModels_Mistral-7b-v0.3-Test-E0.7.json rename to data/models/Dans-DiscountModels_Mistral-7b-v0.3-Test-E0.7.json diff --git a/data/Dans-DiscountModels_mistral-7b-test-merged.json b/data/models/Dans-DiscountModels_mistral-7b-test-merged.json similarity index 100% rename from data/Dans-DiscountModels_mistral-7b-test-merged.json rename to data/models/Dans-DiscountModels_mistral-7b-test-merged.json diff --git a/data/Darkknight535_OpenCrystal-12B-L3.json b/data/models/Darkknight535_OpenCrystal-12B-L3.json similarity index 100% rename from data/Darkknight535_OpenCrystal-12B-L3.json rename to data/models/Darkknight535_OpenCrystal-12B-L3.json diff --git a/data/Databricks-Mosaic-Research_PGRM.json b/data/models/Databricks-Mosaic-Research_PGRM.json similarity index 100% rename from data/Databricks-Mosaic-Research_PGRM.json rename to data/models/Databricks-Mosaic-Research_PGRM.json diff --git a/data/DavidAU_DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm.json b/data/models/DavidAU_DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm.json similarity index 100% rename from data/DavidAU_DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm.json rename to data/models/DavidAU_DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm.json diff --git a/data/DavidAU_DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B.json b/data/models/DavidAU_DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B.json similarity index 100% rename from data/DavidAU_DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B.json rename to data/models/DavidAU_DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B.json diff --git a/data/DavidAU_DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B.json b/data/models/DavidAU_DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B.json similarity index 100% rename from data/DavidAU_DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B.json rename to data/models/DavidAU_DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B.json diff --git a/data/DavidAU_DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B.json b/data/models/DavidAU_DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B.json similarity index 100% rename from data/DavidAU_DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B.json rename to data/models/DavidAU_DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B.json diff --git a/data/DavidAU_DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B.json b/data/models/DavidAU_DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B.json similarity index 100% rename from data/DavidAU_DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B.json rename to data/models/DavidAU_DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B.json diff --git a/data/DavidAU_DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm.json b/data/models/DavidAU_DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm.json similarity index 100% rename from data/DavidAU_DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm.json rename to data/models/DavidAU_DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm.json diff --git a/data/DavidAU_DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B.json b/data/models/DavidAU_DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B.json similarity index 100% rename from data/DavidAU_DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B.json rename to data/models/DavidAU_DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B.json diff --git a/data/DavidAU_DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B.json b/data/models/DavidAU_DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B.json similarity index 100% rename from data/DavidAU_DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B.json rename to data/models/DavidAU_DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B.json diff --git a/data/DavidAU_Gemma-The-Writer-9B.json b/data/models/DavidAU_Gemma-The-Writer-9B.json similarity index 100% rename from data/DavidAU_Gemma-The-Writer-9B.json rename to data/models/DavidAU_Gemma-The-Writer-9B.json diff --git a/data/DavidAU_Gemma-The-Writer-DEADLINE-10B.json b/data/models/DavidAU_Gemma-The-Writer-DEADLINE-10B.json similarity index 100% rename from data/DavidAU_Gemma-The-Writer-DEADLINE-10B.json rename to data/models/DavidAU_Gemma-The-Writer-DEADLINE-10B.json diff --git a/data/DavidAU_Gemma-The-Writer-J.GutenBerg-10B.json b/data/models/DavidAU_Gemma-The-Writer-J.GutenBerg-10B.json similarity index 100% rename from data/DavidAU_Gemma-The-Writer-J.GutenBerg-10B.json rename to data/models/DavidAU_Gemma-The-Writer-J.GutenBerg-10B.json diff --git a/data/DavidAU_Gemma-The-Writer-Mighty-Sword-9B.json b/data/models/DavidAU_Gemma-The-Writer-Mighty-Sword-9B.json similarity index 100% rename from data/DavidAU_Gemma-The-Writer-Mighty-Sword-9B.json rename to data/models/DavidAU_Gemma-The-Writer-Mighty-Sword-9B.json diff --git a/data/DavidAU_Gemma-The-Writer-N-Restless-Quill-10B-Uncensored.json b/data/models/DavidAU_Gemma-The-Writer-N-Restless-Quill-10B-Uncensored.json similarity index 100% rename from data/DavidAU_Gemma-The-Writer-N-Restless-Quill-10B-Uncensored.json rename to data/models/DavidAU_Gemma-The-Writer-N-Restless-Quill-10B-Uncensored.json diff --git a/data/DavidAU_L3-DARKEST-PLANET-16.5B.json b/data/models/DavidAU_L3-DARKEST-PLANET-16.5B.json similarity index 100% rename from data/DavidAU_L3-DARKEST-PLANET-16.5B.json rename to data/models/DavidAU_L3-DARKEST-PLANET-16.5B.json diff --git a/data/DavidAU_L3-Dark-Planet-8B.json b/data/models/DavidAU_L3-Dark-Planet-8B.json similarity index 100% rename from data/DavidAU_L3-Dark-Planet-8B.json rename to data/models/DavidAU_L3-Dark-Planet-8B.json diff --git a/data/DavidAU_L3-Jamet-12.2B-MK.V-Blackroot-Instruct.json b/data/models/DavidAU_L3-Jamet-12.2B-MK.V-Blackroot-Instruct.json similarity index 100% rename from data/DavidAU_L3-Jamet-12.2B-MK.V-Blackroot-Instruct.json rename to data/models/DavidAU_L3-Jamet-12.2B-MK.V-Blackroot-Instruct.json diff --git a/data/DavidAU_L3-Lumimaid-12.2B-v0.1-OAS-Instruct.json b/data/models/DavidAU_L3-Lumimaid-12.2B-v0.1-OAS-Instruct.json similarity index 100% rename from data/DavidAU_L3-Lumimaid-12.2B-v0.1-OAS-Instruct.json rename to data/models/DavidAU_L3-Lumimaid-12.2B-v0.1-OAS-Instruct.json diff --git a/data/DavidAU_L3-SMB-Instruct-12.2B-F32.json b/data/models/DavidAU_L3-SMB-Instruct-12.2B-F32.json similarity index 100% rename from data/DavidAU_L3-SMB-Instruct-12.2B-F32.json rename to data/models/DavidAU_L3-SMB-Instruct-12.2B-F32.json diff --git a/data/DavidAU_L3-Stheno-Maid-Blackroot-Grand-HORROR-16B.json b/data/models/DavidAU_L3-Stheno-Maid-Blackroot-Grand-HORROR-16B.json similarity index 100% rename from data/DavidAU_L3-Stheno-Maid-Blackroot-Grand-HORROR-16B.json rename to data/models/DavidAU_L3-Stheno-Maid-Blackroot-Grand-HORROR-16B.json diff --git a/data/DavidAU_L3-Stheno-v3.2-12.2B-Instruct.json b/data/models/DavidAU_L3-Stheno-v3.2-12.2B-Instruct.json similarity index 100% rename from data/DavidAU_L3-Stheno-v3.2-12.2B-Instruct.json rename to data/models/DavidAU_L3-Stheno-v3.2-12.2B-Instruct.json diff --git a/data/DavidAU_L3.1-Dark-Planet-SpinFire-Uncensored-8B.json b/data/models/DavidAU_L3.1-Dark-Planet-SpinFire-Uncensored-8B.json similarity index 100% rename from data/DavidAU_L3.1-Dark-Planet-SpinFire-Uncensored-8B.json rename to data/models/DavidAU_L3.1-Dark-Planet-SpinFire-Uncensored-8B.json diff --git a/data/DavidAU_L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B.json b/data/models/DavidAU_L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B.json similarity index 100% rename from data/DavidAU_L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B.json rename to data/models/DavidAU_L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B.json diff --git a/data/DavidAU_Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B.json b/data/models/DavidAU_Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B.json similarity index 100% rename from data/DavidAU_Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B.json rename to data/models/DavidAU_Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B.json diff --git a/data/DavidAU_Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B.json b/data/models/DavidAU_Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B.json similarity index 100% rename from data/DavidAU_Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B.json rename to data/models/DavidAU_Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B.json diff --git a/data/DavidAU_Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32.json b/data/models/DavidAU_Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32.json similarity index 100% rename from data/DavidAU_Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32.json rename to data/models/DavidAU_Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32.json diff --git a/data/Davidsv_SUONG-1.json b/data/models/Davidsv_SUONG-1.json similarity index 100% rename from data/Davidsv_SUONG-1.json rename to data/models/Davidsv_SUONG-1.json diff --git a/data/DavieLion_Llama-3.2-1B-SPIN-iter0.json b/data/models/DavieLion_Llama-3.2-1B-SPIN-iter0.json similarity index 99% rename from data/DavieLion_Llama-3.2-1B-SPIN-iter0.json rename to data/models/DavieLion_Llama-3.2-1B-SPIN-iter0.json index 34acafe2182ed5fe43de3f09e1727813e659e101..de774747911df630678aaeeb62f20bc722910dbb 100644 --- a/data/DavieLion_Llama-3.2-1B-SPIN-iter0.json +++ b/data/models/DavieLion_Llama-3.2-1B-SPIN-iter0.json @@ -5,7 +5,7 @@ "developer": "DavieLion", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "LlamaForCausalLM", "params_billions": "1.236" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1507 + "score": 0.1549 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.293 + "score": 0.2937 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.006 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2534 + "score": 0.2576 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1125 + "score": 0.1128 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1549 + "score": 0.1507 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2937 + "score": 0.293 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.006 + "score": 0.0 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2576 + "score": 0.2534 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1128 + "score": 0.1125 } } ], diff --git a/data/DavieLion_Llama-3.2-1B-SPIN-iter1.json b/data/models/DavieLion_Llama-3.2-1B-SPIN-iter1.json similarity index 100% rename from data/DavieLion_Llama-3.2-1B-SPIN-iter1.json rename to data/models/DavieLion_Llama-3.2-1B-SPIN-iter1.json diff --git a/data/DavieLion_Llama-3.2-1B-SPIN-iter2.json b/data/models/DavieLion_Llama-3.2-1B-SPIN-iter2.json similarity index 100% rename from data/DavieLion_Llama-3.2-1B-SPIN-iter2.json rename to data/models/DavieLion_Llama-3.2-1B-SPIN-iter2.json diff --git a/data/DavieLion_Llama-3.2-1B-SPIN-iter3.json b/data/models/DavieLion_Llama-3.2-1B-SPIN-iter3.json similarity index 99% rename from data/DavieLion_Llama-3.2-1B-SPIN-iter3.json rename to data/models/DavieLion_Llama-3.2-1B-SPIN-iter3.json index e1850c4ebd33b21607f634255f01f396dc8af368..d80b33f87e66136a611292d52965e3080ddfdf08 100644 --- a/data/DavieLion_Llama-3.2-1B-SPIN-iter3.json +++ b/data/models/DavieLion_Llama-3.2-1B-SPIN-iter3.json @@ -5,7 +5,7 @@ "developer": "DavieLion", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "LlamaForCausalLM", "params_billions": "1.236" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1336 + "score": 0.1324 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2975 + "score": 0.2972 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0068 + "score": 0.0 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2534 + "score": 0.2643 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.35 + "score": 0.3527 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1128 + "score": 0.1129 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1324 + "score": 0.1336 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2972 + "score": 0.2975 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.0068 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2643 + "score": 0.2534 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3527 + "score": 0.35 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1129 + "score": 0.1128 } } ], diff --git a/data/DavieLion_Lllma-3.2-1B.json b/data/models/DavieLion_Lllma-3.2-1B.json similarity index 100% rename from data/DavieLion_Lllma-3.2-1B.json rename to data/models/DavieLion_Lllma-3.2-1B.json diff --git a/data/DebateLabKIT_Llama-3.1-Argunaut-1-8B-SFT.json b/data/models/DebateLabKIT_Llama-3.1-Argunaut-1-8B-SFT.json similarity index 100% rename from data/DebateLabKIT_Llama-3.1-Argunaut-1-8B-SFT.json rename to data/models/DebateLabKIT_Llama-3.1-Argunaut-1-8B-SFT.json diff --git a/data/Deci_DeciLM-7B-instruct.json b/data/models/Deci_DeciLM-7B-instruct.json similarity index 100% rename from data/Deci_DeciLM-7B-instruct.json rename to data/models/Deci_DeciLM-7B-instruct.json diff --git a/data/Deci_DeciLM-7B.json b/data/models/Deci_DeciLM-7B.json similarity index 100% rename from data/Deci_DeciLM-7B.json rename to data/models/Deci_DeciLM-7B.json diff --git a/data/DeepAutoAI_Explore_Llama-3.1-8B-Inst.json b/data/models/DeepAutoAI_Explore_Llama-3.1-8B-Inst.json similarity index 100% rename from data/DeepAutoAI_Explore_Llama-3.1-8B-Inst.json rename to data/models/DeepAutoAI_Explore_Llama-3.1-8B-Inst.json diff --git a/data/DeepAutoAI_Explore_Llama-3.2-1B-Inst.json b/data/models/DeepAutoAI_Explore_Llama-3.2-1B-Inst.json similarity index 100% rename from data/DeepAutoAI_Explore_Llama-3.2-1B-Inst.json rename to data/models/DeepAutoAI_Explore_Llama-3.2-1B-Inst.json diff --git a/data/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v0.json b/data/models/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v0.json similarity index 100% rename from data/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v0.json rename to data/models/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v0.json diff --git a/data/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v1.1.json b/data/models/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v1.1.json similarity index 100% rename from data/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v1.1.json rename to data/models/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v1.1.json diff --git a/data/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v1.json b/data/models/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v1.json similarity index 100% rename from data/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v1.json rename to data/models/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v1.json diff --git a/data/DeepAutoAI_causal_gpt2.json b/data/models/DeepAutoAI_causal_gpt2.json similarity index 100% rename from data/DeepAutoAI_causal_gpt2.json rename to data/models/DeepAutoAI_causal_gpt2.json diff --git a/data/DeepAutoAI_d2nwg_Llama-3.1-8B-Instruct-v0.0.json b/data/models/DeepAutoAI_d2nwg_Llama-3.1-8B-Instruct-v0.0.json similarity index 100% rename from data/DeepAutoAI_d2nwg_Llama-3.1-8B-Instruct-v0.0.json rename to data/models/DeepAutoAI_d2nwg_Llama-3.1-8B-Instruct-v0.0.json diff --git a/data/DeepAutoAI_d2nwg_causal_gpt2.json b/data/models/DeepAutoAI_d2nwg_causal_gpt2.json similarity index 100% rename from data/DeepAutoAI_d2nwg_causal_gpt2.json rename to data/models/DeepAutoAI_d2nwg_causal_gpt2.json diff --git a/data/DeepAutoAI_d2nwg_causal_gpt2_v1.json b/data/models/DeepAutoAI_d2nwg_causal_gpt2_v1.json similarity index 100% rename from data/DeepAutoAI_d2nwg_causal_gpt2_v1.json rename to data/models/DeepAutoAI_d2nwg_causal_gpt2_v1.json diff --git a/data/DeepAutoAI_ldm_soup_Llama-3.1-8B-Inst.json b/data/models/DeepAutoAI_ldm_soup_Llama-3.1-8B-Inst.json similarity index 100% rename from data/DeepAutoAI_ldm_soup_Llama-3.1-8B-Inst.json rename to data/models/DeepAutoAI_ldm_soup_Llama-3.1-8B-Inst.json diff --git a/data/DeepAutoAI_ldm_soup_Llama-3.1-8B-Instruct-v0.0.json b/data/models/DeepAutoAI_ldm_soup_Llama-3.1-8B-Instruct-v0.0.json similarity index 100% rename from data/DeepAutoAI_ldm_soup_Llama-3.1-8B-Instruct-v0.0.json rename to data/models/DeepAutoAI_ldm_soup_Llama-3.1-8B-Instruct-v0.0.json diff --git a/data/DeepAutoAI_ldm_soup_Llama-3.1-8B-Instruct-v0.1.json b/data/models/DeepAutoAI_ldm_soup_Llama-3.1-8B-Instruct-v0.1.json similarity index 100% rename from data/DeepAutoAI_ldm_soup_Llama-3.1-8B-Instruct-v0.1.json rename to data/models/DeepAutoAI_ldm_soup_Llama-3.1-8B-Instruct-v0.1.json diff --git a/data/DeepMount00_Lexora-Lite-3B.json b/data/models/DeepMount00_Lexora-Lite-3B.json similarity index 100% rename from data/DeepMount00_Lexora-Lite-3B.json rename to data/models/DeepMount00_Lexora-Lite-3B.json diff --git a/data/DeepMount00_Lexora-Lite-3B_v2.json b/data/models/DeepMount00_Lexora-Lite-3B_v2.json similarity index 100% rename from data/DeepMount00_Lexora-Lite-3B_v2.json rename to data/models/DeepMount00_Lexora-Lite-3B_v2.json diff --git a/data/DeepMount00_Lexora-Medium-7B.json b/data/models/DeepMount00_Lexora-Medium-7B.json similarity index 100% rename from data/DeepMount00_Lexora-Medium-7B.json rename to data/models/DeepMount00_Lexora-Medium-7B.json diff --git a/data/DeepMount00_Llama-3-8b-Ita.json b/data/models/DeepMount00_Llama-3-8b-Ita.json similarity index 100% rename from data/DeepMount00_Llama-3-8b-Ita.json rename to data/models/DeepMount00_Llama-3-8b-Ita.json diff --git a/data/models/DeepMount00_Llama-3.1-8b-ITA.json b/data/models/DeepMount00_Llama-3.1-8b-ITA.json new file mode 100644 index 0000000000000000000000000000000000000000..8fed22ea9dabe408beacb24efe9bc1cfb9d46008 --- /dev/null +++ b/data/models/DeepMount00_Llama-3.1-8b-ITA.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "Llama-3.1-8b-ITA", + "id": "DeepMount00/Llama-3.1-8b-ITA", + "developer": "DeepMount00", + "inference_platform": "unknown", + "additional_details": { + "precision": "bfloat16", + "architecture": "LlamaForCausalLM", + "params_billions": "8.03" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-8b-ITA/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7917 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5109 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.1088 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2878 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4136 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3876 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/DeepMount00_Llama-3.1-8b-Ita.json b/data/models/DeepMount00_Llama-3.1-8b-Ita.json similarity index 100% rename from data/DeepMount00_Llama-3.1-8b-Ita.json rename to data/models/DeepMount00_Llama-3.1-8b-Ita.json diff --git a/data/DeepMount00_Llama-3.1-Distilled.json b/data/models/DeepMount00_Llama-3.1-Distilled.json similarity index 100% rename from data/DeepMount00_Llama-3.1-Distilled.json rename to data/models/DeepMount00_Llama-3.1-Distilled.json diff --git a/data/DeepMount00_Qwen2-1.5B-Ita.json b/data/models/DeepMount00_Qwen2-1.5B-Ita.json similarity index 100% rename from data/DeepMount00_Qwen2-1.5B-Ita.json rename to data/models/DeepMount00_Qwen2-1.5B-Ita.json diff --git a/data/DeepMount00_Qwen2-1.5B-Ita_v2.json b/data/models/DeepMount00_Qwen2-1.5B-Ita_v2.json similarity index 100% rename from data/DeepMount00_Qwen2-1.5B-Ita_v2.json rename to data/models/DeepMount00_Qwen2-1.5B-Ita_v2.json diff --git a/data/DeepMount00_Qwen2-1.5B-Ita_v3.json b/data/models/DeepMount00_Qwen2-1.5B-Ita_v3.json similarity index 100% rename from data/DeepMount00_Qwen2-1.5B-Ita_v3.json rename to data/models/DeepMount00_Qwen2-1.5B-Ita_v3.json diff --git a/data/DeepMount00_Qwen2-1.5B-Ita_v5.json b/data/models/DeepMount00_Qwen2-1.5B-Ita_v5.json similarity index 100% rename from data/DeepMount00_Qwen2-1.5B-Ita_v5.json rename to data/models/DeepMount00_Qwen2-1.5B-Ita_v5.json diff --git a/data/DeepMount00_Qwen2-1.5B-Ita_v6.json b/data/models/DeepMount00_Qwen2-1.5B-Ita_v6.json similarity index 100% rename from data/DeepMount00_Qwen2-1.5B-Ita_v6.json rename to data/models/DeepMount00_Qwen2-1.5B-Ita_v6.json diff --git a/data/DeepMount00_Qwen2.5-7B-Instruct-MathCoder.json b/data/models/DeepMount00_Qwen2.5-7B-Instruct-MathCoder.json similarity index 100% rename from data/DeepMount00_Qwen2.5-7B-Instruct-MathCoder.json rename to data/models/DeepMount00_Qwen2.5-7B-Instruct-MathCoder.json diff --git a/data/DeepMount00_mergekit-ties-okvgjfz.json b/data/models/DeepMount00_mergekit-ties-okvgjfz.json similarity index 100% rename from data/DeepMount00_mergekit-ties-okvgjfz.json rename to data/models/DeepMount00_mergekit-ties-okvgjfz.json diff --git a/data/Delta-Vector_Baldur-8B.json b/data/models/Delta-Vector_Baldur-8B.json similarity index 100% rename from data/Delta-Vector_Baldur-8B.json rename to data/models/Delta-Vector_Baldur-8B.json diff --git a/data/Delta-Vector_Control-8B-V1.1.json b/data/models/Delta-Vector_Control-8B-V1.1.json similarity index 100% rename from data/Delta-Vector_Control-8B-V1.1.json rename to data/models/Delta-Vector_Control-8B-V1.1.json diff --git a/data/Delta-Vector_Control-8B.json b/data/models/Delta-Vector_Control-8B.json similarity index 100% rename from data/Delta-Vector_Control-8B.json rename to data/models/Delta-Vector_Control-8B.json diff --git a/data/Delta-Vector_Darkens-8B.json b/data/models/Delta-Vector_Darkens-8B.json similarity index 100% rename from data/Delta-Vector_Darkens-8B.json rename to data/models/Delta-Vector_Darkens-8B.json diff --git a/data/Delta-Vector_Henbane-7b-attempt2.json b/data/models/Delta-Vector_Henbane-7b-attempt2.json similarity index 100% rename from data/Delta-Vector_Henbane-7b-attempt2.json rename to data/models/Delta-Vector_Henbane-7b-attempt2.json diff --git a/data/Delta-Vector_Odin-9B.json b/data/models/Delta-Vector_Odin-9B.json similarity index 100% rename from data/Delta-Vector_Odin-9B.json rename to data/models/Delta-Vector_Odin-9B.json diff --git a/data/Delta-Vector_Tor-8B.json b/data/models/Delta-Vector_Tor-8B.json similarity index 100% rename from data/Delta-Vector_Tor-8B.json rename to data/models/Delta-Vector_Tor-8B.json diff --git a/data/DevQuasar_DevQuasar-R1-Uncensored-Llama-8B.json b/data/models/DevQuasar_DevQuasar-R1-Uncensored-Llama-8B.json similarity index 100% rename from data/DevQuasar_DevQuasar-R1-Uncensored-Llama-8B.json rename to data/models/DevQuasar_DevQuasar-R1-Uncensored-Llama-8B.json diff --git a/data/Dongwei_DeepSeek-R1-Distill-Qwen-7B-GRPO.json b/data/models/Dongwei_DeepSeek-R1-Distill-Qwen-7B-GRPO.json similarity index 100% rename from data/Dongwei_DeepSeek-R1-Distill-Qwen-7B-GRPO.json rename to data/models/Dongwei_DeepSeek-R1-Distill-Qwen-7B-GRPO.json diff --git a/data/DoppelReflEx_L3-8B-R1-WolfCore-V1.5-test.json b/data/models/DoppelReflEx_L3-8B-R1-WolfCore-V1.5-test.json similarity index 100% rename from data/DoppelReflEx_L3-8B-R1-WolfCore-V1.5-test.json rename to data/models/DoppelReflEx_L3-8B-R1-WolfCore-V1.5-test.json diff --git a/data/DoppelReflEx_L3-8B-R1-WolfCore.json b/data/models/DoppelReflEx_L3-8B-R1-WolfCore.json similarity index 100% rename from data/DoppelReflEx_L3-8B-R1-WolfCore.json rename to data/models/DoppelReflEx_L3-8B-R1-WolfCore.json diff --git a/data/DoppelReflEx_L3-8B-WolfCore.json b/data/models/DoppelReflEx_L3-8B-WolfCore.json similarity index 100% rename from data/DoppelReflEx_L3-8B-WolfCore.json rename to data/models/DoppelReflEx_L3-8B-WolfCore.json diff --git a/data/DoppelReflEx_MN-12B-FoxFrame-test.json b/data/models/DoppelReflEx_MN-12B-FoxFrame-test.json similarity index 100% rename from data/DoppelReflEx_MN-12B-FoxFrame-test.json rename to data/models/DoppelReflEx_MN-12B-FoxFrame-test.json diff --git a/data/DoppelReflEx_MN-12B-FoxFrame2-test.json b/data/models/DoppelReflEx_MN-12B-FoxFrame2-test.json similarity index 100% rename from data/DoppelReflEx_MN-12B-FoxFrame2-test.json rename to data/models/DoppelReflEx_MN-12B-FoxFrame2-test.json diff --git a/data/DoppelReflEx_MN-12B-FoxFrame3-test.json b/data/models/DoppelReflEx_MN-12B-FoxFrame3-test.json similarity index 100% rename from data/DoppelReflEx_MN-12B-FoxFrame3-test.json rename to data/models/DoppelReflEx_MN-12B-FoxFrame3-test.json diff --git a/data/DoppelReflEx_MN-12B-Kakigori.json b/data/models/DoppelReflEx_MN-12B-Kakigori.json similarity index 100% rename from data/DoppelReflEx_MN-12B-Kakigori.json rename to data/models/DoppelReflEx_MN-12B-Kakigori.json diff --git a/data/DoppelReflEx_MN-12B-LilithFrame-Experiment-2.json b/data/models/DoppelReflEx_MN-12B-LilithFrame-Experiment-2.json similarity index 100% rename from data/DoppelReflEx_MN-12B-LilithFrame-Experiment-2.json rename to data/models/DoppelReflEx_MN-12B-LilithFrame-Experiment-2.json diff --git a/data/DoppelReflEx_MN-12B-LilithFrame-Experiment-3.json b/data/models/DoppelReflEx_MN-12B-LilithFrame-Experiment-3.json similarity index 100% rename from data/DoppelReflEx_MN-12B-LilithFrame-Experiment-3.json rename to data/models/DoppelReflEx_MN-12B-LilithFrame-Experiment-3.json diff --git a/data/DoppelReflEx_MN-12B-LilithFrame-Experiment-4.json b/data/models/DoppelReflEx_MN-12B-LilithFrame-Experiment-4.json similarity index 100% rename from data/DoppelReflEx_MN-12B-LilithFrame-Experiment-4.json rename to data/models/DoppelReflEx_MN-12B-LilithFrame-Experiment-4.json diff --git a/data/DoppelReflEx_MN-12B-LilithFrame.json b/data/models/DoppelReflEx_MN-12B-LilithFrame.json similarity index 99% rename from data/DoppelReflEx_MN-12B-LilithFrame.json rename to data/models/DoppelReflEx_MN-12B-LilithFrame.json index 966d337b0cb30ef7de1bbbaa6e839a5a5f2a167b..8ecfe43ff9e1fa93bda4febcbc7408f44cb537fc 100644 --- a/data/DoppelReflEx_MN-12B-LilithFrame.json +++ b/data/models/DoppelReflEx_MN-12B-LilithFrame.json @@ -5,7 +5,7 @@ "developer": "DoppelReflEx", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "MistralForCausalLM", "params_billions": "12.248" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.451 + "score": 0.436 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4944 + "score": 0.4956 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1156 + "score": 0.0589 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3196 + "score": 0.3205 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3896 + "score": 0.3843 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3256 + "score": 0.3237 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.436 + "score": 0.451 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4956 + "score": 0.4944 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0589 + "score": 0.1156 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3205 + "score": 0.3196 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3843 + "score": 0.3896 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3237 + "score": 0.3256 } } ], diff --git a/data/DoppelReflEx_MN-12B-Mimicore-GreenSnake.json b/data/models/DoppelReflEx_MN-12B-Mimicore-GreenSnake.json similarity index 100% rename from data/DoppelReflEx_MN-12B-Mimicore-GreenSnake.json rename to data/models/DoppelReflEx_MN-12B-Mimicore-GreenSnake.json diff --git a/data/DoppelReflEx_MN-12B-Mimicore-Nocturne.json b/data/models/DoppelReflEx_MN-12B-Mimicore-Nocturne.json similarity index 100% rename from data/DoppelReflEx_MN-12B-Mimicore-Nocturne.json rename to data/models/DoppelReflEx_MN-12B-Mimicore-Nocturne.json diff --git a/data/DoppelReflEx_MN-12B-Mimicore-Orochi-v2-Experiment.json b/data/models/DoppelReflEx_MN-12B-Mimicore-Orochi-v2-Experiment.json similarity index 100% rename from data/DoppelReflEx_MN-12B-Mimicore-Orochi-v2-Experiment.json rename to data/models/DoppelReflEx_MN-12B-Mimicore-Orochi-v2-Experiment.json diff --git a/data/DoppelReflEx_MN-12B-Mimicore-Orochi-v3-Experiment.json b/data/models/DoppelReflEx_MN-12B-Mimicore-Orochi-v3-Experiment.json similarity index 100% rename from data/DoppelReflEx_MN-12B-Mimicore-Orochi-v3-Experiment.json rename to data/models/DoppelReflEx_MN-12B-Mimicore-Orochi-v3-Experiment.json diff --git a/data/DoppelReflEx_MN-12B-Mimicore-Orochi-v4-Experiment.json b/data/models/DoppelReflEx_MN-12B-Mimicore-Orochi-v4-Experiment.json similarity index 100% rename from data/DoppelReflEx_MN-12B-Mimicore-Orochi-v4-Experiment.json rename to data/models/DoppelReflEx_MN-12B-Mimicore-Orochi-v4-Experiment.json diff --git a/data/DoppelReflEx_MN-12B-Mimicore-Orochi.json b/data/models/DoppelReflEx_MN-12B-Mimicore-Orochi.json similarity index 100% rename from data/DoppelReflEx_MN-12B-Mimicore-Orochi.json rename to data/models/DoppelReflEx_MN-12B-Mimicore-Orochi.json diff --git a/data/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-1.json b/data/models/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-1.json similarity index 100% rename from data/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-1.json rename to data/models/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-1.json diff --git a/data/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-2.json b/data/models/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-2.json similarity index 100% rename from data/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-2.json rename to data/models/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-2.json diff --git a/data/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-3.json b/data/models/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-3.json similarity index 100% rename from data/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-3.json rename to data/models/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-3.json diff --git a/data/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-4.json b/data/models/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-4.json similarity index 100% rename from data/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-4.json rename to data/models/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-4.json diff --git a/data/DoppelReflEx_MN-12B-Mimicore-WhiteSnake.json b/data/models/DoppelReflEx_MN-12B-Mimicore-WhiteSnake.json similarity index 100% rename from data/DoppelReflEx_MN-12B-Mimicore-WhiteSnake.json rename to data/models/DoppelReflEx_MN-12B-Mimicore-WhiteSnake.json diff --git a/data/DoppelReflEx_MN-12B-Unleashed-Twilight.json b/data/models/DoppelReflEx_MN-12B-Unleashed-Twilight.json similarity index 100% rename from data/DoppelReflEx_MN-12B-Unleashed-Twilight.json rename to data/models/DoppelReflEx_MN-12B-Unleashed-Twilight.json diff --git a/data/DoppelReflEx_MN-12B-WolFrame.json b/data/models/DoppelReflEx_MN-12B-WolFrame.json similarity index 100% rename from data/DoppelReflEx_MN-12B-WolFrame.json rename to data/models/DoppelReflEx_MN-12B-WolFrame.json diff --git a/data/DoppelReflEx_MiniusLight-24B-test.json b/data/models/DoppelReflEx_MiniusLight-24B-test.json similarity index 100% rename from data/DoppelReflEx_MiniusLight-24B-test.json rename to data/models/DoppelReflEx_MiniusLight-24B-test.json diff --git a/data/DoppelReflEx_MiniusLight-24B-v1b-test.json b/data/models/DoppelReflEx_MiniusLight-24B-v1b-test.json similarity index 100% rename from data/DoppelReflEx_MiniusLight-24B-v1b-test.json rename to data/models/DoppelReflEx_MiniusLight-24B-v1b-test.json diff --git a/data/DoppelReflEx_MiniusLight-24B-v1c-test.json b/data/models/DoppelReflEx_MiniusLight-24B-v1c-test.json similarity index 100% rename from data/DoppelReflEx_MiniusLight-24B-v1c-test.json rename to data/models/DoppelReflEx_MiniusLight-24B-v1c-test.json diff --git a/data/DoppelReflEx_MiniusLight-24B-v1d-test.json b/data/models/DoppelReflEx_MiniusLight-24B-v1d-test.json similarity index 100% rename from data/DoppelReflEx_MiniusLight-24B-v1d-test.json rename to data/models/DoppelReflEx_MiniusLight-24B-v1d-test.json diff --git a/data/DoppelReflEx_MiniusLight-24B.json b/data/models/DoppelReflEx_MiniusLight-24B.json similarity index 100% rename from data/DoppelReflEx_MiniusLight-24B.json rename to data/models/DoppelReflEx_MiniusLight-24B.json diff --git a/data/DreadPoor_Again-8B-Model_Stock.json b/data/models/DreadPoor_Again-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Again-8B-Model_Stock.json rename to data/models/DreadPoor_Again-8B-Model_Stock.json diff --git a/data/DreadPoor_Alita99-8B-LINEAR.json b/data/models/DreadPoor_Alita99-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Alita99-8B-LINEAR.json rename to data/models/DreadPoor_Alita99-8B-LINEAR.json diff --git a/data/DreadPoor_AnotherTest.json b/data/models/DreadPoor_AnotherTest.json similarity index 100% rename from data/DreadPoor_AnotherTest.json rename to data/models/DreadPoor_AnotherTest.json diff --git a/data/DreadPoor_Aspire-8B-model_stock.json b/data/models/DreadPoor_Aspire-8B-model_stock.json similarity index 100% rename from data/DreadPoor_Aspire-8B-model_stock.json rename to data/models/DreadPoor_Aspire-8B-model_stock.json diff --git a/data/DreadPoor_Aspire_1.3-8B_model-stock.json b/data/models/DreadPoor_Aspire_1.3-8B_model-stock.json similarity index 100% rename from data/DreadPoor_Aspire_1.3-8B_model-stock.json rename to data/models/DreadPoor_Aspire_1.3-8B_model-stock.json diff --git a/data/DreadPoor_Aspire_V2-8B-Model_Stock.json b/data/models/DreadPoor_Aspire_V2-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Aspire_V2-8B-Model_Stock.json rename to data/models/DreadPoor_Aspire_V2-8B-Model_Stock.json diff --git a/data/DreadPoor_Aspire_V2.1-8B-Model_Stock.json b/data/models/DreadPoor_Aspire_V2.1-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Aspire_V2.1-8B-Model_Stock.json rename to data/models/DreadPoor_Aspire_V2.1-8B-Model_Stock.json diff --git a/data/DreadPoor_Aspire_V2_ALT-8B-Model_Stock.json b/data/models/DreadPoor_Aspire_V2_ALT-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Aspire_V2_ALT-8B-Model_Stock.json rename to data/models/DreadPoor_Aspire_V2_ALT-8B-Model_Stock.json diff --git a/data/DreadPoor_Aspire_V2_ALT_ROW-8B-Model_Stock.json b/data/models/DreadPoor_Aspire_V2_ALT_ROW-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Aspire_V2_ALT_ROW-8B-Model_Stock.json rename to data/models/DreadPoor_Aspire_V2_ALT_ROW-8B-Model_Stock.json diff --git a/data/DreadPoor_Aspire_V3-8B-Model_Stock.json b/data/models/DreadPoor_Aspire_V3-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Aspire_V3-8B-Model_Stock.json rename to data/models/DreadPoor_Aspire_V3-8B-Model_Stock.json diff --git a/data/DreadPoor_Aspire_V4-8B-Model_Stock.json b/data/models/DreadPoor_Aspire_V4-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Aspire_V4-8B-Model_Stock.json rename to data/models/DreadPoor_Aspire_V4-8B-Model_Stock.json diff --git a/data/DreadPoor_Aspire_V4_ALT-8B-Model_Stock.json b/data/models/DreadPoor_Aspire_V4_ALT-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Aspire_V4_ALT-8B-Model_Stock.json rename to data/models/DreadPoor_Aspire_V4_ALT-8B-Model_Stock.json diff --git a/data/DreadPoor_Asymmetric_Linearity-8B-Model_Stock.json b/data/models/DreadPoor_Asymmetric_Linearity-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Asymmetric_Linearity-8B-Model_Stock.json rename to data/models/DreadPoor_Asymmetric_Linearity-8B-Model_Stock.json diff --git a/data/DreadPoor_Aurora_faustus-8B-LINEAR.json b/data/models/DreadPoor_Aurora_faustus-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Aurora_faustus-8B-LINEAR.json rename to data/models/DreadPoor_Aurora_faustus-8B-LINEAR.json diff --git a/data/DreadPoor_Aurora_faustus-8B-LORABLATED.json b/data/models/DreadPoor_Aurora_faustus-8B-LORABLATED.json similarity index 100% rename from data/DreadPoor_Aurora_faustus-8B-LORABLATED.json rename to data/models/DreadPoor_Aurora_faustus-8B-LORABLATED.json diff --git a/data/DreadPoor_Aurora_faustus-8B-LORABLATED_ALT.json b/data/models/DreadPoor_Aurora_faustus-8B-LORABLATED_ALT.json similarity index 100% rename from data/DreadPoor_Aurora_faustus-8B-LORABLATED_ALT.json rename to data/models/DreadPoor_Aurora_faustus-8B-LORABLATED_ALT.json diff --git a/data/DreadPoor_Autumn_Dawn-8B-LINEAR.json b/data/models/DreadPoor_Autumn_Dawn-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Autumn_Dawn-8B-LINEAR.json rename to data/models/DreadPoor_Autumn_Dawn-8B-LINEAR.json diff --git a/data/DreadPoor_BaeZel-8B-LINEAR.json b/data/models/DreadPoor_BaeZel-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_BaeZel-8B-LINEAR.json rename to data/models/DreadPoor_BaeZel-8B-LINEAR.json diff --git a/data/DreadPoor_BaeZel-8B-Model_Stock.json b/data/models/DreadPoor_BaeZel-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_BaeZel-8B-Model_Stock.json rename to data/models/DreadPoor_BaeZel-8B-Model_Stock.json diff --git a/data/DreadPoor_BaeZel_V2-8B-Model_Stock.json b/data/models/DreadPoor_BaeZel_V2-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_BaeZel_V2-8B-Model_Stock.json rename to data/models/DreadPoor_BaeZel_V2-8B-Model_Stock.json diff --git a/data/DreadPoor_BaeZel_V2_ALT-8B-Model_Stock.json b/data/models/DreadPoor_BaeZel_V2_ALT-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_BaeZel_V2_ALT-8B-Model_Stock.json rename to data/models/DreadPoor_BaeZel_V2_ALT-8B-Model_Stock.json diff --git a/data/DreadPoor_BaeZel_V3-8B-Model_Stock.json b/data/models/DreadPoor_BaeZel_V3-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_BaeZel_V3-8B-Model_Stock.json rename to data/models/DreadPoor_BaeZel_V3-8B-Model_Stock.json diff --git a/data/DreadPoor_Blunt_Edge-8B-SLERP.json b/data/models/DreadPoor_Blunt_Edge-8B-SLERP.json similarity index 100% rename from data/DreadPoor_Blunt_Edge-8B-SLERP.json rename to data/models/DreadPoor_Blunt_Edge-8B-SLERP.json diff --git a/data/DreadPoor_BulkUp.json b/data/models/DreadPoor_BulkUp.json similarity index 100% rename from data/DreadPoor_BulkUp.json rename to data/models/DreadPoor_BulkUp.json diff --git a/data/DreadPoor_Cadence-8B-LINEAR.json b/data/models/DreadPoor_Cadence-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Cadence-8B-LINEAR.json rename to data/models/DreadPoor_Cadence-8B-LINEAR.json diff --git a/data/DreadPoor_Caelid-8B-Model_Stock.json b/data/models/DreadPoor_Caelid-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Caelid-8B-Model_Stock.json rename to data/models/DreadPoor_Caelid-8B-Model_Stock.json diff --git a/data/DreadPoor_Casuar-9B-Model_Stock.json b/data/models/DreadPoor_Casuar-9B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Casuar-9B-Model_Stock.json rename to data/models/DreadPoor_Casuar-9B-Model_Stock.json diff --git a/data/DreadPoor_Condensed_Milk-8B-Model_Stock.json b/data/models/DreadPoor_Condensed_Milk-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Condensed_Milk-8B-Model_Stock.json rename to data/models/DreadPoor_Condensed_Milk-8B-Model_Stock.json diff --git a/data/DreadPoor_CoolerCoder-8B-LINEAR.json b/data/models/DreadPoor_CoolerCoder-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_CoolerCoder-8B-LINEAR.json rename to data/models/DreadPoor_CoolerCoder-8B-LINEAR.json diff --git a/data/DreadPoor_Damasteel-8B-LINEAR.json b/data/models/DreadPoor_Damasteel-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Damasteel-8B-LINEAR.json rename to data/models/DreadPoor_Damasteel-8B-LINEAR.json diff --git a/data/DreadPoor_Dearly_Beloved-8B-TIES.json b/data/models/DreadPoor_Dearly_Beloved-8B-TIES.json similarity index 100% rename from data/DreadPoor_Dearly_Beloved-8B-TIES.json rename to data/models/DreadPoor_Dearly_Beloved-8B-TIES.json diff --git a/data/DreadPoor_Decayed-8B-LINEAR.json b/data/models/DreadPoor_Decayed-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Decayed-8B-LINEAR.json rename to data/models/DreadPoor_Decayed-8B-LINEAR.json diff --git a/data/DreadPoor_Derivative-8B-Model_Stock.json b/data/models/DreadPoor_Derivative-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Derivative-8B-Model_Stock.json rename to data/models/DreadPoor_Derivative-8B-Model_Stock.json diff --git a/data/DreadPoor_Derivative_V2-8B-Model_Stock.json b/data/models/DreadPoor_Derivative_V2-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Derivative_V2-8B-Model_Stock.json rename to data/models/DreadPoor_Derivative_V2-8B-Model_Stock.json diff --git a/data/DreadPoor_Derivative_V2_ALT-8B-Model_Stock.json b/data/models/DreadPoor_Derivative_V2_ALT-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Derivative_V2_ALT-8B-Model_Stock.json rename to data/models/DreadPoor_Derivative_V2_ALT-8B-Model_Stock.json diff --git a/data/DreadPoor_Derivative_V3-8B-Model_Stock.json b/data/models/DreadPoor_Derivative_V3-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Derivative_V3-8B-Model_Stock.json rename to data/models/DreadPoor_Derivative_V3-8B-Model_Stock.json diff --git a/data/DreadPoor_Elusive_Dragon_Heart-8B-LINEAR.json b/data/models/DreadPoor_Elusive_Dragon_Heart-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Elusive_Dragon_Heart-8B-LINEAR.json rename to data/models/DreadPoor_Elusive_Dragon_Heart-8B-LINEAR.json diff --git a/data/DreadPoor_Emu_Eggs-9B-Model_Stock.json b/data/models/DreadPoor_Emu_Eggs-9B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Emu_Eggs-9B-Model_Stock.json rename to data/models/DreadPoor_Emu_Eggs-9B-Model_Stock.json diff --git a/data/DreadPoor_Eunoia_Vespera-8B-LINEAR.json b/data/models/DreadPoor_Eunoia_Vespera-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Eunoia_Vespera-8B-LINEAR.json rename to data/models/DreadPoor_Eunoia_Vespera-8B-LINEAR.json diff --git a/data/DreadPoor_Fu_sion_HA-8B-SLERP.json b/data/models/DreadPoor_Fu_sion_HA-8B-SLERP.json similarity index 100% rename from data/DreadPoor_Fu_sion_HA-8B-SLERP.json rename to data/models/DreadPoor_Fu_sion_HA-8B-SLERP.json diff --git a/data/DreadPoor_HOT_STINKING_GARBAGE.json b/data/models/DreadPoor_HOT_STINKING_GARBAGE.json similarity index 100% rename from data/DreadPoor_HOT_STINKING_GARBAGE.json rename to data/models/DreadPoor_HOT_STINKING_GARBAGE.json diff --git a/data/DreadPoor_H_the_eighth-8B-LINEAR.json b/data/models/DreadPoor_H_the_eighth-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_H_the_eighth-8B-LINEAR.json rename to data/models/DreadPoor_H_the_eighth-8B-LINEAR.json diff --git a/data/DreadPoor_Happy_New_Year-8B-Model_Stock.json b/data/models/DreadPoor_Happy_New_Year-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Happy_New_Year-8B-Model_Stock.json rename to data/models/DreadPoor_Happy_New_Year-8B-Model_Stock.json diff --git a/data/DreadPoor_Heart_Stolen-8B-Model_Stock.json b/data/models/DreadPoor_Heart_Stolen-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Heart_Stolen-8B-Model_Stock.json rename to data/models/DreadPoor_Heart_Stolen-8B-Model_Stock.json diff --git a/data/DreadPoor_Heart_Stolen-ALT-8B-Model_Stock.json b/data/models/DreadPoor_Heart_Stolen-ALT-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Heart_Stolen-ALT-8B-Model_Stock.json rename to data/models/DreadPoor_Heart_Stolen-ALT-8B-Model_Stock.json diff --git a/data/DreadPoor_Here_We_Go_Again-8B-SLERP.json b/data/models/DreadPoor_Here_We_Go_Again-8B-SLERP.json similarity index 100% rename from data/DreadPoor_Here_We_Go_Again-8B-SLERP.json rename to data/models/DreadPoor_Here_We_Go_Again-8B-SLERP.json diff --git a/data/DreadPoor_Howdy-8B-LINEAR.json b/data/models/DreadPoor_Howdy-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Howdy-8B-LINEAR.json rename to data/models/DreadPoor_Howdy-8B-LINEAR.json diff --git a/data/DreadPoor_Incidental-8B-Model_Stock.json b/data/models/DreadPoor_Incidental-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Incidental-8B-Model_Stock.json rename to data/models/DreadPoor_Incidental-8B-Model_Stock.json diff --git a/data/DreadPoor_Irina-8B-model_stock.json b/data/models/DreadPoor_Irina-8B-model_stock.json similarity index 100% rename from data/DreadPoor_Irina-8B-model_stock.json rename to data/models/DreadPoor_Irina-8B-model_stock.json diff --git a/data/DreadPoor_Kindling-8B-Model_Stock.json b/data/models/DreadPoor_Kindling-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Kindling-8B-Model_Stock.json rename to data/models/DreadPoor_Kindling-8B-Model_Stock.json diff --git a/data/DreadPoor_L3.1-BaeZel-8B-Della.json b/data/models/DreadPoor_L3.1-BaeZel-8B-Della.json similarity index 100% rename from data/DreadPoor_L3.1-BaeZel-8B-Della.json rename to data/models/DreadPoor_L3.1-BaeZel-8B-Della.json diff --git a/data/DreadPoor_Laughing_Stock-8B-Model_Stock.json b/data/models/DreadPoor_Laughing_Stock-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Laughing_Stock-8B-Model_Stock.json rename to data/models/DreadPoor_Laughing_Stock-8B-Model_Stock.json diff --git a/data/DreadPoor_Lava_Lamp-8B-SLERP.json b/data/models/DreadPoor_Lava_Lamp-8B-SLERP.json similarity index 100% rename from data/DreadPoor_Lava_Lamp-8B-SLERP.json rename to data/models/DreadPoor_Lava_Lamp-8B-SLERP.json diff --git a/data/DreadPoor_LemonP-8B-Model_Stock.json b/data/models/DreadPoor_LemonP-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_LemonP-8B-Model_Stock.json rename to data/models/DreadPoor_LemonP-8B-Model_Stock.json diff --git a/data/DreadPoor_Lydia_of_Whiterun-8B-LINEAR.json b/data/models/DreadPoor_Lydia_of_Whiterun-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Lydia_of_Whiterun-8B-LINEAR.json rename to data/models/DreadPoor_Lydia_of_Whiterun-8B-LINEAR.json diff --git a/data/DreadPoor_Matryoshka-8B-LINEAR.json b/data/models/DreadPoor_Matryoshka-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Matryoshka-8B-LINEAR.json rename to data/models/DreadPoor_Matryoshka-8B-LINEAR.json diff --git a/data/DreadPoor_Mercury_In_Retrograde-8b-Model-Stock.json b/data/models/DreadPoor_Mercury_In_Retrograde-8b-Model-Stock.json similarity index 100% rename from data/DreadPoor_Mercury_In_Retrograde-8b-Model-Stock.json rename to data/models/DreadPoor_Mercury_In_Retrograde-8b-Model-Stock.json diff --git a/data/DreadPoor_Minthy-8B-Model_Stock.json b/data/models/DreadPoor_Minthy-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Minthy-8B-Model_Stock.json rename to data/models/DreadPoor_Minthy-8B-Model_Stock.json diff --git a/data/DreadPoor_Minthy_ALT-8B-Model_Stock.json b/data/models/DreadPoor_Minthy_ALT-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Minthy_ALT-8B-Model_Stock.json rename to data/models/DreadPoor_Minthy_ALT-8B-Model_Stock.json diff --git a/data/DreadPoor_Minthy_V2-8B-Model_Stock.json b/data/models/DreadPoor_Minthy_V2-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Minthy_V2-8B-Model_Stock.json rename to data/models/DreadPoor_Minthy_V2-8B-Model_Stock.json diff --git a/data/DreadPoor_Minus_Penus-8B-Model_Stock.json b/data/models/DreadPoor_Minus_Penus-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Minus_Penus-8B-Model_Stock.json rename to data/models/DreadPoor_Minus_Penus-8B-Model_Stock.json diff --git a/data/DreadPoor_Morphing-8B-Model_Stock.json b/data/models/DreadPoor_Morphing-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Morphing-8B-Model_Stock.json rename to data/models/DreadPoor_Morphing-8B-Model_Stock.json diff --git a/data/DreadPoor_Not_Even_My_Final_Form-8B-Model_Stock.json b/data/models/DreadPoor_Not_Even_My_Final_Form-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Not_Even_My_Final_Form-8B-Model_Stock.json rename to data/models/DreadPoor_Not_Even_My_Final_Form-8B-Model_Stock.json diff --git a/data/DreadPoor_Nother_One-8B-Model_Stock.json b/data/models/DreadPoor_Nother_One-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Nother_One-8B-Model_Stock.json rename to data/models/DreadPoor_Nother_One-8B-Model_Stock.json diff --git a/data/DreadPoor_Noxis-8B-LINEAR.json b/data/models/DreadPoor_Noxis-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Noxis-8B-LINEAR.json rename to data/models/DreadPoor_Noxis-8B-LINEAR.json diff --git a/data/DreadPoor_Nullsworn-12B-LINEAR.json b/data/models/DreadPoor_Nullsworn-12B-LINEAR.json similarity index 100% rename from data/DreadPoor_Nullsworn-12B-LINEAR.json rename to data/models/DreadPoor_Nullsworn-12B-LINEAR.json diff --git a/data/DreadPoor_Nwah-8B-Model_Stock.json b/data/models/DreadPoor_Nwah-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Nwah-8B-Model_Stock.json rename to data/models/DreadPoor_Nwah-8B-Model_Stock.json diff --git a/data/DreadPoor_ONeil-model_stock-8B.json b/data/models/DreadPoor_ONeil-model_stock-8B.json similarity index 100% rename from data/DreadPoor_ONeil-model_stock-8B.json rename to data/models/DreadPoor_ONeil-model_stock-8B.json diff --git a/data/DreadPoor_Oh_Boy-8B-LINEAR.json b/data/models/DreadPoor_Oh_Boy-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Oh_Boy-8B-LINEAR.json rename to data/models/DreadPoor_Oh_Boy-8B-LINEAR.json diff --git a/data/DreadPoor_OrangeJ-8B-Model_Stock.json b/data/models/DreadPoor_OrangeJ-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_OrangeJ-8B-Model_Stock.json rename to data/models/DreadPoor_OrangeJ-8B-Model_Stock.json diff --git a/data/DreadPoor_Promissum_Mane-8B-LINEAR-lorablated.json b/data/models/DreadPoor_Promissum_Mane-8B-LINEAR-lorablated.json similarity index 100% rename from data/DreadPoor_Promissum_Mane-8B-LINEAR-lorablated.json rename to data/models/DreadPoor_Promissum_Mane-8B-LINEAR-lorablated.json diff --git a/data/DreadPoor_Promissum_Mane-8B-LINEAR.json b/data/models/DreadPoor_Promissum_Mane-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Promissum_Mane-8B-LINEAR.json rename to data/models/DreadPoor_Promissum_Mane-8B-LINEAR.json diff --git a/data/DreadPoor_RPMash-8B-Model_Stock.json b/data/models/DreadPoor_RPMash-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_RPMash-8B-Model_Stock.json rename to data/models/DreadPoor_RPMash-8B-Model_Stock.json diff --git a/data/DreadPoor_RPMash_V3-8B-Model_Stock.json b/data/models/DreadPoor_RPMash_V3-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_RPMash_V3-8B-Model_Stock.json rename to data/models/DreadPoor_RPMash_V3-8B-Model_Stock.json diff --git a/data/DreadPoor_Rusted_Gold-8B-LINEAR.json b/data/models/DreadPoor_Rusted_Gold-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Rusted_Gold-8B-LINEAR.json rename to data/models/DreadPoor_Rusted_Gold-8B-LINEAR.json diff --git a/data/DreadPoor_Rusted_Platinum-8B-LINEAR.json b/data/models/DreadPoor_Rusted_Platinum-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_Rusted_Platinum-8B-LINEAR.json rename to data/models/DreadPoor_Rusted_Platinum-8B-LINEAR.json diff --git a/data/DreadPoor_Rusted_Platinum-8B-Model_Stock.json b/data/models/DreadPoor_Rusted_Platinum-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Rusted_Platinum-8B-Model_Stock.json rename to data/models/DreadPoor_Rusted_Platinum-8B-Model_Stock.json diff --git a/data/DreadPoor_Sellen-8B-model_stock.json b/data/models/DreadPoor_Sellen-8B-model_stock.json similarity index 100% rename from data/DreadPoor_Sellen-8B-model_stock.json rename to data/models/DreadPoor_Sellen-8B-model_stock.json diff --git a/data/DreadPoor_Something-8B-Model_Stock.json b/data/models/DreadPoor_Something-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Something-8B-Model_Stock.json rename to data/models/DreadPoor_Something-8B-Model_Stock.json diff --git a/data/DreadPoor_Spring_Dusk-8B-SCE.json b/data/models/DreadPoor_Spring_Dusk-8B-SCE.json similarity index 100% rename from data/DreadPoor_Spring_Dusk-8B-SCE.json rename to data/models/DreadPoor_Spring_Dusk-8B-SCE.json diff --git a/data/DreadPoor_Summer_Dawn-8B-SCE.json b/data/models/DreadPoor_Summer_Dawn-8B-SCE.json similarity index 100% rename from data/DreadPoor_Summer_Dawn-8B-SCE.json rename to data/models/DreadPoor_Summer_Dawn-8B-SCE.json diff --git a/data/DreadPoor_Summer_Dusk-8B-TIES.json b/data/models/DreadPoor_Summer_Dusk-8B-TIES.json similarity index 100% rename from data/DreadPoor_Summer_Dusk-8B-TIES.json rename to data/models/DreadPoor_Summer_Dusk-8B-TIES.json diff --git a/data/DreadPoor_Summer_Rain-8B-SCE.json b/data/models/DreadPoor_Summer_Rain-8B-SCE.json similarity index 100% rename from data/DreadPoor_Summer_Rain-8B-SCE.json rename to data/models/DreadPoor_Summer_Rain-8B-SCE.json diff --git a/data/DreadPoor_Summer_Rain-8B-TIES.json b/data/models/DreadPoor_Summer_Rain-8B-TIES.json similarity index 100% rename from data/DreadPoor_Summer_Rain-8B-TIES.json rename to data/models/DreadPoor_Summer_Rain-8B-TIES.json diff --git a/data/DreadPoor_Sun-8B-Model_Stock.json b/data/models/DreadPoor_Sun-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Sun-8B-Model_Stock.json rename to data/models/DreadPoor_Sun-8B-Model_Stock.json diff --git a/data/DreadPoor_Sweetened_Condensed_Milk-8B-Model_Stock.json b/data/models/DreadPoor_Sweetened_Condensed_Milk-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Sweetened_Condensed_Milk-8B-Model_Stock.json rename to data/models/DreadPoor_Sweetened_Condensed_Milk-8B-Model_Stock.json diff --git a/data/DreadPoor_TEST02-Ignore.json b/data/models/DreadPoor_TEST02-Ignore.json similarity index 100% rename from data/DreadPoor_TEST02-Ignore.json rename to data/models/DreadPoor_TEST02-Ignore.json diff --git a/data/DreadPoor_TEST03-ignore.json b/data/models/DreadPoor_TEST03-ignore.json similarity index 100% rename from data/DreadPoor_TEST03-ignore.json rename to data/models/DreadPoor_TEST03-ignore.json diff --git a/data/DreadPoor_TEST06-ignore.json b/data/models/DreadPoor_TEST06-ignore.json similarity index 100% rename from data/DreadPoor_TEST06-ignore.json rename to data/models/DreadPoor_TEST06-ignore.json diff --git a/data/DreadPoor_TEST07-ignore.json b/data/models/DreadPoor_TEST07-ignore.json similarity index 100% rename from data/DreadPoor_TEST07-ignore.json rename to data/models/DreadPoor_TEST07-ignore.json diff --git a/data/DreadPoor_TEST08-ignore.json b/data/models/DreadPoor_TEST08-ignore.json similarity index 100% rename from data/DreadPoor_TEST08-ignore.json rename to data/models/DreadPoor_TEST08-ignore.json diff --git a/data/DreadPoor_Trinas_Nectar-8B-model_stock.json b/data/models/DreadPoor_Trinas_Nectar-8B-model_stock.json similarity index 100% rename from data/DreadPoor_Trinas_Nectar-8B-model_stock.json rename to data/models/DreadPoor_Trinas_Nectar-8B-model_stock.json diff --git a/data/DreadPoor_UNTESTED-VENN_1.2-8B-Model_Stock.json b/data/models/DreadPoor_UNTESTED-VENN_1.2-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_UNTESTED-VENN_1.2-8B-Model_Stock.json rename to data/models/DreadPoor_UNTESTED-VENN_1.2-8B-Model_Stock.json diff --git a/data/DreadPoor_VENN_1.2-8B-Model_Stock.json b/data/models/DreadPoor_VENN_1.2-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_VENN_1.2-8B-Model_Stock.json rename to data/models/DreadPoor_VENN_1.2-8B-Model_Stock.json diff --git a/data/DreadPoor_WIP-Acacia-8B-Model_Stock.json b/data/models/DreadPoor_WIP-Acacia-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_WIP-Acacia-8B-Model_Stock.json rename to data/models/DreadPoor_WIP-Acacia-8B-Model_Stock.json diff --git a/data/DreadPoor_WIP_Damascus-8B-TIES.json b/data/models/DreadPoor_WIP_Damascus-8B-TIES.json similarity index 100% rename from data/DreadPoor_WIP_Damascus-8B-TIES.json rename to data/models/DreadPoor_WIP_Damascus-8B-TIES.json diff --git a/data/DreadPoor_Wannabe-8B-Model_Stock.json b/data/models/DreadPoor_Wannabe-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Wannabe-8B-Model_Stock.json rename to data/models/DreadPoor_Wannabe-8B-Model_Stock.json diff --git a/data/DreadPoor_What_A_Thrill-8B-Model_Stock.json b/data/models/DreadPoor_What_A_Thrill-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_What_A_Thrill-8B-Model_Stock.json rename to data/models/DreadPoor_What_A_Thrill-8B-Model_Stock.json diff --git a/data/DreadPoor_Winter-8B-SCE.json b/data/models/DreadPoor_Winter-8B-SCE.json similarity index 100% rename from data/DreadPoor_Winter-8B-SCE.json rename to data/models/DreadPoor_Winter-8B-SCE.json diff --git a/data/DreadPoor_Winter_Dawn-8B-TIES.json b/data/models/DreadPoor_Winter_Dawn-8B-TIES.json similarity index 100% rename from data/DreadPoor_Winter_Dawn-8B-TIES.json rename to data/models/DreadPoor_Winter_Dawn-8B-TIES.json diff --git a/data/DreadPoor_Winter_Dusk-8B-TIES.json b/data/models/DreadPoor_Winter_Dusk-8B-TIES.json similarity index 100% rename from data/DreadPoor_Winter_Dusk-8B-TIES.json rename to data/models/DreadPoor_Winter_Dusk-8B-TIES.json diff --git a/data/DreadPoor_Winter_Night-8B-Model_Stock.json b/data/models/DreadPoor_Winter_Night-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Winter_Night-8B-Model_Stock.json rename to data/models/DreadPoor_Winter_Night-8B-Model_Stock.json diff --git a/data/DreadPoor_Yafune-8B-Model_Stock.json b/data/models/DreadPoor_Yafune-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Yafune-8B-Model_Stock.json rename to data/models/DreadPoor_Yafune-8B-Model_Stock.json diff --git a/data/DreadPoor_Yearn_V3-8B-Model_Stock.json b/data/models/DreadPoor_Yearn_V3-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Yearn_V3-8B-Model_Stock.json rename to data/models/DreadPoor_Yearn_V3-8B-Model_Stock.json diff --git a/data/DreadPoor_ZEUS-8B-V17-Abliterated_ALT.json b/data/models/DreadPoor_ZEUS-8B-V17-Abliterated_ALT.json similarity index 100% rename from data/DreadPoor_ZEUS-8B-V17-Abliterated_ALT.json rename to data/models/DreadPoor_ZEUS-8B-V17-Abliterated_ALT.json diff --git a/data/DreadPoor_Zelus-8B-Model_Stock.json b/data/models/DreadPoor_Zelus-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Zelus-8B-Model_Stock.json rename to data/models/DreadPoor_Zelus-8B-Model_Stock.json diff --git a/data/DreadPoor_Zelus_V2-8B-Model_Stock.json b/data/models/DreadPoor_Zelus_V2-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_Zelus_V2-8B-Model_Stock.json rename to data/models/DreadPoor_Zelus_V2-8B-Model_Stock.json diff --git a/data/DreadPoor_felix_dies-mistral-7B-model_stock.json b/data/models/DreadPoor_felix_dies-mistral-7B-model_stock.json similarity index 100% rename from data/DreadPoor_felix_dies-mistral-7B-model_stock.json rename to data/models/DreadPoor_felix_dies-mistral-7B-model_stock.json diff --git a/data/DreadPoor_hakuchido-8B-MODEL_STOCK.json b/data/models/DreadPoor_hakuchido-8B-MODEL_STOCK.json similarity index 100% rename from data/DreadPoor_hakuchido-8B-MODEL_STOCK.json rename to data/models/DreadPoor_hakuchido-8B-MODEL_STOCK.json diff --git a/data/DreadPoor_ichor-8B-Model_Stock.json b/data/models/DreadPoor_ichor-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_ichor-8B-Model_Stock.json rename to data/models/DreadPoor_ichor-8B-Model_Stock.json diff --git a/data/DreadPoor_ichor_1.1-8B-Model_Stock.json b/data/models/DreadPoor_ichor_1.1-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_ichor_1.1-8B-Model_Stock.json rename to data/models/DreadPoor_ichor_1.1-8B-Model_Stock.json diff --git a/data/DreadPoor_inexpertus-8B-Model_Stock.json b/data/models/DreadPoor_inexpertus-8B-Model_Stock.json similarity index 100% rename from data/DreadPoor_inexpertus-8B-Model_Stock.json rename to data/models/DreadPoor_inexpertus-8B-Model_Stock.json diff --git a/data/DreadPoor_inexpertus_1.1-8B-LINEAR.json b/data/models/DreadPoor_inexpertus_1.1-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_inexpertus_1.1-8B-LINEAR.json rename to data/models/DreadPoor_inexpertus_1.1-8B-LINEAR.json diff --git a/data/DreadPoor_inexpertus_1.2-8B-LINEAR.json b/data/models/DreadPoor_inexpertus_1.2-8B-LINEAR.json similarity index 100% rename from data/DreadPoor_inexpertus_1.2-8B-LINEAR.json rename to data/models/DreadPoor_inexpertus_1.2-8B-LINEAR.json diff --git a/data/DreadPoor_mergekit-nuslerp-nqzkedi.json b/data/models/DreadPoor_mergekit-nuslerp-nqzkedi.json similarity index 100% rename from data/DreadPoor_mergekit-nuslerp-nqzkedi.json rename to data/models/DreadPoor_mergekit-nuslerp-nqzkedi.json diff --git a/data/DreadPoor_remember_to_breathe-8b-Model-Stock.json b/data/models/DreadPoor_remember_to_breathe-8b-Model-Stock.json similarity index 100% rename from data/DreadPoor_remember_to_breathe-8b-Model-Stock.json rename to data/models/DreadPoor_remember_to_breathe-8b-Model-Stock.json diff --git a/data/DreadPoor_test.json b/data/models/DreadPoor_test.json similarity index 100% rename from data/DreadPoor_test.json rename to data/models/DreadPoor_test.json diff --git a/data/DreadPoor_test_ALT.json b/data/models/DreadPoor_test_ALT.json similarity index 100% rename from data/DreadPoor_test_ALT.json rename to data/models/DreadPoor_test_ALT.json diff --git a/data/DreadPoor_tests_pending-do_not_use_yet.json b/data/models/DreadPoor_tests_pending-do_not_use_yet.json similarity index 100% rename from data/DreadPoor_tests_pending-do_not_use_yet.json rename to data/models/DreadPoor_tests_pending-do_not_use_yet.json diff --git a/data/ECE-ILAB-PRYMMAL_ILAB-Merging-3B-V2.json b/data/models/ECE-ILAB-PRYMMAL_ILAB-Merging-3B-V2.json similarity index 100% rename from data/ECE-ILAB-PRYMMAL_ILAB-Merging-3B-V2.json rename to data/models/ECE-ILAB-PRYMMAL_ILAB-Merging-3B-V2.json diff --git a/data/EVA-UNIT-01_EVA-Qwen2.5-14B-v0.2.json b/data/models/EVA-UNIT-01_EVA-Qwen2.5-14B-v0.2.json similarity index 100% rename from data/EVA-UNIT-01_EVA-Qwen2.5-14B-v0.2.json rename to data/models/EVA-UNIT-01_EVA-Qwen2.5-14B-v0.2.json diff --git a/data/EVA-UNIT-01_EVA-Qwen2.5-72B-v0.2.json b/data/models/EVA-UNIT-01_EVA-Qwen2.5-72B-v0.2.json similarity index 100% rename from data/EVA-UNIT-01_EVA-Qwen2.5-72B-v0.2.json rename to data/models/EVA-UNIT-01_EVA-Qwen2.5-72B-v0.2.json diff --git a/data/Edgerunners_meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16.json b/data/models/Edgerunners_meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16.json similarity index 100% rename from data/Edgerunners_meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16.json rename to data/models/Edgerunners_meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16.json diff --git a/data/EleutherAI_gpt-j-6b.json b/data/models/EleutherAI_gpt-j-6b.json similarity index 100% rename from data/EleutherAI_gpt-j-6b.json rename to data/models/EleutherAI_gpt-j-6b.json diff --git a/data/EleutherAI_gpt-neo-1.3B.json b/data/models/EleutherAI_gpt-neo-1.3B.json similarity index 100% rename from data/EleutherAI_gpt-neo-1.3B.json rename to data/models/EleutherAI_gpt-neo-1.3B.json diff --git a/data/EleutherAI_gpt-neo-125m.json b/data/models/EleutherAI_gpt-neo-125m.json similarity index 100% rename from data/EleutherAI_gpt-neo-125m.json rename to data/models/EleutherAI_gpt-neo-125m.json diff --git a/data/EleutherAI_gpt-neo-2.7B.json b/data/models/EleutherAI_gpt-neo-2.7B.json similarity index 100% rename from data/EleutherAI_gpt-neo-2.7B.json rename to data/models/EleutherAI_gpt-neo-2.7B.json diff --git a/data/EleutherAI_gpt-neox-20b.json b/data/models/EleutherAI_gpt-neox-20b.json similarity index 100% rename from data/EleutherAI_gpt-neox-20b.json rename to data/models/EleutherAI_gpt-neox-20b.json diff --git a/data/EleutherAI_pythia-1.4b.json b/data/models/EleutherAI_pythia-1.4b.json similarity index 100% rename from data/EleutherAI_pythia-1.4b.json rename to data/models/EleutherAI_pythia-1.4b.json diff --git a/data/models/EleutherAI_pythia-12b.json b/data/models/EleutherAI_pythia-12b.json new file mode 100644 index 0000000000000000000000000000000000000000..d91ea53b67c368a4733808be60d6cd87c3fcb34c --- /dev/null +++ b/data/models/EleutherAI_pythia-12b.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "pythia-12b", + "id": "EleutherAI/pythia-12b", + "developer": "EleutherAI", + "inference_platform": "unknown", + "additional_details": { + "precision": "float16", + "architecture": "GPTNeoXForCausalLM", + "params_billions": "12.0" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-12b/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2471 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.318 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.0166 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2466 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3647 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.1109 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/EleutherAI_pythia-160m.json b/data/models/EleutherAI_pythia-160m.json similarity index 100% rename from data/EleutherAI_pythia-160m.json rename to data/models/EleutherAI_pythia-160m.json diff --git a/data/EleutherAI_pythia-1b.json b/data/models/EleutherAI_pythia-1b.json similarity index 100% rename from data/EleutherAI_pythia-1b.json rename to data/models/EleutherAI_pythia-1b.json diff --git a/data/EleutherAI_pythia-2.8b.json b/data/models/EleutherAI_pythia-2.8b.json similarity index 100% rename from data/EleutherAI_pythia-2.8b.json rename to data/models/EleutherAI_pythia-2.8b.json diff --git a/data/EleutherAI_pythia-410m.json b/data/models/EleutherAI_pythia-410m.json similarity index 100% rename from data/EleutherAI_pythia-410m.json rename to data/models/EleutherAI_pythia-410m.json diff --git a/data/models/EleutherAI_pythia-6.9b.json b/data/models/EleutherAI_pythia-6.9b.json new file mode 100644 index 0000000000000000000000000000000000000000..741bd60585c63d3f5c010209b30b58147e393b9f --- /dev/null +++ b/data/models/EleutherAI_pythia-6.9b.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "pythia-6.9b", + "id": "EleutherAI/pythia-6.9b", + "developer": "EleutherAI", + "inference_platform": "unknown", + "additional_details": { + "precision": "float16", + "architecture": "GPTNeoXForCausalLM", + "params_billions": "6.9" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-6.9b/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2281 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3232 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.0144 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2517 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3591 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.1147 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/Enno-Ai_EnnoAi-Pro-French-Llama-3-8B-v0.4.json b/data/models/Enno-Ai_EnnoAi-Pro-French-Llama-3-8B-v0.4.json similarity index 100% rename from data/Enno-Ai_EnnoAi-Pro-French-Llama-3-8B-v0.4.json rename to data/models/Enno-Ai_EnnoAi-Pro-French-Llama-3-8B-v0.4.json diff --git a/data/Enno-Ai_EnnoAi-Pro-Llama-3-8B-v0.3.json b/data/models/Enno-Ai_EnnoAi-Pro-Llama-3-8B-v0.3.json similarity index 100% rename from data/Enno-Ai_EnnoAi-Pro-Llama-3-8B-v0.3.json rename to data/models/Enno-Ai_EnnoAi-Pro-Llama-3-8B-v0.3.json diff --git a/data/Enno-Ai_EnnoAi-Pro-Llama-3-8B.json b/data/models/Enno-Ai_EnnoAi-Pro-Llama-3-8B.json similarity index 100% rename from data/Enno-Ai_EnnoAi-Pro-Llama-3-8B.json rename to data/models/Enno-Ai_EnnoAi-Pro-Llama-3-8B.json diff --git a/data/Enno-Ai_EnnoAi-Pro-Llama-3.1-8B-v0.9.json b/data/models/Enno-Ai_EnnoAi-Pro-Llama-3.1-8B-v0.9.json similarity index 100% rename from data/Enno-Ai_EnnoAi-Pro-Llama-3.1-8B-v0.9.json rename to data/models/Enno-Ai_EnnoAi-Pro-Llama-3.1-8B-v0.9.json diff --git a/data/EnnoAi_EnnoAi-7B-French-Instruct-202502.json b/data/models/EnnoAi_EnnoAi-7B-French-Instruct-202502.json similarity index 100% rename from data/EnnoAi_EnnoAi-7B-French-Instruct-202502.json rename to data/models/EnnoAi_EnnoAi-7B-French-Instruct-202502.json diff --git a/data/EnnoAi_EnnoAi-Pro-Llama-3.1-8B-v1.0.json b/data/models/EnnoAi_EnnoAi-Pro-Llama-3.1-8B-v1.0.json similarity index 100% rename from data/EnnoAi_EnnoAi-Pro-Llama-3.1-8B-v1.0.json rename to data/models/EnnoAi_EnnoAi-Pro-Llama-3.1-8B-v1.0.json diff --git a/data/Epiculous_Azure_Dusk-v0.2.json b/data/models/Epiculous_Azure_Dusk-v0.2.json similarity index 100% rename from data/Epiculous_Azure_Dusk-v0.2.json rename to data/models/Epiculous_Azure_Dusk-v0.2.json diff --git a/data/Epiculous_Crimson_Dawn-v0.2.json b/data/models/Epiculous_Crimson_Dawn-v0.2.json similarity index 100% rename from data/Epiculous_Crimson_Dawn-v0.2.json rename to data/models/Epiculous_Crimson_Dawn-v0.2.json diff --git a/data/Epiculous_NovaSpark.json b/data/models/Epiculous_NovaSpark.json similarity index 100% rename from data/Epiculous_NovaSpark.json rename to data/models/Epiculous_NovaSpark.json diff --git a/data/Epiculous_Violet_Twilight-v0.2.json b/data/models/Epiculous_Violet_Twilight-v0.2.json similarity index 100% rename from data/Epiculous_Violet_Twilight-v0.2.json rename to data/models/Epiculous_Violet_Twilight-v0.2.json diff --git a/data/EpistemeAI2_Athene-codegemma-2-7b-it-alpaca-v1.2.json b/data/models/EpistemeAI2_Athene-codegemma-2-7b-it-alpaca-v1.2.json similarity index 100% rename from data/EpistemeAI2_Athene-codegemma-2-7b-it-alpaca-v1.2.json rename to data/models/EpistemeAI2_Athene-codegemma-2-7b-it-alpaca-v1.2.json diff --git a/data/EpistemeAI2_Fireball-12B-v1.2.json b/data/models/EpistemeAI2_Fireball-12B-v1.2.json similarity index 100% rename from data/EpistemeAI2_Fireball-12B-v1.2.json rename to data/models/EpistemeAI2_Fireball-12B-v1.2.json diff --git a/data/EpistemeAI2_Fireball-Alpaca-Llama3.1-8B-Philos.json b/data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1-8B-Philos.json similarity index 100% rename from data/EpistemeAI2_Fireball-Alpaca-Llama3.1-8B-Philos.json rename to data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1-8B-Philos.json diff --git a/data/EpistemeAI2_Fireball-Alpaca-Llama3.1.01-8B-Philos.json b/data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1.01-8B-Philos.json similarity index 100% rename from data/EpistemeAI2_Fireball-Alpaca-Llama3.1.01-8B-Philos.json rename to data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1.01-8B-Philos.json diff --git a/data/EpistemeAI2_Fireball-Alpaca-Llama3.1.03-8B-Philos.json b/data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1.03-8B-Philos.json similarity index 100% rename from data/EpistemeAI2_Fireball-Alpaca-Llama3.1.03-8B-Philos.json rename to data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1.03-8B-Philos.json diff --git a/data/EpistemeAI2_Fireball-Alpaca-Llama3.1.04-8B-Philos.json b/data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1.04-8B-Philos.json similarity index 100% rename from data/EpistemeAI2_Fireball-Alpaca-Llama3.1.04-8B-Philos.json rename to data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1.04-8B-Philos.json diff --git a/data/EpistemeAI2_Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo.json b/data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo.json similarity index 100% rename from data/EpistemeAI2_Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo.json rename to data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo.json diff --git a/data/EpistemeAI2_Fireball-Alpaca-Llama3.1.07-8B-Philos-Math.json b/data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1.07-8B-Philos-Math.json similarity index 100% rename from data/EpistemeAI2_Fireball-Alpaca-Llama3.1.07-8B-Philos-Math.json rename to data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1.07-8B-Philos-Math.json diff --git a/data/EpistemeAI2_Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection.json b/data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection.json similarity index 100% rename from data/EpistemeAI2_Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection.json rename to data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection.json diff --git a/data/EpistemeAI2_Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1.json b/data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1.json similarity index 100% rename from data/EpistemeAI2_Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1.json rename to data/models/EpistemeAI2_Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1.json diff --git a/data/EpistemeAI2_Fireball-Llama-3.1-8B-Philos-Reflection.json b/data/models/EpistemeAI2_Fireball-Llama-3.1-8B-Philos-Reflection.json similarity index 100% rename from data/EpistemeAI2_Fireball-Llama-3.1-8B-Philos-Reflection.json rename to data/models/EpistemeAI2_Fireball-Llama-3.1-8B-Philos-Reflection.json diff --git a/data/EpistemeAI2_Fireball-MathMistral-Nemo-Base-2407-v2dpo.json b/data/models/EpistemeAI2_Fireball-MathMistral-Nemo-Base-2407-v2dpo.json similarity index 100% rename from data/EpistemeAI2_Fireball-MathMistral-Nemo-Base-2407-v2dpo.json rename to data/models/EpistemeAI2_Fireball-MathMistral-Nemo-Base-2407-v2dpo.json diff --git a/data/EpistemeAI2_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math.json b/data/models/EpistemeAI2_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math.json similarity index 100% rename from data/EpistemeAI2_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math.json rename to data/models/EpistemeAI2_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math.json diff --git a/data/EpistemeAI2_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT.json b/data/models/EpistemeAI2_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT.json similarity index 100% rename from data/EpistemeAI2_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT.json rename to data/models/EpistemeAI2_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT.json diff --git a/data/EpistemeAI2_Fireball-Phi-3-medium-4k-inst-Philos.json b/data/models/EpistemeAI2_Fireball-Phi-3-medium-4k-inst-Philos.json similarity index 100% rename from data/EpistemeAI2_Fireball-Phi-3-medium-4k-inst-Philos.json rename to data/models/EpistemeAI2_Fireball-Phi-3-medium-4k-inst-Philos.json diff --git a/data/EpistemeAI_Alpaca-Llama3.1-8B.json b/data/models/EpistemeAI_Alpaca-Llama3.1-8B.json similarity index 100% rename from data/EpistemeAI_Alpaca-Llama3.1-8B.json rename to data/models/EpistemeAI_Alpaca-Llama3.1-8B.json diff --git a/data/EpistemeAI_Athena-gemma-2-2b-it-Philos.json b/data/models/EpistemeAI_Athena-gemma-2-2b-it-Philos.json similarity index 100% rename from data/EpistemeAI_Athena-gemma-2-2b-it-Philos.json rename to data/models/EpistemeAI_Athena-gemma-2-2b-it-Philos.json diff --git a/data/EpistemeAI_Athena-gemma-2-2b-it.json b/data/models/EpistemeAI_Athena-gemma-2-2b-it.json similarity index 100% rename from data/EpistemeAI_Athena-gemma-2-2b-it.json rename to data/models/EpistemeAI_Athena-gemma-2-2b-it.json diff --git a/data/EpistemeAI_Athene-codegemma-2-7b-it-alpaca-v1.3.json b/data/models/EpistemeAI_Athene-codegemma-2-7b-it-alpaca-v1.3.json similarity index 100% rename from data/EpistemeAI_Athene-codegemma-2-7b-it-alpaca-v1.3.json rename to data/models/EpistemeAI_Athene-codegemma-2-7b-it-alpaca-v1.3.json diff --git a/data/EpistemeAI_DeepPhi-3.5-mini-instruct.json b/data/models/EpistemeAI_DeepPhi-3.5-mini-instruct.json similarity index 100% rename from data/EpistemeAI_DeepPhi-3.5-mini-instruct.json rename to data/models/EpistemeAI_DeepPhi-3.5-mini-instruct.json diff --git a/data/EpistemeAI_DeepThinkers-Phi4.json b/data/models/EpistemeAI_DeepThinkers-Phi4.json similarity index 100% rename from data/EpistemeAI_DeepThinkers-Phi4.json rename to data/models/EpistemeAI_DeepThinkers-Phi4.json diff --git a/data/EpistemeAI_FineLlama3.1-8B-Instruct.json b/data/models/EpistemeAI_FineLlama3.1-8B-Instruct.json similarity index 100% rename from data/EpistemeAI_FineLlama3.1-8B-Instruct.json rename to data/models/EpistemeAI_FineLlama3.1-8B-Instruct.json diff --git a/data/EpistemeAI_Fireball-12B-v1.13a-philosophers.json b/data/models/EpistemeAI_Fireball-12B-v1.13a-philosophers.json similarity index 100% rename from data/EpistemeAI_Fireball-12B-v1.13a-philosophers.json rename to data/models/EpistemeAI_Fireball-12B-v1.13a-philosophers.json diff --git a/data/EpistemeAI_Fireball-12B.json b/data/models/EpistemeAI_Fireball-12B.json similarity index 100% rename from data/EpistemeAI_Fireball-12B.json rename to data/models/EpistemeAI_Fireball-12B.json diff --git a/data/EpistemeAI_Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200.json b/data/models/EpistemeAI_Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200.json similarity index 100% rename from data/EpistemeAI_Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200.json rename to data/models/EpistemeAI_Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200.json diff --git a/data/EpistemeAI_Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta.json b/data/models/EpistemeAI_Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta.json similarity index 100% rename from data/EpistemeAI_Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta.json rename to data/models/EpistemeAI_Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta.json diff --git a/data/EpistemeAI_Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2.json b/data/models/EpistemeAI_Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2.json similarity index 100% rename from data/EpistemeAI_Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2.json rename to data/models/EpistemeAI_Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2.json diff --git a/data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto.json b/data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto.json similarity index 100% rename from data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto.json rename to data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto.json diff --git a/data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto.json b/data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto.json similarity index 100% rename from data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto.json rename to data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto.json diff --git a/data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds.json b/data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds.json similarity index 100% rename from data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds.json rename to data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds.json diff --git a/data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code.json b/data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code.json similarity index 100% rename from data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code.json rename to data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code.json diff --git a/data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K.json b/data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K.json similarity index 100% rename from data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K.json rename to data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K.json diff --git a/data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT.json b/data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT.json similarity index 100% rename from data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT.json rename to data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT.json diff --git a/data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto.json b/data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto.json similarity index 100% rename from data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto.json rename to data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto.json diff --git a/data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Math.json b/data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Math.json similarity index 100% rename from data/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Math.json rename to data/models/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Math.json diff --git a/data/EpistemeAI_Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.json b/data/models/EpistemeAI_Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.json similarity index 100% rename from data/EpistemeAI_Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.json rename to data/models/EpistemeAI_Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.json diff --git a/data/EpistemeAI_Fireball-Mistral-Nemo-Base-2407-v1-DPO2.json b/data/models/EpistemeAI_Fireball-Mistral-Nemo-Base-2407-v1-DPO2.json similarity index 100% rename from data/EpistemeAI_Fireball-Mistral-Nemo-Base-2407-v1-DPO2.json rename to data/models/EpistemeAI_Fireball-Mistral-Nemo-Base-2407-v1-DPO2.json diff --git a/data/EpistemeAI_Fireball-R1-Llama-3.1-8B-Medical-COT.json b/data/models/EpistemeAI_Fireball-R1-Llama-3.1-8B-Medical-COT.json similarity index 100% rename from data/EpistemeAI_Fireball-R1-Llama-3.1-8B-Medical-COT.json rename to data/models/EpistemeAI_Fireball-R1-Llama-3.1-8B-Medical-COT.json diff --git a/data/EpistemeAI_Fireball-R1-Llama-3.1-8B.json b/data/models/EpistemeAI_Fireball-R1-Llama-3.1-8B.json similarity index 100% rename from data/EpistemeAI_Fireball-R1-Llama-3.1-8B.json rename to data/models/EpistemeAI_Fireball-R1-Llama-3.1-8B.json diff --git a/data/EpistemeAI_Fireball-R1.1-Llama-3.1-8B.json b/data/models/EpistemeAI_Fireball-R1.1-Llama-3.1-8B.json similarity index 100% rename from data/EpistemeAI_Fireball-R1.1-Llama-3.1-8B.json rename to data/models/EpistemeAI_Fireball-R1.1-Llama-3.1-8B.json diff --git a/data/EpistemeAI_Llama-3.2-3B-Agent007-Coder.json b/data/models/EpistemeAI_Llama-3.2-3B-Agent007-Coder.json similarity index 100% rename from data/EpistemeAI_Llama-3.2-3B-Agent007-Coder.json rename to data/models/EpistemeAI_Llama-3.2-3B-Agent007-Coder.json diff --git a/data/EpistemeAI_Mistral-Nemo-Instruct-12B-Philosophy-Math.json b/data/models/EpistemeAI_Mistral-Nemo-Instruct-12B-Philosophy-Math.json similarity index 100% rename from data/EpistemeAI_Mistral-Nemo-Instruct-12B-Philosophy-Math.json rename to data/models/EpistemeAI_Mistral-Nemo-Instruct-12B-Philosophy-Math.json diff --git a/data/EpistemeAI_OpenReasoner-Llama-3.2-3B-rs1.0.json b/data/models/EpistemeAI_OpenReasoner-Llama-3.2-3B-rs1.0.json similarity index 100% rename from data/EpistemeAI_OpenReasoner-Llama-3.2-3B-rs1.0.json rename to data/models/EpistemeAI_OpenReasoner-Llama-3.2-3B-rs1.0.json diff --git a/data/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy.json b/data/models/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy.json similarity index 100% rename from data/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy.json rename to data/models/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy.json diff --git a/data/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic.json b/data/models/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic.json similarity index 100% rename from data/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic.json rename to data/models/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic.json diff --git a/data/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent.json b/data/models/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent.json similarity index 100% rename from data/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent.json rename to data/models/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent.json diff --git a/data/EpistemeAI_Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO.json b/data/models/EpistemeAI_Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO.json similarity index 100% rename from data/EpistemeAI_Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO.json rename to data/models/EpistemeAI_Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO.json diff --git a/data/EpistemeAI_Reasoning-Llama-3.1-CoT-RE1-NMT.json b/data/models/EpistemeAI_Reasoning-Llama-3.1-CoT-RE1-NMT.json similarity index 100% rename from data/EpistemeAI_Reasoning-Llama-3.1-CoT-RE1-NMT.json rename to data/models/EpistemeAI_Reasoning-Llama-3.1-CoT-RE1-NMT.json diff --git a/data/EpistemeAI_Reasoning-Llama-3.2-1B-Instruct-v1.2.json b/data/models/EpistemeAI_Reasoning-Llama-3.2-1B-Instruct-v1.2.json similarity index 100% rename from data/EpistemeAI_Reasoning-Llama-3.2-1B-Instruct-v1.2.json rename to data/models/EpistemeAI_Reasoning-Llama-3.2-1B-Instruct-v1.2.json diff --git a/data/EpistemeAI_Reasoning-Llama-3.2-1B-Instruct-v1.3.json b/data/models/EpistemeAI_Reasoning-Llama-3.2-1B-Instruct-v1.3.json similarity index 100% rename from data/EpistemeAI_Reasoning-Llama-3.2-1B-Instruct-v1.3.json rename to data/models/EpistemeAI_Reasoning-Llama-3.2-1B-Instruct-v1.3.json diff --git a/data/EpistemeAI_Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO.json b/data/models/EpistemeAI_Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO.json similarity index 100% rename from data/EpistemeAI_Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO.json rename to data/models/EpistemeAI_Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO.json diff --git a/data/EpistemeAI_Reasoning-Llama-3.2-3B-Math-Instruct-RE1.json b/data/models/EpistemeAI_Reasoning-Llama-3.2-3B-Math-Instruct-RE1.json similarity index 100% rename from data/EpistemeAI_Reasoning-Llama-3.2-3B-Math-Instruct-RE1.json rename to data/models/EpistemeAI_Reasoning-Llama-3.2-3B-Math-Instruct-RE1.json diff --git a/data/EpistemeAI_ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math.json b/data/models/EpistemeAI_ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math.json similarity index 100% rename from data/EpistemeAI_ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math.json rename to data/models/EpistemeAI_ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math.json diff --git a/data/EpistemeAI_ReasoningCore-3B-0.json b/data/models/EpistemeAI_ReasoningCore-3B-0.json similarity index 100% rename from data/EpistemeAI_ReasoningCore-3B-0.json rename to data/models/EpistemeAI_ReasoningCore-3B-0.json diff --git a/data/EpistemeAI_ReasoningCore-3B-Instruct-r01-Reflect.json b/data/models/EpistemeAI_ReasoningCore-3B-Instruct-r01-Reflect.json similarity index 100% rename from data/EpistemeAI_ReasoningCore-3B-Instruct-r01-Reflect.json rename to data/models/EpistemeAI_ReasoningCore-3B-Instruct-r01-Reflect.json diff --git a/data/EpistemeAI_ReasoningCore-3B-R01.json b/data/models/EpistemeAI_ReasoningCore-3B-R01.json similarity index 100% rename from data/EpistemeAI_ReasoningCore-3B-R01.json rename to data/models/EpistemeAI_ReasoningCore-3B-R01.json diff --git a/data/EpistemeAI_ReasoningCore-3B-RE1-V2.json b/data/models/EpistemeAI_ReasoningCore-3B-RE1-V2.json similarity index 100% rename from data/EpistemeAI_ReasoningCore-3B-RE1-V2.json rename to data/models/EpistemeAI_ReasoningCore-3B-RE1-V2.json diff --git a/data/EpistemeAI_ReasoningCore-3B-RE1-V2A.json b/data/models/EpistemeAI_ReasoningCore-3B-RE1-V2A.json similarity index 100% rename from data/EpistemeAI_ReasoningCore-3B-RE1-V2A.json rename to data/models/EpistemeAI_ReasoningCore-3B-RE1-V2A.json diff --git a/data/EpistemeAI_ReasoningCore-3B-RE1-V2B.json b/data/models/EpistemeAI_ReasoningCore-3B-RE1-V2B.json similarity index 100% rename from data/EpistemeAI_ReasoningCore-3B-RE1-V2B.json rename to data/models/EpistemeAI_ReasoningCore-3B-RE1-V2B.json diff --git a/data/EpistemeAI_ReasoningCore-3B-RE1-V2C.json b/data/models/EpistemeAI_ReasoningCore-3B-RE1-V2C.json similarity index 100% rename from data/EpistemeAI_ReasoningCore-3B-RE1-V2C.json rename to data/models/EpistemeAI_ReasoningCore-3B-RE1-V2C.json diff --git a/data/EpistemeAI_ReasoningCore-3B-T1-V1.json b/data/models/EpistemeAI_ReasoningCore-3B-T1-V1.json similarity index 100% rename from data/EpistemeAI_ReasoningCore-3B-T1-V1.json rename to data/models/EpistemeAI_ReasoningCore-3B-T1-V1.json diff --git a/data/EpistemeAI_ReasoningCore-3B-T1_1.json b/data/models/EpistemeAI_ReasoningCore-3B-T1_1.json similarity index 100% rename from data/EpistemeAI_ReasoningCore-3B-T1_1.json rename to data/models/EpistemeAI_ReasoningCore-3B-T1_1.json diff --git a/data/Eric111_CatunaMayo-DPO.json b/data/models/Eric111_CatunaMayo-DPO.json similarity index 100% rename from data/Eric111_CatunaMayo-DPO.json rename to data/models/Eric111_CatunaMayo-DPO.json diff --git a/data/Eric111_CatunaMayo.json b/data/models/Eric111_CatunaMayo.json similarity index 100% rename from data/Eric111_CatunaMayo.json rename to data/models/Eric111_CatunaMayo.json diff --git a/data/Etherll_Chocolatine-3B-Instruct-DPO-Revised-Ties-v2.json b/data/models/Etherll_Chocolatine-3B-Instruct-DPO-Revised-Ties-v2.json similarity index 100% rename from data/Etherll_Chocolatine-3B-Instruct-DPO-Revised-Ties-v2.json rename to data/models/Etherll_Chocolatine-3B-Instruct-DPO-Revised-Ties-v2.json diff --git a/data/Etherll_Chocolatine-3B-Instruct-DPO-Revised-Ties.json b/data/models/Etherll_Chocolatine-3B-Instruct-DPO-Revised-Ties.json similarity index 100% rename from data/Etherll_Chocolatine-3B-Instruct-DPO-Revised-Ties.json rename to data/models/Etherll_Chocolatine-3B-Instruct-DPO-Revised-Ties.json diff --git a/data/Etherll_Herplete-LLM-Llama-3.1-8b-Ties.json b/data/models/Etherll_Herplete-LLM-Llama-3.1-8b-Ties.json similarity index 100% rename from data/Etherll_Herplete-LLM-Llama-3.1-8b-Ties.json rename to data/models/Etherll_Herplete-LLM-Llama-3.1-8b-Ties.json diff --git a/data/Etherll_Herplete-LLM-Llama-3.1-8b.json b/data/models/Etherll_Herplete-LLM-Llama-3.1-8b.json similarity index 100% rename from data/Etherll_Herplete-LLM-Llama-3.1-8b.json rename to data/models/Etherll_Herplete-LLM-Llama-3.1-8b.json diff --git a/data/Etherll_Qwen2.5-7B-della-test.json b/data/models/Etherll_Qwen2.5-7B-della-test.json similarity index 100% rename from data/Etherll_Qwen2.5-7B-della-test.json rename to data/models/Etherll_Qwen2.5-7B-della-test.json diff --git a/data/Etherll_Qwen2.5-Coder-7B-Instruct-Ties.json b/data/models/Etherll_Qwen2.5-Coder-7B-Instruct-Ties.json similarity index 100% rename from data/Etherll_Qwen2.5-Coder-7B-Instruct-Ties.json rename to data/models/Etherll_Qwen2.5-Coder-7B-Instruct-Ties.json diff --git a/data/Etherll_Replete-LLM-V3-Llama-3.1-8b.json b/data/models/Etherll_Replete-LLM-V3-Llama-3.1-8b.json similarity index 100% rename from data/Etherll_Replete-LLM-V3-Llama-3.1-8b.json rename to data/models/Etherll_Replete-LLM-V3-Llama-3.1-8b.json diff --git a/data/Etherll_SuperHermes.json b/data/models/Etherll_SuperHermes.json similarity index 100% rename from data/Etherll_SuperHermes.json rename to data/models/Etherll_SuperHermes.json diff --git a/data/Eurdem_Defne-llama3.1-8B.json b/data/models/Eurdem_Defne-llama3.1-8B.json similarity index 100% rename from data/Eurdem_Defne-llama3.1-8B.json rename to data/models/Eurdem_Defne-llama3.1-8B.json diff --git a/data/FINGU-AI_Chocolatine-Fusion-14B.json b/data/models/FINGU-AI_Chocolatine-Fusion-14B.json similarity index 100% rename from data/FINGU-AI_Chocolatine-Fusion-14B.json rename to data/models/FINGU-AI_Chocolatine-Fusion-14B.json diff --git a/data/FINGU-AI_L3-8B.json b/data/models/FINGU-AI_L3-8B.json similarity index 100% rename from data/FINGU-AI_L3-8B.json rename to data/models/FINGU-AI_L3-8B.json diff --git a/data/FINGU-AI_Phi-4-RRStock.json b/data/models/FINGU-AI_Phi-4-RRStock.json similarity index 100% rename from data/FINGU-AI_Phi-4-RRStock.json rename to data/models/FINGU-AI_Phi-4-RRStock.json diff --git a/data/FINGU-AI_Q-Small-3B.json b/data/models/FINGU-AI_Q-Small-3B.json similarity index 100% rename from data/FINGU-AI_Q-Small-3B.json rename to data/models/FINGU-AI_Q-Small-3B.json diff --git a/data/FINGU-AI_QwQ-Buddy-32B-Alpha.json b/data/models/FINGU-AI_QwQ-Buddy-32B-Alpha.json similarity index 100% rename from data/FINGU-AI_QwQ-Buddy-32B-Alpha.json rename to data/models/FINGU-AI_QwQ-Buddy-32B-Alpha.json diff --git a/data/FINGU-AI_RomboUltima-32B.json b/data/models/FINGU-AI_RomboUltima-32B.json similarity index 100% rename from data/FINGU-AI_RomboUltima-32B.json rename to data/models/FINGU-AI_RomboUltima-32B.json diff --git a/data/FINGU-AI_Ultimos-32B.json b/data/models/FINGU-AI_Ultimos-32B.json similarity index 100% rename from data/FINGU-AI_Ultimos-32B.json rename to data/models/FINGU-AI_Ultimos-32B.json diff --git a/data/FallenMerick_Chewy-Lemon-Cookie-11B.json b/data/models/FallenMerick_Chewy-Lemon-Cookie-11B.json similarity index 100% rename from data/FallenMerick_Chewy-Lemon-Cookie-11B.json rename to data/models/FallenMerick_Chewy-Lemon-Cookie-11B.json diff --git a/data/Felladrin_Llama-160M-Chat-v1.json b/data/models/Felladrin_Llama-160M-Chat-v1.json similarity index 100% rename from data/Felladrin_Llama-160M-Chat-v1.json rename to data/models/Felladrin_Llama-160M-Chat-v1.json diff --git a/data/Felladrin_Minueza-32M-UltraChat.json b/data/models/Felladrin_Minueza-32M-UltraChat.json similarity index 100% rename from data/Felladrin_Minueza-32M-UltraChat.json rename to data/models/Felladrin_Minueza-32M-UltraChat.json diff --git a/data/FlofloB_100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json b/data/models/FlofloB_100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json similarity index 100% rename from data/FlofloB_100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json rename to data/models/FlofloB_100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json diff --git a/data/FlofloB_10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit.json b/data/models/FlofloB_10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit.json similarity index 100% rename from data/FlofloB_10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit.json rename to data/models/FlofloB_10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit.json diff --git a/data/FlofloB_10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json b/data/models/FlofloB_10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json similarity index 100% rename from data/FlofloB_10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json rename to data/models/FlofloB_10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json diff --git a/data/FlofloB_40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json b/data/models/FlofloB_40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json similarity index 100% rename from data/FlofloB_40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json rename to data/models/FlofloB_40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json diff --git a/data/FlofloB_83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json b/data/models/FlofloB_83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json similarity index 100% rename from data/FlofloB_83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json rename to data/models/FlofloB_83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.json diff --git a/data/FlofloB_smollm2-135M_pretrained_1000k_fineweb.json b/data/models/FlofloB_smollm2-135M_pretrained_1000k_fineweb.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_1000k_fineweb.json rename to data/models/FlofloB_smollm2-135M_pretrained_1000k_fineweb.json diff --git a/data/FlofloB_smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed.json b/data/models/FlofloB_smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed.json rename to data/models/FlofloB_smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed.json diff --git a/data/FlofloB_smollm2-135M_pretrained_1000k_fineweb_uncovai_selected.json b/data/models/FlofloB_smollm2-135M_pretrained_1000k_fineweb_uncovai_selected.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_1000k_fineweb_uncovai_selected.json rename to data/models/FlofloB_smollm2-135M_pretrained_1000k_fineweb_uncovai_selected.json diff --git a/data/FlofloB_smollm2-135M_pretrained_1200k_fineweb.json b/data/models/FlofloB_smollm2-135M_pretrained_1200k_fineweb.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_1200k_fineweb.json rename to data/models/FlofloB_smollm2-135M_pretrained_1200k_fineweb.json diff --git a/data/FlofloB_smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed.json b/data/models/FlofloB_smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed.json rename to data/models/FlofloB_smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed.json diff --git a/data/FlofloB_smollm2-135M_pretrained_1200k_fineweb_uncovai_selected.json b/data/models/FlofloB_smollm2-135M_pretrained_1200k_fineweb_uncovai_selected.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_1200k_fineweb_uncovai_selected.json rename to data/models/FlofloB_smollm2-135M_pretrained_1200k_fineweb_uncovai_selected.json diff --git a/data/FlofloB_smollm2-135M_pretrained_1400k_fineweb.json b/data/models/FlofloB_smollm2-135M_pretrained_1400k_fineweb.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_1400k_fineweb.json rename to data/models/FlofloB_smollm2-135M_pretrained_1400k_fineweb.json diff --git a/data/FlofloB_smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed.json b/data/models/FlofloB_smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed.json rename to data/models/FlofloB_smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed.json diff --git a/data/FlofloB_smollm2-135M_pretrained_1400k_fineweb_uncovai_selected.json b/data/models/FlofloB_smollm2-135M_pretrained_1400k_fineweb_uncovai_selected.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_1400k_fineweb_uncovai_selected.json rename to data/models/FlofloB_smollm2-135M_pretrained_1400k_fineweb_uncovai_selected.json diff --git a/data/FlofloB_smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed.json b/data/models/FlofloB_smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed.json rename to data/models/FlofloB_smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed.json diff --git a/data/FlofloB_smollm2-135M_pretrained_200k_fineweb_uncovai_selected.json b/data/models/FlofloB_smollm2-135M_pretrained_200k_fineweb_uncovai_selected.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_200k_fineweb_uncovai_selected.json rename to data/models/FlofloB_smollm2-135M_pretrained_200k_fineweb_uncovai_selected.json diff --git a/data/FlofloB_smollm2-135M_pretrained_400k_fineweb.json b/data/models/FlofloB_smollm2-135M_pretrained_400k_fineweb.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_400k_fineweb.json rename to data/models/FlofloB_smollm2-135M_pretrained_400k_fineweb.json diff --git a/data/FlofloB_smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed.json b/data/models/FlofloB_smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed.json rename to data/models/FlofloB_smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed.json diff --git a/data/FlofloB_smollm2-135M_pretrained_400k_fineweb_uncovai_selected.json b/data/models/FlofloB_smollm2-135M_pretrained_400k_fineweb_uncovai_selected.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_400k_fineweb_uncovai_selected.json rename to data/models/FlofloB_smollm2-135M_pretrained_400k_fineweb_uncovai_selected.json diff --git a/data/FlofloB_smollm2-135M_pretrained_600k_fineweb.json b/data/models/FlofloB_smollm2-135M_pretrained_600k_fineweb.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_600k_fineweb.json rename to data/models/FlofloB_smollm2-135M_pretrained_600k_fineweb.json diff --git a/data/FlofloB_smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed.json b/data/models/FlofloB_smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed.json rename to data/models/FlofloB_smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed.json diff --git a/data/FlofloB_smollm2-135M_pretrained_600k_fineweb_uncovai_selected.json b/data/models/FlofloB_smollm2-135M_pretrained_600k_fineweb_uncovai_selected.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_600k_fineweb_uncovai_selected.json rename to data/models/FlofloB_smollm2-135M_pretrained_600k_fineweb_uncovai_selected.json diff --git a/data/FlofloB_smollm2-135M_pretrained_800k_fineweb.json b/data/models/FlofloB_smollm2-135M_pretrained_800k_fineweb.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_800k_fineweb.json rename to data/models/FlofloB_smollm2-135M_pretrained_800k_fineweb.json diff --git a/data/FlofloB_smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed.json b/data/models/FlofloB_smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed.json rename to data/models/FlofloB_smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed.json diff --git a/data/FlofloB_smollm2-135M_pretrained_800k_fineweb_uncovai_selected.json b/data/models/FlofloB_smollm2-135M_pretrained_800k_fineweb_uncovai_selected.json similarity index 100% rename from data/FlofloB_smollm2-135M_pretrained_800k_fineweb_uncovai_selected.json rename to data/models/FlofloB_smollm2-135M_pretrained_800k_fineweb_uncovai_selected.json diff --git a/data/FlofloB_smollm2_pretrained_200k_fineweb.json b/data/models/FlofloB_smollm2_pretrained_200k_fineweb.json similarity index 100% rename from data/FlofloB_smollm2_pretrained_200k_fineweb.json rename to data/models/FlofloB_smollm2_pretrained_200k_fineweb.json diff --git a/data/FlofloB_test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit.json b/data/models/FlofloB_test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit.json similarity index 100% rename from data/FlofloB_test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit.json rename to data/models/FlofloB_test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit.json diff --git a/data/FuJhen_ft-openhermes-25-mistral-7b-irca-dpo-pairs.json b/data/models/FuJhen_ft-openhermes-25-mistral-7b-irca-dpo-pairs.json similarity index 100% rename from data/FuJhen_ft-openhermes-25-mistral-7b-irca-dpo-pairs.json rename to data/models/FuJhen_ft-openhermes-25-mistral-7b-irca-dpo-pairs.json diff --git a/data/FuJhen_mistral-instruct-7B-DPO.json b/data/models/FuJhen_mistral-instruct-7B-DPO.json similarity index 100% rename from data/FuJhen_mistral-instruct-7B-DPO.json rename to data/models/FuJhen_mistral-instruct-7B-DPO.json diff --git a/data/FuJhen_mistral_7b_v0.1_structedData_e2e.json b/data/models/FuJhen_mistral_7b_v0.1_structedData_e2e.json similarity index 100% rename from data/FuJhen_mistral_7b_v0.1_structedData_e2e.json rename to data/models/FuJhen_mistral_7b_v0.1_structedData_e2e.json diff --git a/data/FuJhen_mistral_7b_v0.1_structedData_viggo.json b/data/models/FuJhen_mistral_7b_v0.1_structedData_viggo.json similarity index 100% rename from data/FuJhen_mistral_7b_v0.1_structedData_viggo.json rename to data/models/FuJhen_mistral_7b_v0.1_structedData_viggo.json diff --git a/data/FuseAI_FuseChat-7B-v2.0.json b/data/models/FuseAI_FuseChat-7B-v2.0.json similarity index 100% rename from data/FuseAI_FuseChat-7B-v2.0.json rename to data/models/FuseAI_FuseChat-7B-v2.0.json diff --git a/data/FuseAI_FuseChat-Llama-3.1-8B-Instruct.json b/data/models/FuseAI_FuseChat-Llama-3.1-8B-Instruct.json similarity index 100% rename from data/FuseAI_FuseChat-Llama-3.1-8B-Instruct.json rename to data/models/FuseAI_FuseChat-Llama-3.1-8B-Instruct.json diff --git a/data/FuseAI_FuseChat-Llama-3.2-3B-Instruct.json b/data/models/FuseAI_FuseChat-Llama-3.2-3B-Instruct.json similarity index 100% rename from data/FuseAI_FuseChat-Llama-3.2-3B-Instruct.json rename to data/models/FuseAI_FuseChat-Llama-3.2-3B-Instruct.json diff --git a/data/FuseAI_FuseChat-Qwen-2.5-7B-Instruct.json b/data/models/FuseAI_FuseChat-Qwen-2.5-7B-Instruct.json similarity index 100% rename from data/FuseAI_FuseChat-Qwen-2.5-7B-Instruct.json rename to data/models/FuseAI_FuseChat-Qwen-2.5-7B-Instruct.json diff --git a/data/GalrionSoftworks_MN-LooseCannon-12B-v1.json b/data/models/GalrionSoftworks_MN-LooseCannon-12B-v1.json similarity index 100% rename from data/GalrionSoftworks_MN-LooseCannon-12B-v1.json rename to data/models/GalrionSoftworks_MN-LooseCannon-12B-v1.json diff --git a/data/GalrionSoftworks_MagnusIntellectus-12B-v1.json b/data/models/GalrionSoftworks_MagnusIntellectus-12B-v1.json similarity index 100% rename from data/GalrionSoftworks_MagnusIntellectus-12B-v1.json rename to data/models/GalrionSoftworks_MagnusIntellectus-12B-v1.json diff --git a/data/GenVRadmin_AryaBhatta-GemmaOrca-2-Merged.json b/data/models/GenVRadmin_AryaBhatta-GemmaOrca-2-Merged.json similarity index 100% rename from data/GenVRadmin_AryaBhatta-GemmaOrca-2-Merged.json rename to data/models/GenVRadmin_AryaBhatta-GemmaOrca-2-Merged.json diff --git a/data/GenVRadmin_AryaBhatta-GemmaOrca-Merged.json b/data/models/GenVRadmin_AryaBhatta-GemmaOrca-Merged.json similarity index 100% rename from data/GenVRadmin_AryaBhatta-GemmaOrca-Merged.json rename to data/models/GenVRadmin_AryaBhatta-GemmaOrca-Merged.json diff --git a/data/GenVRadmin_AryaBhatta-GemmaUltra-Merged.json b/data/models/GenVRadmin_AryaBhatta-GemmaUltra-Merged.json similarity index 100% rename from data/GenVRadmin_AryaBhatta-GemmaUltra-Merged.json rename to data/models/GenVRadmin_AryaBhatta-GemmaUltra-Merged.json diff --git a/data/GenVRadmin_llama38bGenZ_Vikas-Merged.json b/data/models/GenVRadmin_llama38bGenZ_Vikas-Merged.json similarity index 100% rename from data/GenVRadmin_llama38bGenZ_Vikas-Merged.json rename to data/models/GenVRadmin_llama38bGenZ_Vikas-Merged.json diff --git a/data/GoToCompany_gemma2-9b-cpt-sahabatai-v1-instruct.json b/data/models/GoToCompany_gemma2-9b-cpt-sahabatai-v1-instruct.json similarity index 100% rename from data/GoToCompany_gemma2-9b-cpt-sahabatai-v1-instruct.json rename to data/models/GoToCompany_gemma2-9b-cpt-sahabatai-v1-instruct.json diff --git a/data/GoToCompany_llama3-8b-cpt-sahabatai-v1-instruct.json b/data/models/GoToCompany_llama3-8b-cpt-sahabatai-v1-instruct.json similarity index 100% rename from data/GoToCompany_llama3-8b-cpt-sahabatai-v1-instruct.json rename to data/models/GoToCompany_llama3-8b-cpt-sahabatai-v1-instruct.json diff --git a/data/Goekdeniz-Guelmez_Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1.json b/data/models/Goekdeniz-Guelmez_Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1.json similarity index 100% rename from data/Goekdeniz-Guelmez_Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1.json rename to data/models/Goekdeniz-Guelmez_Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1.json diff --git a/data/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1.json b/data/models/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1.json similarity index 100% rename from data/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1.json rename to data/models/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1.json diff --git a/data/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2.json b/data/models/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2.json similarity index 100% rename from data/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2.json rename to data/models/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2.json diff --git a/data/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3.json b/data/models/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3.json similarity index 100% rename from data/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3.json rename to data/models/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3.json diff --git a/data/Goekdeniz-Guelmez_Josiefied-Qwen2.5-14B-Instruct-abliterated-v4.json b/data/models/Goekdeniz-Guelmez_Josiefied-Qwen2.5-14B-Instruct-abliterated-v4.json similarity index 100% rename from data/Goekdeniz-Guelmez_Josiefied-Qwen2.5-14B-Instruct-abliterated-v4.json rename to data/models/Goekdeniz-Guelmez_Josiefied-Qwen2.5-14B-Instruct-abliterated-v4.json diff --git a/data/Goekdeniz-Guelmez_Josiefied-Qwen2.5-7B-Instruct-abliterated-v2.json b/data/models/Goekdeniz-Guelmez_Josiefied-Qwen2.5-7B-Instruct-abliterated-v2.json similarity index 100% rename from data/Goekdeniz-Guelmez_Josiefied-Qwen2.5-7B-Instruct-abliterated-v2.json rename to data/models/Goekdeniz-Guelmez_Josiefied-Qwen2.5-7B-Instruct-abliterated-v2.json diff --git a/data/Goekdeniz-Guelmez_j.o.s.i.e.v4o-1.5b-dpo-stage1-v1.json b/data/models/Goekdeniz-Guelmez_j.o.s.i.e.v4o-1.5b-dpo-stage1-v1.json similarity index 100% rename from data/Goekdeniz-Guelmez_j.o.s.i.e.v4o-1.5b-dpo-stage1-v1.json rename to data/models/Goekdeniz-Guelmez_j.o.s.i.e.v4o-1.5b-dpo-stage1-v1.json diff --git a/data/Goekdeniz-Guelmez_josie-3b-v6.0.json b/data/models/Goekdeniz-Guelmez_josie-3b-v6.0.json similarity index 100% rename from data/Goekdeniz-Guelmez_josie-3b-v6.0.json rename to data/models/Goekdeniz-Guelmez_josie-3b-v6.0.json diff --git a/data/Goekdeniz-Guelmez_josie-7b-v6.0-step2000.json b/data/models/Goekdeniz-Guelmez_josie-7b-v6.0-step2000.json similarity index 100% rename from data/Goekdeniz-Guelmez_josie-7b-v6.0-step2000.json rename to data/models/Goekdeniz-Guelmez_josie-7b-v6.0-step2000.json diff --git a/data/Goekdeniz-Guelmez_josie-7b-v6.0.json b/data/models/Goekdeniz-Guelmez_josie-7b-v6.0.json similarity index 100% rename from data/Goekdeniz-Guelmez_josie-7b-v6.0.json rename to data/models/Goekdeniz-Guelmez_josie-7b-v6.0.json diff --git a/data/GreenNode_GreenNode-small-9B-it.json b/data/models/GreenNode_GreenNode-small-9B-it.json similarity index 100% rename from data/GreenNode_GreenNode-small-9B-it.json rename to data/models/GreenNode_GreenNode-small-9B-it.json diff --git a/data/GritLM_GritLM-7B-KTO.json b/data/models/GritLM_GritLM-7B-KTO.json similarity index 100% rename from data/GritLM_GritLM-7B-KTO.json rename to data/models/GritLM_GritLM-7B-KTO.json diff --git a/data/GritLM_GritLM-8x7B-KTO.json b/data/models/GritLM_GritLM-8x7B-KTO.json similarity index 100% rename from data/GritLM_GritLM-8x7B-KTO.json rename to data/models/GritLM_GritLM-8x7B-KTO.json diff --git a/data/Groq_Llama-3-Groq-8B-Tool-Use.json b/data/models/Groq_Llama-3-Groq-8B-Tool-Use.json similarity index 100% rename from data/Groq_Llama-3-Groq-8B-Tool-Use.json rename to data/models/Groq_Llama-3-Groq-8B-Tool-Use.json diff --git a/data/Gryphe_Pantheon-RP-1.0-8b-Llama-3.json b/data/models/Gryphe_Pantheon-RP-1.0-8b-Llama-3.json similarity index 100% rename from data/Gryphe_Pantheon-RP-1.0-8b-Llama-3.json rename to data/models/Gryphe_Pantheon-RP-1.0-8b-Llama-3.json diff --git a/data/Gryphe_Pantheon-RP-1.5-12b-Nemo.json b/data/models/Gryphe_Pantheon-RP-1.5-12b-Nemo.json similarity index 100% rename from data/Gryphe_Pantheon-RP-1.5-12b-Nemo.json rename to data/models/Gryphe_Pantheon-RP-1.5-12b-Nemo.json diff --git a/data/Gryphe_Pantheon-RP-1.6-12b-Nemo-KTO.json b/data/models/Gryphe_Pantheon-RP-1.6-12b-Nemo-KTO.json similarity index 100% rename from data/Gryphe_Pantheon-RP-1.6-12b-Nemo-KTO.json rename to data/models/Gryphe_Pantheon-RP-1.6-12b-Nemo-KTO.json diff --git a/data/Gryphe_Pantheon-RP-1.6-12b-Nemo.json b/data/models/Gryphe_Pantheon-RP-1.6-12b-Nemo.json similarity index 100% rename from data/Gryphe_Pantheon-RP-1.6-12b-Nemo.json rename to data/models/Gryphe_Pantheon-RP-1.6-12b-Nemo.json diff --git a/data/Gryphe_Pantheon-RP-Pure-1.6.2-22b-Small.json b/data/models/Gryphe_Pantheon-RP-Pure-1.6.2-22b-Small.json similarity index 100% rename from data/Gryphe_Pantheon-RP-Pure-1.6.2-22b-Small.json rename to data/models/Gryphe_Pantheon-RP-Pure-1.6.2-22b-Small.json diff --git a/data/GuilhermeNaturaUmana_Nature-Reason-1.2-reallysmall.json b/data/models/GuilhermeNaturaUmana_Nature-Reason-1.2-reallysmall.json similarity index 100% rename from data/GuilhermeNaturaUmana_Nature-Reason-1.2-reallysmall.json rename to data/models/GuilhermeNaturaUmana_Nature-Reason-1.2-reallysmall.json diff --git a/data/Gunulhona_Gemma-Ko-Merge-PEFT.json b/data/models/Gunulhona_Gemma-Ko-Merge-PEFT.json similarity index 100% rename from data/Gunulhona_Gemma-Ko-Merge-PEFT.json rename to data/models/Gunulhona_Gemma-Ko-Merge-PEFT.json diff --git a/data/Gunulhona_Gemma-Ko-Merge.json b/data/models/Gunulhona_Gemma-Ko-Merge.json similarity index 100% rename from data/Gunulhona_Gemma-Ko-Merge.json rename to data/models/Gunulhona_Gemma-Ko-Merge.json diff --git a/data/HFXM_RAMO-Llama3.1-8B.json b/data/models/HFXM_RAMO-Llama3.1-8B.json similarity index 100% rename from data/HFXM_RAMO-Llama3.1-8B.json rename to data/models/HFXM_RAMO-Llama3.1-8B.json diff --git a/data/HPAI-BSC_Llama3-Aloe-8B-Alpha.json b/data/models/HPAI-BSC_Llama3-Aloe-8B-Alpha.json similarity index 100% rename from data/HPAI-BSC_Llama3-Aloe-8B-Alpha.json rename to data/models/HPAI-BSC_Llama3-Aloe-8B-Alpha.json diff --git a/data/HPAI-BSC_Llama3.1-Aloe-Beta-8B.json b/data/models/HPAI-BSC_Llama3.1-Aloe-Beta-8B.json similarity index 100% rename from data/HPAI-BSC_Llama3.1-Aloe-Beta-8B.json rename to data/models/HPAI-BSC_Llama3.1-Aloe-Beta-8B.json diff --git a/data/HPAI-BSC_Qwen2.5-Aloe-Beta-7B.json b/data/models/HPAI-BSC_Qwen2.5-Aloe-Beta-7B.json similarity index 100% rename from data/HPAI-BSC_Qwen2.5-Aloe-Beta-7B.json rename to data/models/HPAI-BSC_Qwen2.5-Aloe-Beta-7B.json diff --git a/data/HarbingerX_Zeitgeist-3b-V1.2.json b/data/models/HarbingerX_Zeitgeist-3b-V1.2.json similarity index 100% rename from data/HarbingerX_Zeitgeist-3b-V1.2.json rename to data/models/HarbingerX_Zeitgeist-3b-V1.2.json diff --git a/data/HarbingerX_Zeitgeist-3b-V1.json b/data/models/HarbingerX_Zeitgeist-3b-V1.json similarity index 100% rename from data/HarbingerX_Zeitgeist-3b-V1.json rename to data/models/HarbingerX_Zeitgeist-3b-V1.json diff --git a/data/Hastagaras_L3.2-JametMini-3B-MK.III.json b/data/models/Hastagaras_L3.2-JametMini-3B-MK.III.json similarity index 100% rename from data/Hastagaras_L3.2-JametMini-3B-MK.III.json rename to data/models/Hastagaras_L3.2-JametMini-3B-MK.III.json diff --git a/data/Hastagaras_Llama-3.1-Jamet-8B-MK.I.json b/data/models/Hastagaras_Llama-3.1-Jamet-8B-MK.I.json similarity index 100% rename from data/Hastagaras_Llama-3.1-Jamet-8B-MK.I.json rename to data/models/Hastagaras_Llama-3.1-Jamet-8B-MK.I.json diff --git a/data/Hastagaras_Zabuza-8B-Llama-3.1.json b/data/models/Hastagaras_Zabuza-8B-Llama-3.1.json similarity index 100% rename from data/Hastagaras_Zabuza-8B-Llama-3.1.json rename to data/models/Hastagaras_Zabuza-8B-Llama-3.1.json diff --git a/data/HelpingAI_Cipher-20B.json b/data/models/HelpingAI_Cipher-20B.json similarity index 100% rename from data/HelpingAI_Cipher-20B.json rename to data/models/HelpingAI_Cipher-20B.json diff --git a/data/HelpingAI_Dhanishtha-Large.json b/data/models/HelpingAI_Dhanishtha-Large.json similarity index 100% rename from data/HelpingAI_Dhanishtha-Large.json rename to data/models/HelpingAI_Dhanishtha-Large.json diff --git a/data/HelpingAI_Priya-10B.json b/data/models/HelpingAI_Priya-10B.json similarity index 100% rename from data/HelpingAI_Priya-10B.json rename to data/models/HelpingAI_Priya-10B.json diff --git a/data/HelpingAI_Priya-3B.json b/data/models/HelpingAI_Priya-3B.json similarity index 100% rename from data/HelpingAI_Priya-3B.json rename to data/models/HelpingAI_Priya-3B.json diff --git a/data/HeraiHench_DeepSeek-R1-Qwen-Coder-8B.json b/data/models/HeraiHench_DeepSeek-R1-Qwen-Coder-8B.json similarity index 100% rename from data/HeraiHench_DeepSeek-R1-Qwen-Coder-8B.json rename to data/models/HeraiHench_DeepSeek-R1-Qwen-Coder-8B.json diff --git a/data/HeraiHench_Double-Down-Qwen-Math-7B.json b/data/models/HeraiHench_Double-Down-Qwen-Math-7B.json similarity index 100% rename from data/HeraiHench_Double-Down-Qwen-Math-7B.json rename to data/models/HeraiHench_Double-Down-Qwen-Math-7B.json diff --git a/data/HeraiHench_Marge-Qwen-Math-7B.json b/data/models/HeraiHench_Marge-Qwen-Math-7B.json similarity index 100% rename from data/HeraiHench_Marge-Qwen-Math-7B.json rename to data/models/HeraiHench_Marge-Qwen-Math-7B.json diff --git a/data/HeraiHench_Phi-4-slerp-ReasoningRP-14B.json b/data/models/HeraiHench_Phi-4-slerp-ReasoningRP-14B.json similarity index 100% rename from data/HeraiHench_Phi-4-slerp-ReasoningRP-14B.json rename to data/models/HeraiHench_Phi-4-slerp-ReasoningRP-14B.json diff --git a/data/HiroseKoichi_Llama-Salad-4x8B-V3.json b/data/models/HiroseKoichi_Llama-Salad-4x8B-V3.json similarity index 100% rename from data/HiroseKoichi_Llama-Salad-4x8B-V3.json rename to data/models/HiroseKoichi_Llama-Salad-4x8B-V3.json diff --git a/data/HoangHa_Pensez-Llama3.1-8B.json b/data/models/HoangHa_Pensez-Llama3.1-8B.json similarity index 100% rename from data/HoangHa_Pensez-Llama3.1-8B.json rename to data/models/HoangHa_Pensez-Llama3.1-8B.json diff --git a/data/HuggingFaceH4_starchat2-15b-v0.1.json b/data/models/HuggingFaceH4_starchat2-15b-v0.1.json similarity index 100% rename from data/HuggingFaceH4_starchat2-15b-v0.1.json rename to data/models/HuggingFaceH4_starchat2-15b-v0.1.json diff --git a/data/HuggingFaceH4_zephyr-7b-alpha.json b/data/models/HuggingFaceH4_zephyr-7b-alpha.json similarity index 100% rename from data/HuggingFaceH4_zephyr-7b-alpha.json rename to data/models/HuggingFaceH4_zephyr-7b-alpha.json diff --git a/data/HuggingFaceH4_zephyr-7b-beta.json b/data/models/HuggingFaceH4_zephyr-7b-beta.json similarity index 100% rename from data/HuggingFaceH4_zephyr-7b-beta.json rename to data/models/HuggingFaceH4_zephyr-7b-beta.json diff --git a/data/HuggingFaceH4_zephyr-7b-gemma-v0.1.json b/data/models/HuggingFaceH4_zephyr-7b-gemma-v0.1.json similarity index 100% rename from data/HuggingFaceH4_zephyr-7b-gemma-v0.1.json rename to data/models/HuggingFaceH4_zephyr-7b-gemma-v0.1.json diff --git a/data/HuggingFaceH4_zephyr-orpo-141b-A35b-v0.1.json b/data/models/HuggingFaceH4_zephyr-orpo-141b-A35b-v0.1.json similarity index 100% rename from data/HuggingFaceH4_zephyr-orpo-141b-A35b-v0.1.json rename to data/models/HuggingFaceH4_zephyr-orpo-141b-A35b-v0.1.json diff --git a/data/HuggingFaceTB_SmolLM-1.7B-Instruct.json b/data/models/HuggingFaceTB_SmolLM-1.7B-Instruct.json similarity index 100% rename from data/HuggingFaceTB_SmolLM-1.7B-Instruct.json rename to data/models/HuggingFaceTB_SmolLM-1.7B-Instruct.json diff --git a/data/HuggingFaceTB_SmolLM-1.7B.json b/data/models/HuggingFaceTB_SmolLM-1.7B.json similarity index 100% rename from data/HuggingFaceTB_SmolLM-1.7B.json rename to data/models/HuggingFaceTB_SmolLM-1.7B.json diff --git a/data/HuggingFaceTB_SmolLM-135M-Instruct.json b/data/models/HuggingFaceTB_SmolLM-135M-Instruct.json similarity index 100% rename from data/HuggingFaceTB_SmolLM-135M-Instruct.json rename to data/models/HuggingFaceTB_SmolLM-135M-Instruct.json diff --git a/data/HuggingFaceTB_SmolLM-135M.json b/data/models/HuggingFaceTB_SmolLM-135M.json similarity index 100% rename from data/HuggingFaceTB_SmolLM-135M.json rename to data/models/HuggingFaceTB_SmolLM-135M.json diff --git a/data/HuggingFaceTB_SmolLM-360M-Instruct.json b/data/models/HuggingFaceTB_SmolLM-360M-Instruct.json similarity index 100% rename from data/HuggingFaceTB_SmolLM-360M-Instruct.json rename to data/models/HuggingFaceTB_SmolLM-360M-Instruct.json diff --git a/data/HuggingFaceTB_SmolLM-360M.json b/data/models/HuggingFaceTB_SmolLM-360M.json similarity index 100% rename from data/HuggingFaceTB_SmolLM-360M.json rename to data/models/HuggingFaceTB_SmolLM-360M.json diff --git a/data/HuggingFaceTB_SmolLM2-1.7B-Instruct.json b/data/models/HuggingFaceTB_SmolLM2-1.7B-Instruct.json similarity index 100% rename from data/HuggingFaceTB_SmolLM2-1.7B-Instruct.json rename to data/models/HuggingFaceTB_SmolLM2-1.7B-Instruct.json diff --git a/data/HuggingFaceTB_SmolLM2-1.7B.json b/data/models/HuggingFaceTB_SmolLM2-1.7B.json similarity index 100% rename from data/HuggingFaceTB_SmolLM2-1.7B.json rename to data/models/HuggingFaceTB_SmolLM2-1.7B.json diff --git a/data/HuggingFaceTB_SmolLM2-135M-Instruct.json b/data/models/HuggingFaceTB_SmolLM2-135M-Instruct.json similarity index 100% rename from data/HuggingFaceTB_SmolLM2-135M-Instruct.json rename to data/models/HuggingFaceTB_SmolLM2-135M-Instruct.json diff --git a/data/HuggingFaceTB_SmolLM2-135M.json b/data/models/HuggingFaceTB_SmolLM2-135M.json similarity index 100% rename from data/HuggingFaceTB_SmolLM2-135M.json rename to data/models/HuggingFaceTB_SmolLM2-135M.json diff --git a/data/HuggingFaceTB_SmolLM2-360M-Instruct.json b/data/models/HuggingFaceTB_SmolLM2-360M-Instruct.json similarity index 100% rename from data/HuggingFaceTB_SmolLM2-360M-Instruct.json rename to data/models/HuggingFaceTB_SmolLM2-360M-Instruct.json diff --git a/data/HuggingFaceTB_SmolLM2-360M.json b/data/models/HuggingFaceTB_SmolLM2-360M.json similarity index 100% rename from data/HuggingFaceTB_SmolLM2-360M.json rename to data/models/HuggingFaceTB_SmolLM2-360M.json diff --git a/data/HumanLLMs_Humanish-LLama3-8B-Instruct.json b/data/models/HumanLLMs_Humanish-LLama3-8B-Instruct.json similarity index 100% rename from data/HumanLLMs_Humanish-LLama3-8B-Instruct.json rename to data/models/HumanLLMs_Humanish-LLama3-8B-Instruct.json diff --git a/data/HumanLLMs_Humanish-Mistral-Nemo-Instruct-2407.json b/data/models/HumanLLMs_Humanish-Mistral-Nemo-Instruct-2407.json similarity index 100% rename from data/HumanLLMs_Humanish-Mistral-Nemo-Instruct-2407.json rename to data/models/HumanLLMs_Humanish-Mistral-Nemo-Instruct-2407.json diff --git a/data/HumanLLMs_Humanish-Qwen2.5-7B-Instruct.json b/data/models/HumanLLMs_Humanish-Qwen2.5-7B-Instruct.json similarity index 100% rename from data/HumanLLMs_Humanish-Qwen2.5-7B-Instruct.json rename to data/models/HumanLLMs_Humanish-Qwen2.5-7B-Instruct.json diff --git a/data/IDEA-CCNL_Ziya-LLaMA-13B-v1.json b/data/models/IDEA-CCNL_Ziya-LLaMA-13B-v1.json similarity index 100% rename from data/IDEA-CCNL_Ziya-LLaMA-13B-v1.json rename to data/models/IDEA-CCNL_Ziya-LLaMA-13B-v1.json diff --git a/data/IDEA-CCNL_Ziya-LLaMA-7B-Reward.json b/data/models/IDEA-CCNL_Ziya-LLaMA-7B-Reward.json similarity index 100% rename from data/IDEA-CCNL_Ziya-LLaMA-7B-Reward.json rename to data/models/IDEA-CCNL_Ziya-LLaMA-7B-Reward.json diff --git a/data/INSAIT-Institute_BgGPT-Gemma-2-27B-IT-v1.0.json b/data/models/INSAIT-Institute_BgGPT-Gemma-2-27B-IT-v1.0.json similarity index 100% rename from data/INSAIT-Institute_BgGPT-Gemma-2-27B-IT-v1.0.json rename to data/models/INSAIT-Institute_BgGPT-Gemma-2-27B-IT-v1.0.json diff --git a/data/IlyaGusev_gemma-2-2b-it-abliterated.json b/data/models/IlyaGusev_gemma-2-2b-it-abliterated.json similarity index 100% rename from data/IlyaGusev_gemma-2-2b-it-abliterated.json rename to data/models/IlyaGusev_gemma-2-2b-it-abliterated.json diff --git a/data/IlyaGusev_gemma-2-9b-it-abliterated.json b/data/models/IlyaGusev_gemma-2-9b-it-abliterated.json similarity index 100% rename from data/IlyaGusev_gemma-2-9b-it-abliterated.json rename to data/models/IlyaGusev_gemma-2-9b-it-abliterated.json diff --git a/data/Infinirc_Infinirc-Llama3-8B-2G-Release-v1.0.json b/data/models/Infinirc_Infinirc-Llama3-8B-2G-Release-v1.0.json similarity index 100% rename from data/Infinirc_Infinirc-Llama3-8B-2G-Release-v1.0.json rename to data/models/Infinirc_Infinirc-Llama3-8B-2G-Release-v1.0.json diff --git a/data/Intel_neural-chat-7b-v3-1.json b/data/models/Intel_neural-chat-7b-v3-1.json similarity index 100% rename from data/Intel_neural-chat-7b-v3-1.json rename to data/models/Intel_neural-chat-7b-v3-1.json diff --git a/data/Intel_neural-chat-7b-v3-2.json b/data/models/Intel_neural-chat-7b-v3-2.json similarity index 100% rename from data/Intel_neural-chat-7b-v3-2.json rename to data/models/Intel_neural-chat-7b-v3-2.json diff --git a/data/Intel_neural-chat-7b-v3-3.json b/data/models/Intel_neural-chat-7b-v3-3.json similarity index 100% rename from data/Intel_neural-chat-7b-v3-3.json rename to data/models/Intel_neural-chat-7b-v3-3.json diff --git a/data/Intel_neural-chat-7b-v3.json b/data/models/Intel_neural-chat-7b-v3.json similarity index 100% rename from data/Intel_neural-chat-7b-v3.json rename to data/models/Intel_neural-chat-7b-v3.json diff --git a/data/IntervitensInc_internlm2_5-20b-llamafied.json b/data/models/IntervitensInc_internlm2_5-20b-llamafied.json similarity index 100% rename from data/IntervitensInc_internlm2_5-20b-llamafied.json rename to data/models/IntervitensInc_internlm2_5-20b-llamafied.json diff --git a/data/Invalid-Null_PeiYangMe-0.5.json b/data/models/Invalid-Null_PeiYangMe-0.5.json similarity index 100% rename from data/Invalid-Null_PeiYangMe-0.5.json rename to data/models/Invalid-Null_PeiYangMe-0.5.json diff --git a/data/Invalid-Null_PeiYangMe-0.7.json b/data/models/Invalid-Null_PeiYangMe-0.7.json similarity index 100% rename from data/Invalid-Null_PeiYangMe-0.7.json rename to data/models/Invalid-Null_PeiYangMe-0.7.json diff --git a/data/Isaak-Carter_JOSIEv4o-8b-stage1-v4.json b/data/models/Isaak-Carter_JOSIEv4o-8b-stage1-v4.json similarity index 100% rename from data/Isaak-Carter_JOSIEv4o-8b-stage1-v4.json rename to data/models/Isaak-Carter_JOSIEv4o-8b-stage1-v4.json diff --git a/data/Isaak-Carter_Josiefied-Qwen2.5-7B-Instruct-abliterated-v2.json b/data/models/Isaak-Carter_Josiefied-Qwen2.5-7B-Instruct-abliterated-v2.json similarity index 100% rename from data/Isaak-Carter_Josiefied-Qwen2.5-7B-Instruct-abliterated-v2.json rename to data/models/Isaak-Carter_Josiefied-Qwen2.5-7B-Instruct-abliterated-v2.json diff --git a/data/Isaak-Carter_Josiefied-Qwen2.5-7B-Instruct-abliterated.json b/data/models/Isaak-Carter_Josiefied-Qwen2.5-7B-Instruct-abliterated.json similarity index 100% rename from data/Isaak-Carter_Josiefied-Qwen2.5-7B-Instruct-abliterated.json rename to data/models/Isaak-Carter_Josiefied-Qwen2.5-7B-Instruct-abliterated.json diff --git a/data/J-LAB_Thynk_orpo.json b/data/models/J-LAB_Thynk_orpo.json similarity index 100% rename from data/J-LAB_Thynk_orpo.json rename to data/models/J-LAB_Thynk_orpo.json diff --git a/data/JackFram_llama-160m.json b/data/models/JackFram_llama-160m.json similarity index 100% rename from data/JackFram_llama-160m.json rename to data/models/JackFram_llama-160m.json diff --git a/data/JackFram_llama-68m.json b/data/models/JackFram_llama-68m.json similarity index 100% rename from data/JackFram_llama-68m.json rename to data/models/JackFram_llama-68m.json diff --git a/data/Jacoby746_Casual-Magnum-34B.json b/data/models/Jacoby746_Casual-Magnum-34B.json similarity index 100% rename from data/Jacoby746_Casual-Magnum-34B.json rename to data/models/Jacoby746_Casual-Magnum-34B.json diff --git a/data/Jacoby746_Inf-Silent-Kunoichi-v0.1-2x7B.json b/data/models/Jacoby746_Inf-Silent-Kunoichi-v0.1-2x7B.json similarity index 100% rename from data/Jacoby746_Inf-Silent-Kunoichi-v0.1-2x7B.json rename to data/models/Jacoby746_Inf-Silent-Kunoichi-v0.1-2x7B.json diff --git a/data/Jacoby746_Inf-Silent-Kunoichi-v0.2-2x7B.json b/data/models/Jacoby746_Inf-Silent-Kunoichi-v0.2-2x7B.json similarity index 100% rename from data/Jacoby746_Inf-Silent-Kunoichi-v0.2-2x7B.json rename to data/models/Jacoby746_Inf-Silent-Kunoichi-v0.2-2x7B.json diff --git a/data/Jacoby746_Proto-Athena-4x7B.json b/data/models/Jacoby746_Proto-Athena-4x7B.json similarity index 100% rename from data/Jacoby746_Proto-Athena-4x7B.json rename to data/models/Jacoby746_Proto-Athena-4x7B.json diff --git a/data/Jacoby746_Proto-Athena-v0.2-4x7B.json b/data/models/Jacoby746_Proto-Athena-v0.2-4x7B.json similarity index 100% rename from data/Jacoby746_Proto-Athena-v0.2-4x7B.json rename to data/models/Jacoby746_Proto-Athena-v0.2-4x7B.json diff --git a/data/Jacoby746_Proto-Harpy-Blazing-Light-v0.1-2x7B.json b/data/models/Jacoby746_Proto-Harpy-Blazing-Light-v0.1-2x7B.json similarity index 100% rename from data/Jacoby746_Proto-Harpy-Blazing-Light-v0.1-2x7B.json rename to data/models/Jacoby746_Proto-Harpy-Blazing-Light-v0.1-2x7B.json diff --git a/data/Jacoby746_Proto-Harpy-Spark-v0.1-7B.json b/data/models/Jacoby746_Proto-Harpy-Spark-v0.1-7B.json similarity index 100% rename from data/Jacoby746_Proto-Harpy-Spark-v0.1-7B.json rename to data/models/Jacoby746_Proto-Harpy-Spark-v0.1-7B.json diff --git a/data/JayHyeon_Qwen-0.5B-DPO-1epoch.json b/data/models/JayHyeon_Qwen-0.5B-DPO-1epoch.json similarity index 100% rename from data/JayHyeon_Qwen-0.5B-DPO-1epoch.json rename to data/models/JayHyeon_Qwen-0.5B-DPO-1epoch.json diff --git a/data/JayHyeon_Qwen-0.5B-DPO-5epoch.json b/data/models/JayHyeon_Qwen-0.5B-DPO-5epoch.json similarity index 100% rename from data/JayHyeon_Qwen-0.5B-DPO-5epoch.json rename to data/models/JayHyeon_Qwen-0.5B-DPO-5epoch.json diff --git a/data/JayHyeon_Qwen-0.5B-IRPO-1epoch.json b/data/models/JayHyeon_Qwen-0.5B-IRPO-1epoch.json similarity index 100% rename from data/JayHyeon_Qwen-0.5B-IRPO-1epoch.json rename to data/models/JayHyeon_Qwen-0.5B-IRPO-1epoch.json diff --git a/data/JayHyeon_Qwen-0.5B-IRPO-5epoch.json b/data/models/JayHyeon_Qwen-0.5B-IRPO-5epoch.json similarity index 100% rename from data/JayHyeon_Qwen-0.5B-IRPO-5epoch.json rename to data/models/JayHyeon_Qwen-0.5B-IRPO-5epoch.json diff --git a/data/JayHyeon_Qwen-0.5B-eDPO-1epoch.json b/data/models/JayHyeon_Qwen-0.5B-eDPO-1epoch.json similarity index 100% rename from data/JayHyeon_Qwen-0.5B-eDPO-1epoch.json rename to data/models/JayHyeon_Qwen-0.5B-eDPO-1epoch.json diff --git a/data/JayHyeon_Qwen-0.5B-eDPO-5epoch.json b/data/models/JayHyeon_Qwen-0.5B-eDPO-5epoch.json similarity index 100% rename from data/JayHyeon_Qwen-0.5B-eDPO-5epoch.json rename to data/models/JayHyeon_Qwen-0.5B-eDPO-5epoch.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1.json b/data/models/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1.json rename to data/models/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1.json b/data/models/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1.json rename to data/models/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1.json b/data/models/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1.json rename to data/models/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-Instruct-SFT.json b/data/models/JayHyeon_Qwen2.5-0.5B-Instruct-SFT.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-Instruct-SFT.json rename to data/models/JayHyeon_Qwen2.5-0.5B-Instruct-SFT.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-2ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-2ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-2ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-2ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-3ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-3ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-3ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-3ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-5ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-5ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-5ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-5ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-1e-4.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-4.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-1e-4.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-4.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-2ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-2ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-2ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-2ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-3ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-3ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-3ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-3ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-5ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-5ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-5ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-5ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-1e-5.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-5.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-1e-5.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-1e-5.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-2ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-2ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-2ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-2ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-3ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-3ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-3ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-3ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-5ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-5ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-5ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-5ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-4.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-4.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-4.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-4.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-3ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-3ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-3ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-3ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-2e-5.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-2e-5.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-2ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-2ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-2ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-2ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-3ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-3ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-3ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-3ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-5ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-5ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-5ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-5ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-5e-5.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-5e-5.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-5e-5.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-5e-5.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-2ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-2ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-2ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-2ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-3ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-3ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-3ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-3ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-5ep.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-5ep.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-5ep.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-5ep.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-7e-5.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-7e-5.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-7e-5.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-7e-5.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-DPO-1epoch_v1.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-DPO-1epoch_v1.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-DPO-1epoch_v1.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-DPO-1epoch_v1.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT-MDPO-1epoch_v1.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT-MDPO-1epoch_v1.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT-MDPO-1epoch_v1.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT-MDPO-1epoch_v1.json diff --git a/data/JayHyeon_Qwen2.5-0.5B-SFT.json b/data/models/JayHyeon_Qwen2.5-0.5B-SFT.json similarity index 100% rename from data/JayHyeon_Qwen2.5-0.5B-SFT.json rename to data/models/JayHyeon_Qwen2.5-0.5B-SFT.json diff --git a/data/JayHyeon_Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam.json b/data/models/JayHyeon_Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam.json b/data/models/JayHyeon_Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam.json b/data/models/JayHyeon_Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam.json b/data/models/JayHyeon_Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam.json b/data/models/JayHyeon_Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam.json b/data/models/JayHyeon_Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam.json b/data/models/JayHyeon_Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam.json b/data/models/JayHyeon_Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam.json b/data/models/JayHyeon_Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam.json b/data/models/JayHyeon_Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam.json b/data/models/JayHyeon_Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam.json rename to data/models/JayHyeon_Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPO_1e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-DPO_1e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPO_1e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-DPO_1e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPO_1e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-DPO_1e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPO_1e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-DPO_1e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPO_3e-6-1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-DPO_3e-6-1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPO_3e-6-1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-DPO_3e-6-1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPO_3e-6-2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-DPO_3e-6-2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPO_3e-6-2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-DPO_3e-6-2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPO_3e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-DPO_3e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPO_3e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-DPO_3e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPO_3e-7-1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-DPO_3e-7-1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPO_3e-7-1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-DPO_3e-7-1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPO_3e-7-2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-DPO_3e-7-2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPO_3e-7-2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-DPO_3e-7-2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPO_3e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-DPO_3e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPO_3e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-DPO_3e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPO_5e-7-1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-DPO_5e-7-1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPO_5e-7-1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-DPO_5e-7-1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPO_5e-7-2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-DPO_5e-7-2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPO_5e-7-2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-DPO_5e-7-2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-DPO_5e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-DPO_5e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-DPO_5e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-DPO_5e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-IPO_5e-7-1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-IPO_5e-7-1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-IPO_5e-7-1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-IPO_5e-7-1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-IPO_5e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-IPO_5e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-IPO_5e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-IPO_5e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-VDPO_3e-6-1ep_3vpo_const.json b/data/models/JayHyeon_Qwen_0.5-VDPO_3e-6-1ep_3vpo_const.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VDPO_3e-6-1ep_3vpo_const.json rename to data/models/JayHyeon_Qwen_0.5-VDPO_3e-6-1ep_3vpo_const.json diff --git a/data/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_10vpo_const.json b/data/models/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_10vpo_const.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_10vpo_const.json rename to data/models/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_10vpo_const.json diff --git a/data/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_1vpo_const.json b/data/models/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_1vpo_const.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_1vpo_const.json rename to data/models/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_1vpo_const.json diff --git a/data/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_3vpo_const.json b/data/models/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_3vpo_const.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_3vpo_const.json rename to data/models/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_3vpo_const.json diff --git a/data/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_1vpo_const.json b/data/models/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_1vpo_const.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_1vpo_const.json rename to data/models/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_1vpo_const.json diff --git a/data/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_3vpo_const.json b/data/models/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_3vpo_const.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_3vpo_const.json rename to data/models/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_3vpo_const.json diff --git a/data/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_10vpo_const.json b/data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_10vpo_const.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_10vpo_const.json rename to data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_10vpo_const.json diff --git a/data/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_1vpo_const.json b/data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_1vpo_const.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_1vpo_const.json rename to data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_1vpo_const.json diff --git a/data/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_30vpo_const.json b/data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_30vpo_const.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_30vpo_const.json rename to data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_30vpo_const.json diff --git a/data/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_3vpo_const.json b/data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_3vpo_const.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_3vpo_const.json rename to data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_3vpo_const.json diff --git a/data/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam.json b/data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam.json rename to data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam.json diff --git a/data/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_10vpo_const.json b/data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_10vpo_const.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_10vpo_const.json rename to data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_10vpo_const.json diff --git a/data/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_1vpo_const.json b/data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_1vpo_const.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_1vpo_const.json rename to data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_1vpo_const.json diff --git a/data/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_30vpo_const.json b/data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_30vpo_const.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_30vpo_const.json rename to data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_30vpo_const.json diff --git a/data/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_3vpo_const.json b/data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_3vpo_const.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_3vpo_const.json rename to data/models/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_3vpo_const.json diff --git a/data/JayHyeon_Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1.json b/data/models/JayHyeon_Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1.json rename to data/models/JayHyeon_Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1.json diff --git a/data/JayHyeon_Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3.json b/data/models/JayHyeon_Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3.json rename to data/models/JayHyeon_Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3.json diff --git a/data/JayHyeon_Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1.json b/data/models/JayHyeon_Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1.json rename to data/models/JayHyeon_Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1.json diff --git a/data/JayHyeon_Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1.json b/data/models/JayHyeon_Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1.json rename to data/models/JayHyeon_Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1.json diff --git a/data/JayHyeon_Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3.json b/data/models/JayHyeon_Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3.json similarity index 100% rename from data/JayHyeon_Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3.json rename to data/models/JayHyeon_Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3.json diff --git a/data/Jimmy19991222_Llama-3-Instruct-8B-SimPO-v0.2.json b/data/models/Jimmy19991222_Llama-3-Instruct-8B-SimPO-v0.2.json similarity index 100% rename from data/Jimmy19991222_Llama-3-Instruct-8B-SimPO-v0.2.json rename to data/models/Jimmy19991222_Llama-3-Instruct-8B-SimPO-v0.2.json diff --git a/data/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun.json b/data/models/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun.json similarity index 100% rename from data/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun.json rename to data/models/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun.json diff --git a/data/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log.json b/data/models/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log.json similarity index 100% rename from data/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log.json rename to data/models/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log.json diff --git a/data/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log.json b/data/models/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log.json similarity index 100% rename from data/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log.json rename to data/models/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log.json diff --git a/data/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4.json b/data/models/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4.json similarity index 100% rename from data/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4.json rename to data/models/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4.json diff --git a/data/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun.json b/data/models/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun.json similarity index 100% rename from data/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun.json rename to data/models/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun.json diff --git a/data/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log.json b/data/models/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log.json similarity index 100% rename from data/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log.json rename to data/models/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log.json diff --git a/data/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log.json b/data/models/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log.json similarity index 100% rename from data/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log.json rename to data/models/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log.json diff --git a/data/Joseph717171_Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32.json b/data/models/Joseph717171_Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32.json similarity index 100% rename from data/Joseph717171_Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32.json rename to data/models/Joseph717171_Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32.json diff --git a/data/Joseph717171_Llama-3.1-SuperNova-8B-Lite_TIES_with_Base.json b/data/models/Joseph717171_Llama-3.1-SuperNova-8B-Lite_TIES_with_Base.json similarity index 100% rename from data/Joseph717171_Llama-3.1-SuperNova-8B-Lite_TIES_with_Base.json rename to data/models/Joseph717171_Llama-3.1-SuperNova-8B-Lite_TIES_with_Base.json diff --git a/data/Josephgflowers_Cinder-Phi-2-V1-F16-gguf.json b/data/models/Josephgflowers_Cinder-Phi-2-V1-F16-gguf.json similarity index 100% rename from data/Josephgflowers_Cinder-Phi-2-V1-F16-gguf.json rename to data/models/Josephgflowers_Cinder-Phi-2-V1-F16-gguf.json diff --git a/data/Josephgflowers_Differential-Attention-Liquid-Metal-Tinyllama.json b/data/models/Josephgflowers_Differential-Attention-Liquid-Metal-Tinyllama.json similarity index 100% rename from data/Josephgflowers_Differential-Attention-Liquid-Metal-Tinyllama.json rename to data/models/Josephgflowers_Differential-Attention-Liquid-Metal-Tinyllama.json diff --git a/data/Josephgflowers_TinyLlama-Cinder-Agent-v1.json b/data/models/Josephgflowers_TinyLlama-Cinder-Agent-v1.json similarity index 100% rename from data/Josephgflowers_TinyLlama-Cinder-Agent-v1.json rename to data/models/Josephgflowers_TinyLlama-Cinder-Agent-v1.json diff --git a/data/Josephgflowers_TinyLlama-v1.1-Cinders-World.json b/data/models/Josephgflowers_TinyLlama-v1.1-Cinders-World.json similarity index 100% rename from data/Josephgflowers_TinyLlama-v1.1-Cinders-World.json rename to data/models/Josephgflowers_TinyLlama-v1.1-Cinders-World.json diff --git a/data/Josephgflowers_TinyLlama_v1.1_math_code-world-test-1.json b/data/models/Josephgflowers_TinyLlama_v1.1_math_code-world-test-1.json similarity index 100% rename from data/Josephgflowers_TinyLlama_v1.1_math_code-world-test-1.json rename to data/models/Josephgflowers_TinyLlama_v1.1_math_code-world-test-1.json diff --git a/data/Josephgflowers_Tinyllama-STEM-Cinder-Agent-v1.json b/data/models/Josephgflowers_Tinyllama-STEM-Cinder-Agent-v1.json similarity index 100% rename from data/Josephgflowers_Tinyllama-STEM-Cinder-Agent-v1.json rename to data/models/Josephgflowers_Tinyllama-STEM-Cinder-Agent-v1.json diff --git a/data/Josephgflowers_Tinyllama-r1.json b/data/models/Josephgflowers_Tinyllama-r1.json similarity index 100% rename from data/Josephgflowers_Tinyllama-r1.json rename to data/models/Josephgflowers_Tinyllama-r1.json diff --git a/data/JungZoona_T3Q-Qwen2.5-14B-Instruct-1M-e3.json b/data/models/JungZoona_T3Q-Qwen2.5-14B-Instruct-1M-e3.json similarity index 100% rename from data/JungZoona_T3Q-Qwen2.5-14B-Instruct-1M-e3.json rename to data/models/JungZoona_T3Q-Qwen2.5-14B-Instruct-1M-e3.json diff --git a/data/JungZoona_T3Q-qwen2.5-14b-v1.0-e3.json b/data/models/JungZoona_T3Q-qwen2.5-14b-v1.0-e3.json similarity index 100% rename from data/JungZoona_T3Q-qwen2.5-14b-v1.0-e3.json rename to data/models/JungZoona_T3Q-qwen2.5-14b-v1.0-e3.json diff --git a/data/Junhoee_Qwen-Megumin.json b/data/models/Junhoee_Qwen-Megumin.json similarity index 100% rename from data/Junhoee_Qwen-Megumin.json rename to data/models/Junhoee_Qwen-Megumin.json diff --git a/data/KSU-HW-SEC_Llama3-70b-SVA-FT-1415.json b/data/models/KSU-HW-SEC_Llama3-70b-SVA-FT-1415.json similarity index 100% rename from data/KSU-HW-SEC_Llama3-70b-SVA-FT-1415.json rename to data/models/KSU-HW-SEC_Llama3-70b-SVA-FT-1415.json diff --git a/data/KSU-HW-SEC_Llama3-70b-SVA-FT-500.json b/data/models/KSU-HW-SEC_Llama3-70b-SVA-FT-500.json similarity index 100% rename from data/KSU-HW-SEC_Llama3-70b-SVA-FT-500.json rename to data/models/KSU-HW-SEC_Llama3-70b-SVA-FT-500.json diff --git a/data/KSU-HW-SEC_Llama3-70b-SVA-FT-final.json b/data/models/KSU-HW-SEC_Llama3-70b-SVA-FT-final.json similarity index 100% rename from data/KSU-HW-SEC_Llama3-70b-SVA-FT-final.json rename to data/models/KSU-HW-SEC_Llama3-70b-SVA-FT-final.json diff --git a/data/KSU-HW-SEC_Llama3.1-70b-SVA-FT-1000step.json b/data/models/KSU-HW-SEC_Llama3.1-70b-SVA-FT-1000step.json similarity index 100% rename from data/KSU-HW-SEC_Llama3.1-70b-SVA-FT-1000step.json rename to data/models/KSU-HW-SEC_Llama3.1-70b-SVA-FT-1000step.json diff --git a/data/Khetterman_DarkAtom-12B-v3.json b/data/models/Khetterman_DarkAtom-12B-v3.json similarity index 100% rename from data/Khetterman_DarkAtom-12B-v3.json rename to data/models/Khetterman_DarkAtom-12B-v3.json diff --git a/data/Khetterman_Kosmos-8B-v1.json b/data/models/Khetterman_Kosmos-8B-v1.json similarity index 100% rename from data/Khetterman_Kosmos-8B-v1.json rename to data/models/Khetterman_Kosmos-8B-v1.json diff --git a/data/Kimargin_GPT-NEO-1.3B-wiki.json b/data/models/Kimargin_GPT-NEO-1.3B-wiki.json similarity index 100% rename from data/Kimargin_GPT-NEO-1.3B-wiki.json rename to data/models/Kimargin_GPT-NEO-1.3B-wiki.json diff --git a/data/KingNish_Qwen2.5-0.5b-Test-ft.json b/data/models/KingNish_Qwen2.5-0.5b-Test-ft.json similarity index 100% rename from data/KingNish_Qwen2.5-0.5b-Test-ft.json rename to data/models/KingNish_Qwen2.5-0.5b-Test-ft.json diff --git a/data/KingNish_Reasoning-0.5b.json b/data/models/KingNish_Reasoning-0.5b.json similarity index 100% rename from data/KingNish_Reasoning-0.5b.json rename to data/models/KingNish_Reasoning-0.5b.json diff --git a/data/KingNish_Reasoning-Llama-3b-v0.1.json b/data/models/KingNish_Reasoning-Llama-3b-v0.1.json similarity index 100% rename from data/KingNish_Reasoning-Llama-3b-v0.1.json rename to data/models/KingNish_Reasoning-Llama-3b-v0.1.json diff --git a/data/KingNish_qwen-1b-continued-v2.1.json b/data/models/KingNish_qwen-1b-continued-v2.1.json similarity index 100% rename from data/KingNish_qwen-1b-continued-v2.1.json rename to data/models/KingNish_qwen-1b-continued-v2.1.json diff --git a/data/KingNish_qwen-1b-continued-v2.2.json b/data/models/KingNish_qwen-1b-continued-v2.2.json similarity index 100% rename from data/KingNish_qwen-1b-continued-v2.2.json rename to data/models/KingNish_qwen-1b-continued-v2.2.json diff --git a/data/KingNish_qwen-1b-continued-v2.json b/data/models/KingNish_qwen-1b-continued-v2.json similarity index 100% rename from data/KingNish_qwen-1b-continued-v2.json rename to data/models/KingNish_qwen-1b-continued-v2.json diff --git a/data/KingNish_qwen-1b-continued.json b/data/models/KingNish_qwen-1b-continued.json similarity index 100% rename from data/KingNish_qwen-1b-continued.json rename to data/models/KingNish_qwen-1b-continued.json diff --git a/data/Kquant03_CognitiveFusion2-4x7B-BF16.json b/data/models/Kquant03_CognitiveFusion2-4x7B-BF16.json similarity index 100% rename from data/Kquant03_CognitiveFusion2-4x7B-BF16.json rename to data/models/Kquant03_CognitiveFusion2-4x7B-BF16.json diff --git a/data/Kquant03_L3-Pneuma-8B.json b/data/models/Kquant03_L3-Pneuma-8B.json similarity index 100% rename from data/Kquant03_L3-Pneuma-8B.json rename to data/models/Kquant03_L3-Pneuma-8B.json diff --git a/data/Krystalan_DRT-o1-14B.json b/data/models/Krystalan_DRT-o1-14B.json similarity index 100% rename from data/Krystalan_DRT-o1-14B.json rename to data/models/Krystalan_DRT-o1-14B.json diff --git a/data/Krystalan_DRT-o1-7B.json b/data/models/Krystalan_DRT-o1-7B.json similarity index 100% rename from data/Krystalan_DRT-o1-7B.json rename to data/models/Krystalan_DRT-o1-7B.json diff --git a/data/Kukedlc_NeuralExperiment-7b-MagicCoder-v7.5.json b/data/models/Kukedlc_NeuralExperiment-7b-MagicCoder-v7.5.json similarity index 100% rename from data/Kukedlc_NeuralExperiment-7b-MagicCoder-v7.5.json rename to data/models/Kukedlc_NeuralExperiment-7b-MagicCoder-v7.5.json diff --git a/data/Kukedlc_NeuralLLaMa-3-8b-DT-v0.1.json b/data/models/Kukedlc_NeuralLLaMa-3-8b-DT-v0.1.json similarity index 100% rename from data/Kukedlc_NeuralLLaMa-3-8b-DT-v0.1.json rename to data/models/Kukedlc_NeuralLLaMa-3-8b-DT-v0.1.json diff --git a/data/Kukedlc_NeuralLLaMa-3-8b-ORPO-v0.3.json b/data/models/Kukedlc_NeuralLLaMa-3-8b-ORPO-v0.3.json similarity index 100% rename from data/Kukedlc_NeuralLLaMa-3-8b-ORPO-v0.3.json rename to data/models/Kukedlc_NeuralLLaMa-3-8b-ORPO-v0.3.json diff --git a/data/Kukedlc_NeuralSynthesis-7B-v0.1.json b/data/models/Kukedlc_NeuralSynthesis-7B-v0.1.json similarity index 100% rename from data/Kukedlc_NeuralSynthesis-7B-v0.1.json rename to data/models/Kukedlc_NeuralSynthesis-7B-v0.1.json diff --git a/data/Kukedlc_NeuralSynthesis-7B-v0.3.json b/data/models/Kukedlc_NeuralSynthesis-7B-v0.3.json similarity index 100% rename from data/Kukedlc_NeuralSynthesis-7B-v0.3.json rename to data/models/Kukedlc_NeuralSynthesis-7B-v0.3.json diff --git a/data/Kukedlc_NeuralSynthesis-7b-v0.4-slerp.json b/data/models/Kukedlc_NeuralSynthesis-7b-v0.4-slerp.json similarity index 100% rename from data/Kukedlc_NeuralSynthesis-7b-v0.4-slerp.json rename to data/models/Kukedlc_NeuralSynthesis-7b-v0.4-slerp.json diff --git a/data/Kukedlc_Qwen-2.5-7b-Spanish-o1-CoT.json b/data/models/Kukedlc_Qwen-2.5-7b-Spanish-o1-CoT.json similarity index 100% rename from data/Kukedlc_Qwen-2.5-7b-Spanish-o1-CoT.json rename to data/models/Kukedlc_Qwen-2.5-7b-Spanish-o1-CoT.json diff --git a/data/Kumar955_Hemanth-llm.json b/data/models/Kumar955_Hemanth-llm.json similarity index 100% rename from data/Kumar955_Hemanth-llm.json rename to data/models/Kumar955_Hemanth-llm.json diff --git a/data/L-RAGE_3_PRYMMAL-ECE-7B-SLERP-V1.json b/data/models/L-RAGE_3_PRYMMAL-ECE-7B-SLERP-V1.json similarity index 100% rename from data/L-RAGE_3_PRYMMAL-ECE-7B-SLERP-V1.json rename to data/models/L-RAGE_3_PRYMMAL-ECE-7B-SLERP-V1.json diff --git a/data/LEESM_llama-2-7b-hf-lora-oki100p.json b/data/models/LEESM_llama-2-7b-hf-lora-oki100p.json similarity index 100% rename from data/LEESM_llama-2-7b-hf-lora-oki100p.json rename to data/models/LEESM_llama-2-7b-hf-lora-oki100p.json diff --git a/data/LEESM_llama-2-7b-hf-lora-oki10p.json b/data/models/LEESM_llama-2-7b-hf-lora-oki10p.json similarity index 100% rename from data/LEESM_llama-2-7b-hf-lora-oki10p.json rename to data/models/LEESM_llama-2-7b-hf-lora-oki10p.json diff --git a/data/LEESM_llama-3-8b-bnb-4b-kowiki231101.json b/data/models/LEESM_llama-3-8b-bnb-4b-kowiki231101.json similarity index 100% rename from data/LEESM_llama-3-8b-bnb-4b-kowiki231101.json rename to data/models/LEESM_llama-3-8b-bnb-4b-kowiki231101.json diff --git a/data/LEESM_llama-3-Korean-Bllossom-8B-trexlab-oki10p.json b/data/models/LEESM_llama-3-Korean-Bllossom-8B-trexlab-oki10p.json similarity index 100% rename from data/LEESM_llama-3-Korean-Bllossom-8B-trexlab-oki10p.json rename to data/models/LEESM_llama-3-Korean-Bllossom-8B-trexlab-oki10p.json diff --git a/data/LGAI-EXAONE_EXAONE-3.0-7.8B-Instruct.json b/data/models/LGAI-EXAONE_EXAONE-3.0-7.8B-Instruct.json similarity index 100% rename from data/LGAI-EXAONE_EXAONE-3.0-7.8B-Instruct.json rename to data/models/LGAI-EXAONE_EXAONE-3.0-7.8B-Instruct.json diff --git a/data/LGAI-EXAONE_EXAONE-3.5-2.4B-Instruct.json b/data/models/LGAI-EXAONE_EXAONE-3.5-2.4B-Instruct.json similarity index 100% rename from data/LGAI-EXAONE_EXAONE-3.5-2.4B-Instruct.json rename to data/models/LGAI-EXAONE_EXAONE-3.5-2.4B-Instruct.json diff --git a/data/LGAI-EXAONE_EXAONE-3.5-32B-Instruct.json b/data/models/LGAI-EXAONE_EXAONE-3.5-32B-Instruct.json similarity index 100% rename from data/LGAI-EXAONE_EXAONE-3.5-32B-Instruct.json rename to data/models/LGAI-EXAONE_EXAONE-3.5-32B-Instruct.json diff --git a/data/LGAI-EXAONE_EXAONE-3.5-7.8B-Instruct.json b/data/models/LGAI-EXAONE_EXAONE-3.5-7.8B-Instruct.json similarity index 100% rename from data/LGAI-EXAONE_EXAONE-3.5-7.8B-Instruct.json rename to data/models/LGAI-EXAONE_EXAONE-3.5-7.8B-Instruct.json diff --git a/data/LLM360_K2-Chat.json b/data/models/LLM360_K2-Chat.json similarity index 100% rename from data/LLM360_K2-Chat.json rename to data/models/LLM360_K2-Chat.json diff --git a/data/LLM360_K2.json b/data/models/LLM360_K2.json similarity index 100% rename from data/LLM360_K2.json rename to data/models/LLM360_K2.json diff --git a/data/LLM4Binary_llm4decompile-1.3b-v2.json b/data/models/LLM4Binary_llm4decompile-1.3b-v2.json similarity index 100% rename from data/LLM4Binary_llm4decompile-1.3b-v2.json rename to data/models/LLM4Binary_llm4decompile-1.3b-v2.json diff --git a/data/Lambent_qwen2.5-reinstruct-alternate-lumen-14B.json b/data/models/Lambent_qwen2.5-reinstruct-alternate-lumen-14B.json similarity index 100% rename from data/Lambent_qwen2.5-reinstruct-alternate-lumen-14B.json rename to data/models/Lambent_qwen2.5-reinstruct-alternate-lumen-14B.json diff --git a/data/Langboat_Mengzi3-8B-Chat.json b/data/models/Langboat_Mengzi3-8B-Chat.json similarity index 100% rename from data/Langboat_Mengzi3-8B-Chat.json rename to data/models/Langboat_Mengzi3-8B-Chat.json diff --git a/data/Lawnakk_BBA100.json b/data/models/Lawnakk_BBA100.json similarity index 100% rename from data/Lawnakk_BBA100.json rename to data/models/Lawnakk_BBA100.json diff --git a/data/Lawnakk_BBALAW1.0.json b/data/models/Lawnakk_BBALAW1.0.json similarity index 100% rename from data/Lawnakk_BBALAW1.0.json rename to data/models/Lawnakk_BBALAW1.0.json diff --git a/data/Lawnakk_BBALAW1.2.json b/data/models/Lawnakk_BBALAW1.2.json similarity index 100% rename from data/Lawnakk_BBALAW1.2.json rename to data/models/Lawnakk_BBALAW1.2.json diff --git a/data/Lawnakk_BBALAW1.3.json b/data/models/Lawnakk_BBALAW1.3.json similarity index 100% rename from data/Lawnakk_BBALAW1.3.json rename to data/models/Lawnakk_BBALAW1.3.json diff --git a/data/Lawnakk_BBALAW1.6.json b/data/models/Lawnakk_BBALAW1.6.json similarity index 100% rename from data/Lawnakk_BBALAW1.6.json rename to data/models/Lawnakk_BBALAW1.6.json diff --git a/data/Lawnakk_BBALAW1.61.json b/data/models/Lawnakk_BBALAW1.61.json similarity index 100% rename from data/Lawnakk_BBALAW1.61.json rename to data/models/Lawnakk_BBALAW1.61.json diff --git a/data/Lawnakk_BBALAW1.62.json b/data/models/Lawnakk_BBALAW1.62.json similarity index 100% rename from data/Lawnakk_BBALAW1.62.json rename to data/models/Lawnakk_BBALAW1.62.json diff --git a/data/Lawnakk_BBALAW1.63.json b/data/models/Lawnakk_BBALAW1.63.json similarity index 100% rename from data/Lawnakk_BBALAW1.63.json rename to data/models/Lawnakk_BBALAW1.63.json diff --git a/data/Lawnakk_BBALAW1.64.json b/data/models/Lawnakk_BBALAW1.64.json similarity index 100% rename from data/Lawnakk_BBALAW1.64.json rename to data/models/Lawnakk_BBALAW1.64.json diff --git a/data/Lawnakk_BBALAW1.json b/data/models/Lawnakk_BBALAW1.json similarity index 100% rename from data/Lawnakk_BBALAW1.json rename to data/models/Lawnakk_BBALAW1.json diff --git a/data/LenguajeNaturalAI_leniachat-gemma-2b-v0.json b/data/models/LenguajeNaturalAI_leniachat-gemma-2b-v0.json similarity index 100% rename from data/LenguajeNaturalAI_leniachat-gemma-2b-v0.json rename to data/models/LenguajeNaturalAI_leniachat-gemma-2b-v0.json diff --git a/data/LenguajeNaturalAI_leniachat-qwen2-1.5B-v0.json b/data/models/LenguajeNaturalAI_leniachat-qwen2-1.5B-v0.json similarity index 100% rename from data/LenguajeNaturalAI_leniachat-qwen2-1.5B-v0.json rename to data/models/LenguajeNaturalAI_leniachat-qwen2-1.5B-v0.json diff --git a/data/LeroyDyer_CheckPoint_A.json b/data/models/LeroyDyer_CheckPoint_A.json similarity index 100% rename from data/LeroyDyer_CheckPoint_A.json rename to data/models/LeroyDyer_CheckPoint_A.json diff --git a/data/LeroyDyer_CheckPoint_B.json b/data/models/LeroyDyer_CheckPoint_B.json similarity index 100% rename from data/LeroyDyer_CheckPoint_B.json rename to data/models/LeroyDyer_CheckPoint_B.json diff --git a/data/LeroyDyer_CheckPoint_C.json b/data/models/LeroyDyer_CheckPoint_C.json similarity index 100% rename from data/LeroyDyer_CheckPoint_C.json rename to data/models/LeroyDyer_CheckPoint_C.json diff --git a/data/LeroyDyer_CheckPoint_R1.json b/data/models/LeroyDyer_CheckPoint_R1.json similarity index 100% rename from data/LeroyDyer_CheckPoint_R1.json rename to data/models/LeroyDyer_CheckPoint_R1.json diff --git a/data/LeroyDyer_LCARS_AI_001.json b/data/models/LeroyDyer_LCARS_AI_001.json similarity index 100% rename from data/LeroyDyer_LCARS_AI_001.json rename to data/models/LeroyDyer_LCARS_AI_001.json diff --git a/data/LeroyDyer_LCARS_AI_1x4_003_SuperAI.json b/data/models/LeroyDyer_LCARS_AI_1x4_003_SuperAI.json similarity index 100% rename from data/LeroyDyer_LCARS_AI_1x4_003_SuperAI.json rename to data/models/LeroyDyer_LCARS_AI_1x4_003_SuperAI.json diff --git a/data/LeroyDyer_LCARS_AI_StarTrek_Computer.json b/data/models/LeroyDyer_LCARS_AI_StarTrek_Computer.json similarity index 100% rename from data/LeroyDyer_LCARS_AI_StarTrek_Computer.json rename to data/models/LeroyDyer_LCARS_AI_StarTrek_Computer.json diff --git a/data/LeroyDyer_LCARS_TOP_SCORE.json b/data/models/LeroyDyer_LCARS_TOP_SCORE.json similarity index 100% rename from data/LeroyDyer_LCARS_TOP_SCORE.json rename to data/models/LeroyDyer_LCARS_TOP_SCORE.json diff --git a/data/LeroyDyer_Mixtral_AI_SwahiliTron_7b.json b/data/models/LeroyDyer_Mixtral_AI_SwahiliTron_7b.json similarity index 100% rename from data/LeroyDyer_Mixtral_AI_SwahiliTron_7b.json rename to data/models/LeroyDyer_Mixtral_AI_SwahiliTron_7b.json diff --git a/data/LeroyDyer_SpydazWebAI_Human_AGI.json b/data/models/LeroyDyer_SpydazWebAI_Human_AGI.json similarity index 100% rename from data/LeroyDyer_SpydazWebAI_Human_AGI.json rename to data/models/LeroyDyer_SpydazWebAI_Human_AGI.json diff --git a/data/LeroyDyer_SpydazWebAI_Human_AGI_001.json b/data/models/LeroyDyer_SpydazWebAI_Human_AGI_001.json similarity index 100% rename from data/LeroyDyer_SpydazWebAI_Human_AGI_001.json rename to data/models/LeroyDyer_SpydazWebAI_Human_AGI_001.json diff --git a/data/LeroyDyer_SpydazWeb_AI_CyberTron_Ultra_7b.json b/data/models/LeroyDyer_SpydazWeb_AI_CyberTron_Ultra_7b.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_CyberTron_Ultra_7b.json rename to data/models/LeroyDyer_SpydazWeb_AI_CyberTron_Ultra_7b.json diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAGI_001_M2.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAGI_001_M2.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_HumanAGI_001_M2.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAGI_001_M2.json diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAGI_002.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAGI_002.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_HumanAGI_002.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAGI_002.json diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAI_001.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAI_001.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_HumanAI_001.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAI_001.json diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAI_006.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAI_006.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_HumanAI_006.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAI_006.json diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAI_007.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAI_007.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_HumanAI_007.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAI_007.json diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAI_009_CHAT.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAI_009_CHAT.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_HumanAI_009_CHAT.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAI_009_CHAT.json diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAI_010_CHAT.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAI_010_CHAT.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_HumanAI_010_CHAT.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAI_010_CHAT.json diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT.json diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT_ML.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT_ML.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT_ML.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT_ML.json diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1.json diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_IA.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_IA.json similarity index 99% rename from data/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_IA.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_IA.json index 50b0c5cefee474ed95982a89695dbeb43db80caf..e8101d23b8216216f871bb896a1fc98edf09778e 100644 --- a/data/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_IA.json +++ b/data/models/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_IA.json @@ -5,7 +5,7 @@ "developer": "LeroyDyer", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "MistralForCausalLM", "params_billions": "7.242" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3066 + "score": 0.3036 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4577 + "score": 0.4575 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2995 + "score": 0.3012 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4254 + "score": 0.4253 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2318 + "score": 0.2329 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3036 + "score": 0.3066 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4575 + "score": 0.4577 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3012 + "score": 0.2995 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4253 + "score": 0.4254 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2329 + "score": 0.2318 } } ], diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_MX.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_MX.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_MX.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_MX.json diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_XA.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_XA.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_XA.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_XA.json diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAI_RP.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAI_RP.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_HumanAI_RP.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAI_RP.json diff --git a/data/LeroyDyer_SpydazWeb_AI_HumanAI_TextVision.json b/data/models/LeroyDyer_SpydazWeb_AI_HumanAI_TextVision.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_AI_HumanAI_TextVision.json rename to data/models/LeroyDyer_SpydazWeb_AI_HumanAI_TextVision.json diff --git a/data/LeroyDyer_SpydazWeb_HumanAI_M1.json b/data/models/LeroyDyer_SpydazWeb_HumanAI_M1.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_HumanAI_M1.json rename to data/models/LeroyDyer_SpydazWeb_HumanAI_M1.json diff --git a/data/LeroyDyer_SpydazWeb_HumanAI_M2.json b/data/models/LeroyDyer_SpydazWeb_HumanAI_M2.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_HumanAI_M2.json rename to data/models/LeroyDyer_SpydazWeb_HumanAI_M2.json diff --git a/data/LeroyDyer_SpydazWeb_HumanAI_M3.json b/data/models/LeroyDyer_SpydazWeb_HumanAI_M3.json similarity index 100% rename from data/LeroyDyer_SpydazWeb_HumanAI_M3.json rename to data/models/LeroyDyer_SpydazWeb_HumanAI_M3.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_12.json b/data/models/LeroyDyer__Spydaz_Web_AI_12.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_12.json rename to data/models/LeroyDyer__Spydaz_Web_AI_12.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_14.json b/data/models/LeroyDyer__Spydaz_Web_AI_14.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_14.json rename to data/models/LeroyDyer__Spydaz_Web_AI_14.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_001.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_001.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_001.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_001.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_002.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_002.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_002.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_002.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_MUSR.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_MUSR.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_MUSR.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_MUSR.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_MasterCoder.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_MasterCoder.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_MasterCoder.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_MasterCoder.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_001.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_001.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_001.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_001.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_003.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_003.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_003.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_003.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_Student.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_Student.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_Student.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_Student.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_Teacher.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_Teacher.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_Teacher.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_Teacher.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_001.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_001.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_001.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_001.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_002.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_002.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_002.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_002.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_Coder.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_Coder.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_Coder.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_Coder.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_Math.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_Math.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_Math.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_Math.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_MathMaster.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_MathMaster.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_MathMaster.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_MathMaster.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Student_Coder.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Student_Coder.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Student_Coder.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Student_Coder.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Teacher_Coder.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Teacher_Coder.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Teacher_Coder.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Teacher_Coder.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Top_Student.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Top_Student.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_Top_Student.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_Top_Student.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_X1.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_X1.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_X1.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_X1.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_R1_X2.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_X2.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_R1_X2.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_R1_X2.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_AGI_RP_R1.json b/data/models/LeroyDyer__Spydaz_Web_AI_AGI_RP_R1.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_AGI_RP_R1.json rename to data/models/LeroyDyer__Spydaz_Web_AI_AGI_RP_R1.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_BIBLE_002.json b/data/models/LeroyDyer__Spydaz_Web_AI_BIBLE_002.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_BIBLE_002.json rename to data/models/LeroyDyer__Spydaz_Web_AI_BIBLE_002.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_ChatML_002.json b/data/models/LeroyDyer__Spydaz_Web_AI_ChatML_002.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_ChatML_002.json rename to data/models/LeroyDyer__Spydaz_Web_AI_ChatML_002.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_ChatQA.json b/data/models/LeroyDyer__Spydaz_Web_AI_ChatQA.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_ChatQA.json rename to data/models/LeroyDyer__Spydaz_Web_AI_ChatQA.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_ChatQA_003.json b/data/models/LeroyDyer__Spydaz_Web_AI_ChatQA_003.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_ChatQA_003.json rename to data/models/LeroyDyer__Spydaz_Web_AI_ChatQA_003.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_TEMP.json b/data/models/LeroyDyer__Spydaz_Web_AI_TEMP.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_TEMP.json rename to data/models/LeroyDyer__Spydaz_Web_AI_TEMP.json diff --git a/data/LeroyDyer__Spydaz_Web_AI_Top_Teacher.json b/data/models/LeroyDyer__Spydaz_Web_AI_Top_Teacher.json similarity index 100% rename from data/LeroyDyer__Spydaz_Web_AI_Top_Teacher.json rename to data/models/LeroyDyer__Spydaz_Web_AI_Top_Teacher.json diff --git a/data/LightningRodLabs_Flashlight-v1.0.json b/data/models/LightningRodLabs_Flashlight-v1.0.json similarity index 100% rename from data/LightningRodLabs_Flashlight-v1.0.json rename to data/models/LightningRodLabs_Flashlight-v1.0.json diff --git a/data/LightningRodLabs_Flashlight-v1.1.json b/data/models/LightningRodLabs_Flashlight-v1.1.json similarity index 100% rename from data/LightningRodLabs_Flashlight-v1.1.json rename to data/models/LightningRodLabs_Flashlight-v1.1.json diff --git a/data/LightningRodLabs_Flashlight-v1.2.json b/data/models/LightningRodLabs_Flashlight-v1.2.json similarity index 100% rename from data/LightningRodLabs_Flashlight-v1.2.json rename to data/models/LightningRodLabs_Flashlight-v1.2.json diff --git a/data/Lil-R_2_PRYMMAL-ECE-2B-SLERP-V1.json b/data/models/Lil-R_2_PRYMMAL-ECE-2B-SLERP-V1.json similarity index 100% rename from data/Lil-R_2_PRYMMAL-ECE-2B-SLERP-V1.json rename to data/models/Lil-R_2_PRYMMAL-ECE-2B-SLERP-V1.json diff --git a/data/Lil-R_2_PRYMMAL-ECE-2B-SLERP-V2.json b/data/models/Lil-R_2_PRYMMAL-ECE-2B-SLERP-V2.json similarity index 100% rename from data/Lil-R_2_PRYMMAL-ECE-2B-SLERP-V2.json rename to data/models/Lil-R_2_PRYMMAL-ECE-2B-SLERP-V2.json diff --git a/data/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V1.json b/data/models/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V1.json similarity index 100% rename from data/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V1.json rename to data/models/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V1.json diff --git a/data/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V2.json b/data/models/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V2.json similarity index 100% rename from data/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V2.json rename to data/models/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V2.json diff --git a/data/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V3.json b/data/models/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V3.json similarity index 100% rename from data/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V3.json rename to data/models/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V3.json diff --git a/data/Lil-R_2_PRYMMAL-ECE-7B-SLERP.json b/data/models/Lil-R_2_PRYMMAL-ECE-7B-SLERP.json similarity index 100% rename from data/Lil-R_2_PRYMMAL-ECE-7B-SLERP.json rename to data/models/Lil-R_2_PRYMMAL-ECE-7B-SLERP.json diff --git a/data/Lil-R_PRYMMAL-ECE-1B-SLERP-V1.json b/data/models/Lil-R_PRYMMAL-ECE-1B-SLERP-V1.json similarity index 100% rename from data/Lil-R_PRYMMAL-ECE-1B-SLERP-V1.json rename to data/models/Lil-R_PRYMMAL-ECE-1B-SLERP-V1.json diff --git a/data/Lil-R_PRYMMAL-ECE-7B-SLERP-V8.json b/data/models/Lil-R_PRYMMAL-ECE-7B-SLERP-V8.json similarity index 100% rename from data/Lil-R_PRYMMAL-ECE-7B-SLERP-V8.json rename to data/models/Lil-R_PRYMMAL-ECE-7B-SLERP-V8.json diff --git a/data/LilRg_10PRYMMAL-3B-slerp.json b/data/models/LilRg_10PRYMMAL-3B-slerp.json similarity index 100% rename from data/LilRg_10PRYMMAL-3B-slerp.json rename to data/models/LilRg_10PRYMMAL-3B-slerp.json diff --git a/data/LilRg_ECE-1B-merge-PRYMMAL.json b/data/models/LilRg_ECE-1B-merge-PRYMMAL.json similarity index 100% rename from data/LilRg_ECE-1B-merge-PRYMMAL.json rename to data/models/LilRg_ECE-1B-merge-PRYMMAL.json diff --git a/data/LilRg_ECE_Finetunning.json b/data/models/LilRg_ECE_Finetunning.json similarity index 100% rename from data/LilRg_ECE_Finetunning.json rename to data/models/LilRg_ECE_Finetunning.json diff --git a/data/LilRg_PRYMMAL-6B-slerp.json b/data/models/LilRg_PRYMMAL-6B-slerp.json similarity index 100% rename from data/LilRg_PRYMMAL-6B-slerp.json rename to data/models/LilRg_PRYMMAL-6B-slerp.json diff --git a/data/LilRg_PRYMMAL-ECE-7B-SLERP-V3.json b/data/models/LilRg_PRYMMAL-ECE-7B-SLERP-V3.json similarity index 100% rename from data/LilRg_PRYMMAL-ECE-7B-SLERP-V3.json rename to data/models/LilRg_PRYMMAL-ECE-7B-SLERP-V3.json diff --git a/data/LilRg_PRYMMAL-ECE-7B-SLERP-V4.json b/data/models/LilRg_PRYMMAL-ECE-7B-SLERP-V4.json similarity index 100% rename from data/LilRg_PRYMMAL-ECE-7B-SLERP-V4.json rename to data/models/LilRg_PRYMMAL-ECE-7B-SLERP-V4.json diff --git a/data/LilRg_PRYMMAL-ECE-7B-SLERP-V5.json b/data/models/LilRg_PRYMMAL-ECE-7B-SLERP-V5.json similarity index 100% rename from data/LilRg_PRYMMAL-ECE-7B-SLERP-V5.json rename to data/models/LilRg_PRYMMAL-ECE-7B-SLERP-V5.json diff --git a/data/LilRg_PRYMMAL-ECE-7B-SLERP-V6.json b/data/models/LilRg_PRYMMAL-ECE-7B-SLERP-V6.json similarity index 100% rename from data/LilRg_PRYMMAL-ECE-7B-SLERP-V6.json rename to data/models/LilRg_PRYMMAL-ECE-7B-SLERP-V6.json diff --git a/data/LilRg_PRYMMAL-ECE-7B-SLERP-V7.json b/data/models/LilRg_PRYMMAL-ECE-7B-SLERP-V7.json similarity index 100% rename from data/LilRg_PRYMMAL-ECE-7B-SLERP-V7.json rename to data/models/LilRg_PRYMMAL-ECE-7B-SLERP-V7.json diff --git a/data/LilRg_PRYMMAL-slerp-Merge.json b/data/models/LilRg_PRYMMAL-slerp-Merge.json similarity index 100% rename from data/LilRg_PRYMMAL-slerp-Merge.json rename to data/models/LilRg_PRYMMAL-slerp-Merge.json diff --git a/data/LimYeri_CodeMind-Llama3-8B-unsloth_v2-merged.json b/data/models/LimYeri_CodeMind-Llama3-8B-unsloth_v2-merged.json similarity index 100% rename from data/LimYeri_CodeMind-Llama3-8B-unsloth_v2-merged.json rename to data/models/LimYeri_CodeMind-Llama3-8B-unsloth_v2-merged.json diff --git a/data/LimYeri_CodeMind-Llama3-8B-unsloth_v3-merged.json b/data/models/LimYeri_CodeMind-Llama3-8B-unsloth_v3-merged.json similarity index 100% rename from data/LimYeri_CodeMind-Llama3-8B-unsloth_v3-merged.json rename to data/models/LimYeri_CodeMind-Llama3-8B-unsloth_v3-merged.json diff --git a/data/LimYeri_CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged.json b/data/models/LimYeri_CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged.json similarity index 100% rename from data/LimYeri_CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged.json rename to data/models/LimYeri_CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged.json diff --git a/data/LimYeri_CodeMind-Llama3-8B-unsloth_v4-one-merged.json b/data/models/LimYeri_CodeMind-Llama3-8B-unsloth_v4-one-merged.json similarity index 100% rename from data/LimYeri_CodeMind-Llama3-8B-unsloth_v4-one-merged.json rename to data/models/LimYeri_CodeMind-Llama3-8B-unsloth_v4-one-merged.json diff --git a/data/LimYeri_CodeMind-Llama3.1-8B-unsloth-merged.json b/data/models/LimYeri_CodeMind-Llama3.1-8B-unsloth-merged.json similarity index 100% rename from data/LimYeri_CodeMind-Llama3.1-8B-unsloth-merged.json rename to data/models/LimYeri_CodeMind-Llama3.1-8B-unsloth-merged.json diff --git a/data/Locutusque_CollectiveLM-Falcon-3-7B.json b/data/models/Locutusque_CollectiveLM-Falcon-3-7B.json similarity index 100% rename from data/Locutusque_CollectiveLM-Falcon-3-7B.json rename to data/models/Locutusque_CollectiveLM-Falcon-3-7B.json diff --git a/data/Locutusque_Hercules-6.0-Llama-3.1-8B.json b/data/models/Locutusque_Hercules-6.0-Llama-3.1-8B.json similarity index 100% rename from data/Locutusque_Hercules-6.0-Llama-3.1-8B.json rename to data/models/Locutusque_Hercules-6.0-Llama-3.1-8B.json diff --git a/data/Locutusque_Hercules-6.1-Llama-3.1-8B.json b/data/models/Locutusque_Hercules-6.1-Llama-3.1-8B.json similarity index 100% rename from data/Locutusque_Hercules-6.1-Llama-3.1-8B.json rename to data/models/Locutusque_Hercules-6.1-Llama-3.1-8B.json diff --git a/data/Locutusque_Llama-3-NeuralHercules-5.0-8B.json b/data/models/Locutusque_Llama-3-NeuralHercules-5.0-8B.json similarity index 100% rename from data/Locutusque_Llama-3-NeuralHercules-5.0-8B.json rename to data/models/Locutusque_Llama-3-NeuralHercules-5.0-8B.json diff --git a/data/Locutusque_Llama-3-Yggdrasil-2.0-8B.json b/data/models/Locutusque_Llama-3-Yggdrasil-2.0-8B.json similarity index 100% rename from data/Locutusque_Llama-3-Yggdrasil-2.0-8B.json rename to data/models/Locutusque_Llama-3-Yggdrasil-2.0-8B.json diff --git a/data/Locutusque_TinyMistral-248M-v2.5.json b/data/models/Locutusque_TinyMistral-248M-v2.5.json similarity index 100% rename from data/Locutusque_TinyMistral-248M-v2.5.json rename to data/models/Locutusque_TinyMistral-248M-v2.5.json diff --git a/data/Luni_StarDust-12b-v1.json b/data/models/Luni_StarDust-12b-v1.json similarity index 100% rename from data/Luni_StarDust-12b-v1.json rename to data/models/Luni_StarDust-12b-v1.json diff --git a/data/Luni_StarDust-12b-v2.json b/data/models/Luni_StarDust-12b-v2.json similarity index 100% rename from data/Luni_StarDust-12b-v2.json rename to data/models/Luni_StarDust-12b-v2.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v3.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v3.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v3.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v3.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v4.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v4.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v4.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v4.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v5.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v5.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v5.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v5.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v6.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v6.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v6.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v6.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v7.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v7.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v7.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v7.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.5.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.5.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.5.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.5.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.6.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.6.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.6.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.6.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.7.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.7.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.7.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.7.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.8.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.8.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.8.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.8.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.9.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.9.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.9.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.9.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9-stock.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9-stock.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9-stock.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9-stock.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.1.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.1.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.1.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.1.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.2.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.2.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.2.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.2.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.json diff --git a/data/Lunzima_NQLSG-Qwen2.5-14B-OriginalFusion.json b/data/models/Lunzima_NQLSG-Qwen2.5-14B-OriginalFusion.json similarity index 100% rename from data/Lunzima_NQLSG-Qwen2.5-14B-OriginalFusion.json rename to data/models/Lunzima_NQLSG-Qwen2.5-14B-OriginalFusion.json diff --git a/data/LxzGordon_URM-LLaMa-3-8B.json b/data/models/LxzGordon_URM-LLaMa-3-8B.json similarity index 100% rename from data/LxzGordon_URM-LLaMa-3-8B.json rename to data/models/LxzGordon_URM-LLaMa-3-8B.json diff --git a/data/LxzGordon_URM-LLaMa-3.1-8B.json b/data/models/LxzGordon_URM-LLaMa-3.1-8B.json similarity index 100% rename from data/LxzGordon_URM-LLaMa-3.1-8B.json rename to data/models/LxzGordon_URM-LLaMa-3.1-8B.json index 2ce56c90ce0d5fb5788660f5b3f1f1e179701786..7f03035f2bd809fb14be27130e313f5b86a26f9a 100644 --- a/data/LxzGordon_URM-LLaMa-3.1-8B.json +++ b/data/models/LxzGordon_URM-LLaMa-3.1-8B.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816", + "evaluation_id": "reward-bench-2/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,128 +31,104 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9294 + "score": 0.7394 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9553 + "score": 0.6884 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8816 + "score": 0.45 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9108 + "score": 0.6393 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9698 + "score": 0.9178 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench-2/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7394 + "score": 0.9758 }, "source_data": { "dataset_name": "RewardBench 2", @@ -161,111 +137,135 @@ } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6884 + "score": 0.7653 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Precise IF", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.45 + "score": 0.9294 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6393 + "score": 0.9553 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9178 + "score": 0.8816 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9758 + "score": 0.9108 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7653 + "score": 0.9698 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/Lyte_Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3.json b/data/models/Lyte_Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3.json similarity index 100% rename from data/Lyte_Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3.json rename to data/models/Lyte_Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3.json diff --git a/data/Lyte_Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04.json b/data/models/Lyte_Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04.json similarity index 100% rename from data/Lyte_Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04.json rename to data/models/Lyte_Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04.json diff --git a/data/Lyte_Llama-3.2-3B-Overthinker.json b/data/models/Lyte_Llama-3.2-3B-Overthinker.json similarity index 100% rename from data/Lyte_Llama-3.2-3B-Overthinker.json rename to data/models/Lyte_Llama-3.2-3B-Overthinker.json diff --git a/data/M4-ai_TinyMistral-248M-v3.json b/data/models/M4-ai_TinyMistral-248M-v3.json similarity index 100% rename from data/M4-ai_TinyMistral-248M-v3.json rename to data/models/M4-ai_TinyMistral-248M-v3.json diff --git a/data/MEscriva_ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis.json b/data/models/MEscriva_ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis.json similarity index 100% rename from data/MEscriva_ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis.json rename to data/models/MEscriva_ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis.json diff --git a/data/MLP-KTLim_llama-3-Korean-Bllossom-8B.json b/data/models/MLP-KTLim_llama-3-Korean-Bllossom-8B.json similarity index 100% rename from data/MLP-KTLim_llama-3-Korean-Bllossom-8B.json rename to data/models/MLP-KTLim_llama-3-Korean-Bllossom-8B.json diff --git a/data/MTSAIR_Cotype-Nano.json b/data/models/MTSAIR_Cotype-Nano.json similarity index 100% rename from data/MTSAIR_Cotype-Nano.json rename to data/models/MTSAIR_Cotype-Nano.json diff --git a/data/MTSAIR_MultiVerse_70B.json b/data/models/MTSAIR_MultiVerse_70B.json similarity index 100% rename from data/MTSAIR_MultiVerse_70B.json rename to data/models/MTSAIR_MultiVerse_70B.json diff --git a/data/Magpie-Align_Llama-3-8B-Magpie-Align-SFT-v0.1.json b/data/models/Magpie-Align_Llama-3-8B-Magpie-Align-SFT-v0.1.json similarity index 100% rename from data/Magpie-Align_Llama-3-8B-Magpie-Align-SFT-v0.1.json rename to data/models/Magpie-Align_Llama-3-8B-Magpie-Align-SFT-v0.1.json diff --git a/data/Magpie-Align_Llama-3-8B-Magpie-Align-SFT-v0.3.json b/data/models/Magpie-Align_Llama-3-8B-Magpie-Align-SFT-v0.3.json similarity index 100% rename from data/Magpie-Align_Llama-3-8B-Magpie-Align-SFT-v0.3.json rename to data/models/Magpie-Align_Llama-3-8B-Magpie-Align-SFT-v0.3.json diff --git a/data/Magpie-Align_Llama-3-8B-Magpie-Align-v0.1.json b/data/models/Magpie-Align_Llama-3-8B-Magpie-Align-v0.1.json similarity index 100% rename from data/Magpie-Align_Llama-3-8B-Magpie-Align-v0.1.json rename to data/models/Magpie-Align_Llama-3-8B-Magpie-Align-v0.1.json diff --git a/data/Magpie-Align_Llama-3-8B-Magpie-Align-v0.3.json b/data/models/Magpie-Align_Llama-3-8B-Magpie-Align-v0.3.json similarity index 100% rename from data/Magpie-Align_Llama-3-8B-Magpie-Align-v0.3.json rename to data/models/Magpie-Align_Llama-3-8B-Magpie-Align-v0.3.json diff --git a/data/Magpie-Align_Llama-3.1-8B-Magpie-Align-SFT-v0.1.json b/data/models/Magpie-Align_Llama-3.1-8B-Magpie-Align-SFT-v0.1.json similarity index 100% rename from data/Magpie-Align_Llama-3.1-8B-Magpie-Align-SFT-v0.1.json rename to data/models/Magpie-Align_Llama-3.1-8B-Magpie-Align-SFT-v0.1.json diff --git a/data/Magpie-Align_Llama-3.1-8B-Magpie-Align-v0.1.json b/data/models/Magpie-Align_Llama-3.1-8B-Magpie-Align-v0.1.json similarity index 100% rename from data/Magpie-Align_Llama-3.1-8B-Magpie-Align-v0.1.json rename to data/models/Magpie-Align_Llama-3.1-8B-Magpie-Align-v0.1.json diff --git a/data/Magpie-Align_MagpieLM-8B-Chat-v0.1.json b/data/models/Magpie-Align_MagpieLM-8B-Chat-v0.1.json similarity index 100% rename from data/Magpie-Align_MagpieLM-8B-Chat-v0.1.json rename to data/models/Magpie-Align_MagpieLM-8B-Chat-v0.1.json diff --git a/data/Magpie-Align_MagpieLM-8B-SFT-v0.1.json b/data/models/Magpie-Align_MagpieLM-8B-SFT-v0.1.json similarity index 100% rename from data/Magpie-Align_MagpieLM-8B-SFT-v0.1.json rename to data/models/Magpie-Align_MagpieLM-8B-SFT-v0.1.json diff --git a/data/MagusCorp_grpo_lora_enem_llama3_7b.json b/data/models/MagusCorp_grpo_lora_enem_llama3_7b.json similarity index 100% rename from data/MagusCorp_grpo_lora_enem_llama3_7b.json rename to data/models/MagusCorp_grpo_lora_enem_llama3_7b.json diff --git a/data/ManoloPueblo_ContentCuisine_1-7B-slerp.json b/data/models/ManoloPueblo_ContentCuisine_1-7B-slerp.json similarity index 100% rename from data/ManoloPueblo_ContentCuisine_1-7B-slerp.json rename to data/models/ManoloPueblo_ContentCuisine_1-7B-slerp.json diff --git a/data/ManoloPueblo_LLM_MERGE_CC2.json b/data/models/ManoloPueblo_LLM_MERGE_CC2.json similarity index 100% rename from data/ManoloPueblo_LLM_MERGE_CC2.json rename to data/models/ManoloPueblo_LLM_MERGE_CC2.json diff --git a/data/ManoloPueblo_LLM_MERGE_CC3.json b/data/models/ManoloPueblo_LLM_MERGE_CC3.json similarity index 100% rename from data/ManoloPueblo_LLM_MERGE_CC3.json rename to data/models/ManoloPueblo_LLM_MERGE_CC3.json diff --git a/data/MarinaraSpaghetti_NemoReRemix-12B.json b/data/models/MarinaraSpaghetti_NemoReRemix-12B.json similarity index 100% rename from data/MarinaraSpaghetti_NemoReRemix-12B.json rename to data/models/MarinaraSpaghetti_NemoReRemix-12B.json diff --git a/data/MarinaraSpaghetti_Nemomix-v4.0-12B.json b/data/models/MarinaraSpaghetti_Nemomix-v4.0-12B.json similarity index 100% rename from data/MarinaraSpaghetti_Nemomix-v4.0-12B.json rename to data/models/MarinaraSpaghetti_Nemomix-v4.0-12B.json diff --git a/data/Marsouuu_MiniMathExpert-2_61B-ECE-PRYMMAL-Martial.json b/data/models/Marsouuu_MiniMathExpert-2_61B-ECE-PRYMMAL-Martial.json similarity index 100% rename from data/Marsouuu_MiniMathExpert-2_61B-ECE-PRYMMAL-Martial.json rename to data/models/Marsouuu_MiniMathExpert-2_61B-ECE-PRYMMAL-Martial.json diff --git a/data/Marsouuu_MiniQwenMathExpert-ECE-PRYMMAL-Martial.json b/data/models/Marsouuu_MiniQwenMathExpert-ECE-PRYMMAL-Martial.json similarity index 100% rename from data/Marsouuu_MiniQwenMathExpert-ECE-PRYMMAL-Martial.json rename to data/models/Marsouuu_MiniQwenMathExpert-ECE-PRYMMAL-Martial.json diff --git a/data/Marsouuu_MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial.json b/data/models/Marsouuu_MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial.json similarity index 100% rename from data/Marsouuu_MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial.json rename to data/models/Marsouuu_MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial.json diff --git a/data/Marsouuu_general3B-ECE-PRYMMAL-Martial.json b/data/models/Marsouuu_general3B-ECE-PRYMMAL-Martial.json similarity index 100% rename from data/Marsouuu_general3B-ECE-PRYMMAL-Martial.json rename to data/models/Marsouuu_general3B-ECE-PRYMMAL-Martial.json diff --git a/data/Marsouuu_general3Bv2-ECE-PRYMMAL-Martial.json b/data/models/Marsouuu_general3Bv2-ECE-PRYMMAL-Martial.json similarity index 100% rename from data/Marsouuu_general3Bv2-ECE-PRYMMAL-Martial.json rename to data/models/Marsouuu_general3Bv2-ECE-PRYMMAL-Martial.json diff --git a/data/Marsouuu_lareneg1_78B-ECE-PRYMMAL-Martial.json b/data/models/Marsouuu_lareneg1_78B-ECE-PRYMMAL-Martial.json similarity index 100% rename from data/Marsouuu_lareneg1_78B-ECE-PRYMMAL-Martial.json rename to data/models/Marsouuu_lareneg1_78B-ECE-PRYMMAL-Martial.json diff --git a/data/Marsouuu_lareneg3B-ECE-PRYMMAL-Martial.json b/data/models/Marsouuu_lareneg3B-ECE-PRYMMAL-Martial.json similarity index 100% rename from data/Marsouuu_lareneg3B-ECE-PRYMMAL-Martial.json rename to data/models/Marsouuu_lareneg3B-ECE-PRYMMAL-Martial.json diff --git a/data/Marsouuu_lareneg3Bv2-ECE-PRYMMAL-Martial.json b/data/models/Marsouuu_lareneg3Bv2-ECE-PRYMMAL-Martial.json similarity index 100% rename from data/Marsouuu_lareneg3Bv2-ECE-PRYMMAL-Martial.json rename to data/models/Marsouuu_lareneg3Bv2-ECE-PRYMMAL-Martial.json diff --git a/data/MaziyarPanahi_Calme-4x7B-MoE-v0.1.json b/data/models/MaziyarPanahi_Calme-4x7B-MoE-v0.1.json similarity index 100% rename from data/MaziyarPanahi_Calme-4x7B-MoE-v0.1.json rename to data/models/MaziyarPanahi_Calme-4x7B-MoE-v0.1.json diff --git a/data/MaziyarPanahi_Calme-4x7B-MoE-v0.2.json b/data/models/MaziyarPanahi_Calme-4x7B-MoE-v0.2.json similarity index 100% rename from data/MaziyarPanahi_Calme-4x7B-MoE-v0.2.json rename to data/models/MaziyarPanahi_Calme-4x7B-MoE-v0.2.json diff --git a/data/MaziyarPanahi_Llama-3-70B-Instruct-v0.1.json b/data/models/MaziyarPanahi_Llama-3-70B-Instruct-v0.1.json similarity index 100% rename from data/MaziyarPanahi_Llama-3-70B-Instruct-v0.1.json rename to data/models/MaziyarPanahi_Llama-3-70B-Instruct-v0.1.json diff --git a/data/MaziyarPanahi_Llama-3-8B-Instruct-v0.10.json b/data/models/MaziyarPanahi_Llama-3-8B-Instruct-v0.10.json similarity index 100% rename from data/MaziyarPanahi_Llama-3-8B-Instruct-v0.10.json rename to data/models/MaziyarPanahi_Llama-3-8B-Instruct-v0.10.json diff --git a/data/MaziyarPanahi_Llama-3-8B-Instruct-v0.8.json b/data/models/MaziyarPanahi_Llama-3-8B-Instruct-v0.8.json similarity index 100% rename from data/MaziyarPanahi_Llama-3-8B-Instruct-v0.8.json rename to data/models/MaziyarPanahi_Llama-3-8B-Instruct-v0.8.json diff --git a/data/MaziyarPanahi_Llama-3-8B-Instruct-v0.9.json b/data/models/MaziyarPanahi_Llama-3-8B-Instruct-v0.9.json similarity index 100% rename from data/MaziyarPanahi_Llama-3-8B-Instruct-v0.9.json rename to data/models/MaziyarPanahi_Llama-3-8B-Instruct-v0.9.json diff --git a/data/MaziyarPanahi_Qwen1.5-MoE-A2.7B-Wikihow.json b/data/models/MaziyarPanahi_Qwen1.5-MoE-A2.7B-Wikihow.json similarity index 100% rename from data/MaziyarPanahi_Qwen1.5-MoE-A2.7B-Wikihow.json rename to data/models/MaziyarPanahi_Qwen1.5-MoE-A2.7B-Wikihow.json diff --git a/data/MaziyarPanahi_Qwen2-7B-Instruct-v0.1.json b/data/models/MaziyarPanahi_Qwen2-7B-Instruct-v0.1.json similarity index 100% rename from data/MaziyarPanahi_Qwen2-7B-Instruct-v0.1.json rename to data/models/MaziyarPanahi_Qwen2-7B-Instruct-v0.1.json diff --git a/data/MaziyarPanahi_Qwen2-7B-Instruct-v0.8.json b/data/models/MaziyarPanahi_Qwen2-7B-Instruct-v0.8.json similarity index 100% rename from data/MaziyarPanahi_Qwen2-7B-Instruct-v0.8.json rename to data/models/MaziyarPanahi_Qwen2-7B-Instruct-v0.8.json diff --git a/data/MaziyarPanahi_calme-2.1-llama3.1-70b.json b/data/models/MaziyarPanahi_calme-2.1-llama3.1-70b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.1-llama3.1-70b.json rename to data/models/MaziyarPanahi_calme-2.1-llama3.1-70b.json diff --git a/data/MaziyarPanahi_calme-2.1-phi3-4b.json b/data/models/MaziyarPanahi_calme-2.1-phi3-4b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.1-phi3-4b.json rename to data/models/MaziyarPanahi_calme-2.1-phi3-4b.json diff --git a/data/MaziyarPanahi_calme-2.1-phi3.5-4b.json b/data/models/MaziyarPanahi_calme-2.1-phi3.5-4b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.1-phi3.5-4b.json rename to data/models/MaziyarPanahi_calme-2.1-phi3.5-4b.json diff --git a/data/MaziyarPanahi_calme-2.1-qwen2-72b.json b/data/models/MaziyarPanahi_calme-2.1-qwen2-72b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.1-qwen2-72b.json rename to data/models/MaziyarPanahi_calme-2.1-qwen2-72b.json diff --git a/data/MaziyarPanahi_calme-2.1-qwen2-7b.json b/data/models/MaziyarPanahi_calme-2.1-qwen2-7b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.1-qwen2-7b.json rename to data/models/MaziyarPanahi_calme-2.1-qwen2-7b.json diff --git a/data/MaziyarPanahi_calme-2.1-qwen2.5-72b.json b/data/models/MaziyarPanahi_calme-2.1-qwen2.5-72b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.1-qwen2.5-72b.json rename to data/models/MaziyarPanahi_calme-2.1-qwen2.5-72b.json diff --git a/data/MaziyarPanahi_calme-2.1-rys-78b.json b/data/models/MaziyarPanahi_calme-2.1-rys-78b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.1-rys-78b.json rename to data/models/MaziyarPanahi_calme-2.1-rys-78b.json diff --git a/data/MaziyarPanahi_calme-2.2-llama3-70b.json b/data/models/MaziyarPanahi_calme-2.2-llama3-70b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.2-llama3-70b.json rename to data/models/MaziyarPanahi_calme-2.2-llama3-70b.json diff --git a/data/MaziyarPanahi_calme-2.2-llama3.1-70b.json b/data/models/MaziyarPanahi_calme-2.2-llama3.1-70b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.2-llama3.1-70b.json rename to data/models/MaziyarPanahi_calme-2.2-llama3.1-70b.json diff --git a/data/MaziyarPanahi_calme-2.2-phi3-4b.json b/data/models/MaziyarPanahi_calme-2.2-phi3-4b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.2-phi3-4b.json rename to data/models/MaziyarPanahi_calme-2.2-phi3-4b.json diff --git a/data/MaziyarPanahi_calme-2.2-qwen2-72b.json b/data/models/MaziyarPanahi_calme-2.2-qwen2-72b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.2-qwen2-72b.json rename to data/models/MaziyarPanahi_calme-2.2-qwen2-72b.json diff --git a/data/MaziyarPanahi_calme-2.2-qwen2-7b.json b/data/models/MaziyarPanahi_calme-2.2-qwen2-7b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.2-qwen2-7b.json rename to data/models/MaziyarPanahi_calme-2.2-qwen2-7b.json diff --git a/data/MaziyarPanahi_calme-2.2-qwen2.5-72b.json b/data/models/MaziyarPanahi_calme-2.2-qwen2.5-72b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.2-qwen2.5-72b.json rename to data/models/MaziyarPanahi_calme-2.2-qwen2.5-72b.json diff --git a/data/MaziyarPanahi_calme-2.2-rys-78b.json b/data/models/MaziyarPanahi_calme-2.2-rys-78b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.2-rys-78b.json rename to data/models/MaziyarPanahi_calme-2.2-rys-78b.json diff --git a/data/MaziyarPanahi_calme-2.3-llama3-70b.json b/data/models/MaziyarPanahi_calme-2.3-llama3-70b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.3-llama3-70b.json rename to data/models/MaziyarPanahi_calme-2.3-llama3-70b.json diff --git a/data/MaziyarPanahi_calme-2.3-llama3.1-70b.json b/data/models/MaziyarPanahi_calme-2.3-llama3.1-70b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.3-llama3.1-70b.json rename to data/models/MaziyarPanahi_calme-2.3-llama3.1-70b.json diff --git a/data/MaziyarPanahi_calme-2.3-phi3-4b.json b/data/models/MaziyarPanahi_calme-2.3-phi3-4b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.3-phi3-4b.json rename to data/models/MaziyarPanahi_calme-2.3-phi3-4b.json diff --git a/data/MaziyarPanahi_calme-2.3-qwen2-72b.json b/data/models/MaziyarPanahi_calme-2.3-qwen2-72b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.3-qwen2-72b.json rename to data/models/MaziyarPanahi_calme-2.3-qwen2-72b.json diff --git a/data/MaziyarPanahi_calme-2.3-qwen2-7b.json b/data/models/MaziyarPanahi_calme-2.3-qwen2-7b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.3-qwen2-7b.json rename to data/models/MaziyarPanahi_calme-2.3-qwen2-7b.json diff --git a/data/MaziyarPanahi_calme-2.3-rys-78b.json b/data/models/MaziyarPanahi_calme-2.3-rys-78b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.3-rys-78b.json rename to data/models/MaziyarPanahi_calme-2.3-rys-78b.json diff --git a/data/MaziyarPanahi_calme-2.4-llama3-70b.json b/data/models/MaziyarPanahi_calme-2.4-llama3-70b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.4-llama3-70b.json rename to data/models/MaziyarPanahi_calme-2.4-llama3-70b.json diff --git a/data/MaziyarPanahi_calme-2.4-qwen2-7b.json b/data/models/MaziyarPanahi_calme-2.4-qwen2-7b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.4-qwen2-7b.json rename to data/models/MaziyarPanahi_calme-2.4-qwen2-7b.json diff --git a/data/MaziyarPanahi_calme-2.4-rys-78b.json b/data/models/MaziyarPanahi_calme-2.4-rys-78b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.4-rys-78b.json rename to data/models/MaziyarPanahi_calme-2.4-rys-78b.json diff --git a/data/MaziyarPanahi_calme-2.5-qwen2-7b.json b/data/models/MaziyarPanahi_calme-2.5-qwen2-7b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.5-qwen2-7b.json rename to data/models/MaziyarPanahi_calme-2.5-qwen2-7b.json diff --git a/data/MaziyarPanahi_calme-2.6-qwen2-7b.json b/data/models/MaziyarPanahi_calme-2.6-qwen2-7b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.6-qwen2-7b.json rename to data/models/MaziyarPanahi_calme-2.6-qwen2-7b.json diff --git a/data/MaziyarPanahi_calme-2.7-qwen2-7b.json b/data/models/MaziyarPanahi_calme-2.7-qwen2-7b.json similarity index 100% rename from data/MaziyarPanahi_calme-2.7-qwen2-7b.json rename to data/models/MaziyarPanahi_calme-2.7-qwen2-7b.json diff --git a/data/MaziyarPanahi_calme-3.1-baguette-3b.json b/data/models/MaziyarPanahi_calme-3.1-baguette-3b.json similarity index 100% rename from data/MaziyarPanahi_calme-3.1-baguette-3b.json rename to data/models/MaziyarPanahi_calme-3.1-baguette-3b.json diff --git a/data/MaziyarPanahi_calme-3.1-instruct-3b.json b/data/models/MaziyarPanahi_calme-3.1-instruct-3b.json similarity index 100% rename from data/MaziyarPanahi_calme-3.1-instruct-3b.json rename to data/models/MaziyarPanahi_calme-3.1-instruct-3b.json diff --git a/data/MaziyarPanahi_calme-3.1-instruct-78b.json b/data/models/MaziyarPanahi_calme-3.1-instruct-78b.json similarity index 100% rename from data/MaziyarPanahi_calme-3.1-instruct-78b.json rename to data/models/MaziyarPanahi_calme-3.1-instruct-78b.json diff --git a/data/MaziyarPanahi_calme-3.1-llamaloi-3b.json b/data/models/MaziyarPanahi_calme-3.1-llamaloi-3b.json similarity index 100% rename from data/MaziyarPanahi_calme-3.1-llamaloi-3b.json rename to data/models/MaziyarPanahi_calme-3.1-llamaloi-3b.json diff --git a/data/MaziyarPanahi_calme-3.2-baguette-3b.json b/data/models/MaziyarPanahi_calme-3.2-baguette-3b.json similarity index 100% rename from data/MaziyarPanahi_calme-3.2-baguette-3b.json rename to data/models/MaziyarPanahi_calme-3.2-baguette-3b.json diff --git a/data/MaziyarPanahi_calme-3.2-instruct-3b.json b/data/models/MaziyarPanahi_calme-3.2-instruct-3b.json similarity index 100% rename from data/MaziyarPanahi_calme-3.2-instruct-3b.json rename to data/models/MaziyarPanahi_calme-3.2-instruct-3b.json diff --git a/data/MaziyarPanahi_calme-3.2-instruct-78b.json b/data/models/MaziyarPanahi_calme-3.2-instruct-78b.json similarity index 100% rename from data/MaziyarPanahi_calme-3.2-instruct-78b.json rename to data/models/MaziyarPanahi_calme-3.2-instruct-78b.json diff --git a/data/MaziyarPanahi_calme-3.3-baguette-3b.json b/data/models/MaziyarPanahi_calme-3.3-baguette-3b.json similarity index 100% rename from data/MaziyarPanahi_calme-3.3-baguette-3b.json rename to data/models/MaziyarPanahi_calme-3.3-baguette-3b.json diff --git a/data/MaziyarPanahi_calme-3.3-instruct-3b.json b/data/models/MaziyarPanahi_calme-3.3-instruct-3b.json similarity index 100% rename from data/MaziyarPanahi_calme-3.3-instruct-3b.json rename to data/models/MaziyarPanahi_calme-3.3-instruct-3b.json diff --git a/data/Minami-su_Amara-o1-7B-Qwen.json b/data/models/Minami-su_Amara-o1-7B-Qwen.json similarity index 100% rename from data/Minami-su_Amara-o1-7B-Qwen.json rename to data/models/Minami-su_Amara-o1-7B-Qwen.json diff --git a/data/Minami-su_Amara-o2-7B-Qwen.json b/data/models/Minami-su_Amara-o2-7B-Qwen.json similarity index 100% rename from data/Minami-su_Amara-o2-7B-Qwen.json rename to data/models/Minami-su_Amara-o2-7B-Qwen.json diff --git a/data/Minami-su_test-7B-00.json b/data/models/Minami-su_test-7B-00.json similarity index 100% rename from data/Minami-su_test-7B-00.json rename to data/models/Minami-su_test-7B-00.json diff --git a/data/Minami-su_test-7B-01.json b/data/models/Minami-su_test-7B-01.json similarity index 100% rename from data/Minami-su_test-7B-01.json rename to data/models/Minami-su_test-7B-01.json diff --git a/data/Minami-su_test-v2-7B-00.json b/data/models/Minami-su_test-v2-7B-00.json similarity index 100% rename from data/Minami-su_test-v2-7B-00.json rename to data/models/Minami-su_test-v2-7B-00.json diff --git a/data/ModelCloud_Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1.json b/data/models/ModelCloud_Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1.json similarity index 100% rename from data/ModelCloud_Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1.json rename to data/models/ModelCloud_Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1.json diff --git a/data/ModelSpace_GemmaX2-28-9B-v0.1.json b/data/models/ModelSpace_GemmaX2-28-9B-v0.1.json similarity index 100% rename from data/ModelSpace_GemmaX2-28-9B-v0.1.json rename to data/models/ModelSpace_GemmaX2-28-9B-v0.1.json diff --git a/data/MoonRide_Llama-3.2-3B-Khelavaster.json b/data/models/MoonRide_Llama-3.2-3B-Khelavaster.json similarity index 100% rename from data/MoonRide_Llama-3.2-3B-Khelavaster.json rename to data/models/MoonRide_Llama-3.2-3B-Khelavaster.json diff --git a/data/Mostafa8Mehrabi_llama-3.2-1b-Insomnia-ChatBot-merged.json b/data/models/Mostafa8Mehrabi_llama-3.2-1b-Insomnia-ChatBot-merged.json similarity index 100% rename from data/Mostafa8Mehrabi_llama-3.2-1b-Insomnia-ChatBot-merged.json rename to data/models/Mostafa8Mehrabi_llama-3.2-1b-Insomnia-ChatBot-merged.json diff --git a/data/MrRobotoAI_MrRoboto-ProLong-8b-v4i.json b/data/models/MrRobotoAI_MrRoboto-ProLong-8b-v4i.json similarity index 100% rename from data/MrRobotoAI_MrRoboto-ProLong-8b-v4i.json rename to data/models/MrRobotoAI_MrRoboto-ProLong-8b-v4i.json diff --git a/data/MrRobotoAI_MrRoboto-ProLongBASE-pt8-unaligned-8b.json b/data/models/MrRobotoAI_MrRoboto-ProLongBASE-pt8-unaligned-8b.json similarity index 100% rename from data/MrRobotoAI_MrRoboto-ProLongBASE-pt8-unaligned-8b.json rename to data/models/MrRobotoAI_MrRoboto-ProLongBASE-pt8-unaligned-8b.json diff --git a/data/MultivexAI_Gladiator-Mini-Exp-1211-3B.json b/data/models/MultivexAI_Gladiator-Mini-Exp-1211-3B.json similarity index 100% rename from data/MultivexAI_Gladiator-Mini-Exp-1211-3B.json rename to data/models/MultivexAI_Gladiator-Mini-Exp-1211-3B.json diff --git a/data/MultivexAI_Gladiator-Mini-Exp-1221-3B-Instruct-V2.json b/data/models/MultivexAI_Gladiator-Mini-Exp-1221-3B-Instruct-V2.json similarity index 100% rename from data/MultivexAI_Gladiator-Mini-Exp-1221-3B-Instruct-V2.json rename to data/models/MultivexAI_Gladiator-Mini-Exp-1221-3B-Instruct-V2.json diff --git a/data/MultivexAI_Gladiator-Mini-Exp-1221-3B-Instruct.json b/data/models/MultivexAI_Gladiator-Mini-Exp-1221-3B-Instruct.json similarity index 100% rename from data/MultivexAI_Gladiator-Mini-Exp-1221-3B-Instruct.json rename to data/models/MultivexAI_Gladiator-Mini-Exp-1221-3B-Instruct.json diff --git a/data/MultivexAI_Gladiator-Mini-Exp-1222-3B-Instruct.json b/data/models/MultivexAI_Gladiator-Mini-Exp-1222-3B-Instruct.json similarity index 100% rename from data/MultivexAI_Gladiator-Mini-Exp-1222-3B-Instruct.json rename to data/models/MultivexAI_Gladiator-Mini-Exp-1222-3B-Instruct.json diff --git a/data/MultivexAI_Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF.json b/data/models/MultivexAI_Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF.json similarity index 100% rename from data/MultivexAI_Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF.json rename to data/models/MultivexAI_Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF.json diff --git a/data/Mxode_NanoLM-0.3B-Instruct-v1.1.json b/data/models/Mxode_NanoLM-0.3B-Instruct-v1.1.json similarity index 100% rename from data/Mxode_NanoLM-0.3B-Instruct-v1.1.json rename to data/models/Mxode_NanoLM-0.3B-Instruct-v1.1.json diff --git a/data/Mxode_NanoLM-0.3B-Instruct-v1.json b/data/models/Mxode_NanoLM-0.3B-Instruct-v1.json similarity index 100% rename from data/Mxode_NanoLM-0.3B-Instruct-v1.json rename to data/models/Mxode_NanoLM-0.3B-Instruct-v1.json diff --git a/data/Mxode_NanoLM-0.3B-Instruct-v2.json b/data/models/Mxode_NanoLM-0.3B-Instruct-v2.json similarity index 100% rename from data/Mxode_NanoLM-0.3B-Instruct-v2.json rename to data/models/Mxode_NanoLM-0.3B-Instruct-v2.json diff --git a/data/Mxode_NanoLM-1B-Instruct-v1.1.json b/data/models/Mxode_NanoLM-1B-Instruct-v1.1.json similarity index 100% rename from data/Mxode_NanoLM-1B-Instruct-v1.1.json rename to data/models/Mxode_NanoLM-1B-Instruct-v1.1.json diff --git a/data/Mxode_NanoLM-1B-Instruct-v2.json b/data/models/Mxode_NanoLM-1B-Instruct-v2.json similarity index 100% rename from data/Mxode_NanoLM-1B-Instruct-v2.json rename to data/models/Mxode_NanoLM-1B-Instruct-v2.json diff --git a/data/NAPS-ai_naps-gemma-2-27b-v-0.1.0.json b/data/models/NAPS-ai_naps-gemma-2-27b-v-0.1.0.json similarity index 100% rename from data/NAPS-ai_naps-gemma-2-27b-v-0.1.0.json rename to data/models/NAPS-ai_naps-gemma-2-27b-v-0.1.0.json diff --git a/data/NAPS-ai_naps-gemma-2-27b-v0.1.0.json b/data/models/NAPS-ai_naps-gemma-2-27b-v0.1.0.json similarity index 100% rename from data/NAPS-ai_naps-gemma-2-27b-v0.1.0.json rename to data/models/NAPS-ai_naps-gemma-2-27b-v0.1.0.json diff --git a/data/NAPS-ai_naps-llama-3_1-8b-instruct-v0.3.json b/data/models/NAPS-ai_naps-llama-3_1-8b-instruct-v0.3.json similarity index 100% rename from data/NAPS-ai_naps-llama-3_1-8b-instruct-v0.3.json rename to data/models/NAPS-ai_naps-llama-3_1-8b-instruct-v0.3.json diff --git a/data/NAPS-ai_naps-llama-3_1-8b-instruct-v0.4.json b/data/models/NAPS-ai_naps-llama-3_1-8b-instruct-v0.4.json similarity index 100% rename from data/NAPS-ai_naps-llama-3_1-8b-instruct-v0.4.json rename to data/models/NAPS-ai_naps-llama-3_1-8b-instruct-v0.4.json diff --git a/data/NAPS-ai_naps-llama-3_1-instruct-v0.5.0.json b/data/models/NAPS-ai_naps-llama-3_1-instruct-v0.5.0.json similarity index 100% rename from data/NAPS-ai_naps-llama-3_1-instruct-v0.5.0.json rename to data/models/NAPS-ai_naps-llama-3_1-instruct-v0.5.0.json diff --git a/data/NAPS-ai_naps-llama-3_1_instruct-v0.6.0.json b/data/models/NAPS-ai_naps-llama-3_1_instruct-v0.6.0.json similarity index 100% rename from data/NAPS-ai_naps-llama-3_1_instruct-v0.6.0.json rename to data/models/NAPS-ai_naps-llama-3_1_instruct-v0.6.0.json diff --git a/data/NAPS-ai_naps-llama3.1-70B-v0.2-fp16.json b/data/models/NAPS-ai_naps-llama3.1-70B-v0.2-fp16.json similarity index 100% rename from data/NAPS-ai_naps-llama3.1-70B-v0.2-fp16.json rename to data/models/NAPS-ai_naps-llama3.1-70B-v0.2-fp16.json diff --git a/data/NCSOFT_Llama-3-OffsetBias-8B.json b/data/models/NCSOFT_Llama-3-OffsetBias-8B.json similarity index 100% rename from data/NCSOFT_Llama-3-OffsetBias-8B.json rename to data/models/NCSOFT_Llama-3-OffsetBias-8B.json diff --git a/data/NCSOFT_Llama-3-OffsetBias-RM-8B.json b/data/models/NCSOFT_Llama-3-OffsetBias-RM-8B.json similarity index 100% rename from data/NCSOFT_Llama-3-OffsetBias-RM-8B.json rename to data/models/NCSOFT_Llama-3-OffsetBias-RM-8B.json diff --git a/data/NCSOFT_Llama-VARCO-8B-Instruct.json b/data/models/NCSOFT_Llama-VARCO-8B-Instruct.json similarity index 100% rename from data/NCSOFT_Llama-VARCO-8B-Instruct.json rename to data/models/NCSOFT_Llama-VARCO-8B-Instruct.json diff --git a/data/NJS26_NJS_777.json b/data/models/NJS26_NJS_777.json similarity index 100% rename from data/NJS26_NJS_777.json rename to data/models/NJS26_NJS_777.json diff --git a/data/NLPark_AnFeng_v3.1-Avocet.json b/data/models/NLPark_AnFeng_v3.1-Avocet.json similarity index 100% rename from data/NLPark_AnFeng_v3.1-Avocet.json rename to data/models/NLPark_AnFeng_v3.1-Avocet.json diff --git a/data/NLPark_B-and-W_Flycatcher-3AD1E.json b/data/models/NLPark_B-and-W_Flycatcher-3AD1E.json similarity index 100% rename from data/NLPark_B-and-W_Flycatcher-3AD1E.json rename to data/models/NLPark_B-and-W_Flycatcher-3AD1E.json diff --git a/data/NLPark_Shi-Ci-Robin-Test_3AD80.json b/data/models/NLPark_Shi-Ci-Robin-Test_3AD80.json similarity index 100% rename from data/NLPark_Shi-Ci-Robin-Test_3AD80.json rename to data/models/NLPark_Shi-Ci-Robin-Test_3AD80.json diff --git a/data/NTQAI_NxMobileLM-1.5B-SFT.json b/data/models/NTQAI_NxMobileLM-1.5B-SFT.json similarity index 100% rename from data/NTQAI_NxMobileLM-1.5B-SFT.json rename to data/models/NTQAI_NxMobileLM-1.5B-SFT.json diff --git a/data/NTQAI_Nxcode-CQ-7B-orpo.json b/data/models/NTQAI_Nxcode-CQ-7B-orpo.json similarity index 100% rename from data/NTQAI_Nxcode-CQ-7B-orpo.json rename to data/models/NTQAI_Nxcode-CQ-7B-orpo.json diff --git a/data/NYTK_PULI-GPTrio.json b/data/models/NYTK_PULI-GPTrio.json similarity index 100% rename from data/NYTK_PULI-GPTrio.json rename to data/models/NYTK_PULI-GPTrio.json diff --git a/data/NYTK_PULI-LlumiX-32K.json b/data/models/NYTK_PULI-LlumiX-32K.json similarity index 100% rename from data/NYTK_PULI-LlumiX-32K.json rename to data/models/NYTK_PULI-LlumiX-32K.json diff --git a/data/Naveenpoliasetty_llama3-8B-V2.json b/data/models/Naveenpoliasetty_llama3-8B-V2.json similarity index 100% rename from data/Naveenpoliasetty_llama3-8B-V2.json rename to data/models/Naveenpoliasetty_llama3-8B-V2.json diff --git a/data/NbAiLab_nb-llama-3.1-8B-Instruct.json b/data/models/NbAiLab_nb-llama-3.1-8B-Instruct.json similarity index 100% rename from data/NbAiLab_nb-llama-3.1-8B-Instruct.json rename to data/models/NbAiLab_nb-llama-3.1-8B-Instruct.json diff --git a/data/NbAiLab_nb-llama-3.1-8B-sft.json b/data/models/NbAiLab_nb-llama-3.1-8B-sft.json similarity index 100% rename from data/NbAiLab_nb-llama-3.1-8B-sft.json rename to data/models/NbAiLab_nb-llama-3.1-8B-sft.json diff --git a/data/Nekochu_Llama-3.1-8B-German-ORPO.json b/data/models/Nekochu_Llama-3.1-8B-German-ORPO.json similarity index 100% rename from data/Nekochu_Llama-3.1-8B-German-ORPO.json rename to data/models/Nekochu_Llama-3.1-8B-German-ORPO.json diff --git a/data/Nekochu_Llama-3.1-8B-french-DPO.json b/data/models/Nekochu_Llama-3.1-8B-french-DPO.json similarity index 100% rename from data/Nekochu_Llama-3.1-8B-french-DPO.json rename to data/models/Nekochu_Llama-3.1-8B-french-DPO.json diff --git a/data/Nekochu_Luminia-13B-v3.json b/data/models/Nekochu_Luminia-13B-v3.json similarity index 100% rename from data/Nekochu_Luminia-13B-v3.json rename to data/models/Nekochu_Luminia-13B-v3.json diff --git a/data/Nekochu_Luminia-8B-RP.json b/data/models/Nekochu_Luminia-8B-RP.json similarity index 100% rename from data/Nekochu_Luminia-8B-RP.json rename to data/models/Nekochu_Luminia-8B-RP.json diff --git a/data/NeverSleep_Lumimaid-v0.2-12B.json b/data/models/NeverSleep_Lumimaid-v0.2-12B.json similarity index 100% rename from data/NeverSleep_Lumimaid-v0.2-12B.json rename to data/models/NeverSleep_Lumimaid-v0.2-12B.json diff --git a/data/NeverSleep_Lumimaid-v0.2-8B.json b/data/models/NeverSleep_Lumimaid-v0.2-8B.json similarity index 100% rename from data/NeverSleep_Lumimaid-v0.2-8B.json rename to data/models/NeverSleep_Lumimaid-v0.2-8B.json diff --git a/data/Nexesenex_Dolphin3.0-Llama3.1-1B-abliterated.json b/data/models/Nexesenex_Dolphin3.0-Llama3.1-1B-abliterated.json similarity index 100% rename from data/Nexesenex_Dolphin3.0-Llama3.1-1B-abliterated.json rename to data/models/Nexesenex_Dolphin3.0-Llama3.1-1B-abliterated.json diff --git a/data/Nexesenex_Llama_3.1_8b_DeepDive_3_Prev_v1.0.json b/data/models/Nexesenex_Llama_3.1_8b_DeepDive_3_Prev_v1.0.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_DeepDive_3_Prev_v1.0.json rename to data/models/Nexesenex_Llama_3.1_8b_DeepDive_3_Prev_v1.0.json diff --git a/data/Nexesenex_Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0.json b/data/models/Nexesenex_Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0.json rename to data/models/Nexesenex_Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0.json diff --git a/data/Nexesenex_Llama_3.1_8b_DobHerWild_R1_v1.1R.json b/data/models/Nexesenex_Llama_3.1_8b_DobHerWild_R1_v1.1R.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_DobHerWild_R1_v1.1R.json rename to data/models/Nexesenex_Llama_3.1_8b_DobHerWild_R1_v1.1R.json diff --git a/data/Nexesenex_Llama_3.1_8b_DoberWild_v2.01.json b/data/models/Nexesenex_Llama_3.1_8b_DoberWild_v2.01.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_DoberWild_v2.01.json rename to data/models/Nexesenex_Llama_3.1_8b_DoberWild_v2.01.json diff --git a/data/Nexesenex_Llama_3.1_8b_DoberWild_v2.02.json b/data/models/Nexesenex_Llama_3.1_8b_DoberWild_v2.02.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_DoberWild_v2.02.json rename to data/models/Nexesenex_Llama_3.1_8b_DoberWild_v2.02.json diff --git a/data/Nexesenex_Llama_3.1_8b_DoberWild_v2.03.json b/data/models/Nexesenex_Llama_3.1_8b_DoberWild_v2.03.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_DoberWild_v2.03.json rename to data/models/Nexesenex_Llama_3.1_8b_DoberWild_v2.03.json diff --git a/data/Nexesenex_Llama_3.1_8b_DodoWild_v2.01.json b/data/models/Nexesenex_Llama_3.1_8b_DodoWild_v2.01.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_DodoWild_v2.01.json rename to data/models/Nexesenex_Llama_3.1_8b_DodoWild_v2.01.json diff --git a/data/Nexesenex_Llama_3.1_8b_DodoWild_v2.02.json b/data/models/Nexesenex_Llama_3.1_8b_DodoWild_v2.02.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_DodoWild_v2.02.json rename to data/models/Nexesenex_Llama_3.1_8b_DodoWild_v2.02.json diff --git a/data/Nexesenex_Llama_3.1_8b_DodoWild_v2.03.json b/data/models/Nexesenex_Llama_3.1_8b_DodoWild_v2.03.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_DodoWild_v2.03.json rename to data/models/Nexesenex_Llama_3.1_8b_DodoWild_v2.03.json diff --git a/data/Nexesenex_Llama_3.1_8b_DodoWild_v2.10.json b/data/models/Nexesenex_Llama_3.1_8b_DodoWild_v2.10.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_DodoWild_v2.10.json rename to data/models/Nexesenex_Llama_3.1_8b_DodoWild_v2.10.json diff --git a/data/Nexesenex_Llama_3.1_8b_Dolermed_R1_V1.01.json b/data/models/Nexesenex_Llama_3.1_8b_Dolermed_R1_V1.01.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_Dolermed_R1_V1.01.json rename to data/models/Nexesenex_Llama_3.1_8b_Dolermed_R1_V1.01.json diff --git a/data/Nexesenex_Llama_3.1_8b_Dolermed_R1_V1.03.json b/data/models/Nexesenex_Llama_3.1_8b_Dolermed_R1_V1.03.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_Dolermed_R1_V1.03.json rename to data/models/Nexesenex_Llama_3.1_8b_Dolermed_R1_V1.03.json diff --git a/data/Nexesenex_Llama_3.1_8b_Dolermed_V1.01.json b/data/models/Nexesenex_Llama_3.1_8b_Dolermed_V1.01.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_Dolermed_V1.01.json rename to data/models/Nexesenex_Llama_3.1_8b_Dolermed_V1.01.json diff --git a/data/Nexesenex_Llama_3.1_8b_Dolerstormed_V1.04.json b/data/models/Nexesenex_Llama_3.1_8b_Dolerstormed_V1.04.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_Dolerstormed_V1.04.json rename to data/models/Nexesenex_Llama_3.1_8b_Dolerstormed_V1.04.json diff --git a/data/Nexesenex_Llama_3.1_8b_Hermedash_R1_V1.04.json b/data/models/Nexesenex_Llama_3.1_8b_Hermedash_R1_V1.04.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_Hermedash_R1_V1.04.json rename to data/models/Nexesenex_Llama_3.1_8b_Hermedash_R1_V1.04.json diff --git a/data/Nexesenex_Llama_3.1_8b_Hermedive_R1_V1.01.json b/data/models/Nexesenex_Llama_3.1_8b_Hermedive_R1_V1.01.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_Hermedive_R1_V1.01.json rename to data/models/Nexesenex_Llama_3.1_8b_Hermedive_R1_V1.01.json diff --git a/data/Nexesenex_Llama_3.1_8b_Hermedive_R1_V1.03.json b/data/models/Nexesenex_Llama_3.1_8b_Hermedive_R1_V1.03.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_Hermedive_R1_V1.03.json rename to data/models/Nexesenex_Llama_3.1_8b_Hermedive_R1_V1.03.json diff --git a/data/Nexesenex_Llama_3.1_8b_Hermedive_V1.01.json b/data/models/Nexesenex_Llama_3.1_8b_Hermedive_V1.01.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_Hermedive_V1.01.json rename to data/models/Nexesenex_Llama_3.1_8b_Hermedive_V1.01.json diff --git a/data/Nexesenex_Llama_3.1_8b_Mediver_V1.01.json b/data/models/Nexesenex_Llama_3.1_8b_Mediver_V1.01.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_Mediver_V1.01.json rename to data/models/Nexesenex_Llama_3.1_8b_Mediver_V1.01.json diff --git a/data/Nexesenex_Llama_3.1_8b_Medusa_v1.01.json b/data/models/Nexesenex_Llama_3.1_8b_Medusa_v1.01.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_Medusa_v1.01.json rename to data/models/Nexesenex_Llama_3.1_8b_Medusa_v1.01.json diff --git a/data/Nexesenex_Llama_3.1_8b_Smarteaz_0.2_R1.json b/data/models/Nexesenex_Llama_3.1_8b_Smarteaz_0.2_R1.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_Smarteaz_0.2_R1.json rename to data/models/Nexesenex_Llama_3.1_8b_Smarteaz_0.2_R1.json diff --git a/data/Nexesenex_Llama_3.1_8b_Smarteaz_V1.01.json b/data/models/Nexesenex_Llama_3.1_8b_Smarteaz_V1.01.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_Smarteaz_V1.01.json rename to data/models/Nexesenex_Llama_3.1_8b_Smarteaz_V1.01.json diff --git a/data/Nexesenex_Llama_3.1_8b_Stormeder_v1.04.json b/data/models/Nexesenex_Llama_3.1_8b_Stormeder_v1.04.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_Stormeder_v1.04.json rename to data/models/Nexesenex_Llama_3.1_8b_Stormeder_v1.04.json diff --git a/data/Nexesenex_Llama_3.1_8b_Typhoon_v1.03.json b/data/models/Nexesenex_Llama_3.1_8b_Typhoon_v1.03.json similarity index 100% rename from data/Nexesenex_Llama_3.1_8b_Typhoon_v1.03.json rename to data/models/Nexesenex_Llama_3.1_8b_Typhoon_v1.03.json diff --git a/data/Nexesenex_Llama_3.2_1b_AquaSyn_0.1.json b/data/models/Nexesenex_Llama_3.2_1b_AquaSyn_0.1.json similarity index 100% rename from data/Nexesenex_Llama_3.2_1b_AquaSyn_0.1.json rename to data/models/Nexesenex_Llama_3.2_1b_AquaSyn_0.1.json diff --git a/data/Nexesenex_Llama_3.2_1b_AquaSyn_0.11.json b/data/models/Nexesenex_Llama_3.2_1b_AquaSyn_0.11.json similarity index 100% rename from data/Nexesenex_Llama_3.2_1b_AquaSyn_0.11.json rename to data/models/Nexesenex_Llama_3.2_1b_AquaSyn_0.11.json diff --git a/data/Nexesenex_Llama_3.2_1b_Dolto_0.1.json b/data/models/Nexesenex_Llama_3.2_1b_Dolto_0.1.json similarity index 100% rename from data/Nexesenex_Llama_3.2_1b_Dolto_0.1.json rename to data/models/Nexesenex_Llama_3.2_1b_Dolto_0.1.json diff --git a/data/Nexesenex_Llama_3.2_1b_Odyssea_V1.01.json b/data/models/Nexesenex_Llama_3.2_1b_Odyssea_V1.01.json similarity index 100% rename from data/Nexesenex_Llama_3.2_1b_Odyssea_V1.01.json rename to data/models/Nexesenex_Llama_3.2_1b_Odyssea_V1.01.json diff --git a/data/Nexesenex_Llama_3.2_1b_Odyssea_V1.json b/data/models/Nexesenex_Llama_3.2_1b_Odyssea_V1.json similarity index 100% rename from data/Nexesenex_Llama_3.2_1b_Odyssea_V1.json rename to data/models/Nexesenex_Llama_3.2_1b_Odyssea_V1.json diff --git a/data/Nexesenex_Llama_3.2_1b_OpenTree_R1_0.1.json b/data/models/Nexesenex_Llama_3.2_1b_OpenTree_R1_0.1.json similarity index 100% rename from data/Nexesenex_Llama_3.2_1b_OpenTree_R1_0.1.json rename to data/models/Nexesenex_Llama_3.2_1b_OpenTree_R1_0.1.json diff --git a/data/Nexesenex_Llama_3.2_1b_OrcaSun_V1.json b/data/models/Nexesenex_Llama_3.2_1b_OrcaSun_V1.json similarity index 100% rename from data/Nexesenex_Llama_3.2_1b_OrcaSun_V1.json rename to data/models/Nexesenex_Llama_3.2_1b_OrcaSun_V1.json diff --git a/data/Nexesenex_Llama_3.2_1b_RandomLego_RP_R1_0.1.json b/data/models/Nexesenex_Llama_3.2_1b_RandomLego_RP_R1_0.1.json similarity index 100% rename from data/Nexesenex_Llama_3.2_1b_RandomLego_RP_R1_0.1.json rename to data/models/Nexesenex_Llama_3.2_1b_RandomLego_RP_R1_0.1.json diff --git a/data/Nexesenex_Llama_3.2_1b_SunOrca_V1.json b/data/models/Nexesenex_Llama_3.2_1b_SunOrca_V1.json similarity index 100% rename from data/Nexesenex_Llama_3.2_1b_SunOrca_V1.json rename to data/models/Nexesenex_Llama_3.2_1b_SunOrca_V1.json diff --git a/data/Nexesenex_Llama_3.2_1b_Sydonia_0.1.json b/data/models/Nexesenex_Llama_3.2_1b_Sydonia_0.1.json similarity index 100% rename from data/Nexesenex_Llama_3.2_1b_Sydonia_0.1.json rename to data/models/Nexesenex_Llama_3.2_1b_Sydonia_0.1.json diff --git a/data/Nexesenex_Llama_3.2_1b_Syneridol_0.2.json b/data/models/Nexesenex_Llama_3.2_1b_Syneridol_0.2.json similarity index 100% rename from data/Nexesenex_Llama_3.2_1b_Syneridol_0.2.json rename to data/models/Nexesenex_Llama_3.2_1b_Syneridol_0.2.json diff --git a/data/Nexesenex_Llama_3.2_1b_Synopsys_0.1.json b/data/models/Nexesenex_Llama_3.2_1b_Synopsys_0.1.json similarity index 100% rename from data/Nexesenex_Llama_3.2_1b_Synopsys_0.1.json rename to data/models/Nexesenex_Llama_3.2_1b_Synopsys_0.1.json diff --git a/data/Nexesenex_Llama_3.2_1b_Synopsys_0.11.json b/data/models/Nexesenex_Llama_3.2_1b_Synopsys_0.11.json similarity index 100% rename from data/Nexesenex_Llama_3.2_1b_Synopsys_0.11.json rename to data/models/Nexesenex_Llama_3.2_1b_Synopsys_0.11.json diff --git a/data/Nexesenex_Llama_3.2_3b_Kermes_v1.json b/data/models/Nexesenex_Llama_3.2_3b_Kermes_v1.json similarity index 100% rename from data/Nexesenex_Llama_3.2_3b_Kermes_v1.json rename to data/models/Nexesenex_Llama_3.2_3b_Kermes_v1.json diff --git a/data/Nexesenex_Llama_3.2_3b_Kermes_v2.1.json b/data/models/Nexesenex_Llama_3.2_3b_Kermes_v2.1.json similarity index 100% rename from data/Nexesenex_Llama_3.2_3b_Kermes_v2.1.json rename to data/models/Nexesenex_Llama_3.2_3b_Kermes_v2.1.json diff --git a/data/Nexesenex_Llama_3.2_3b_Kermes_v2.json b/data/models/Nexesenex_Llama_3.2_3b_Kermes_v2.json similarity index 100% rename from data/Nexesenex_Llama_3.2_3b_Kermes_v2.json rename to data/models/Nexesenex_Llama_3.2_3b_Kermes_v2.json diff --git a/data/Nexesenex_Nemotron_W_4b_Halo_0.1.json b/data/models/Nexesenex_Nemotron_W_4b_Halo_0.1.json similarity index 100% rename from data/Nexesenex_Nemotron_W_4b_Halo_0.1.json rename to data/models/Nexesenex_Nemotron_W_4b_Halo_0.1.json diff --git a/data/Nexesenex_Nemotron_W_4b_MagLight_0.1.json b/data/models/Nexesenex_Nemotron_W_4b_MagLight_0.1.json similarity index 100% rename from data/Nexesenex_Nemotron_W_4b_MagLight_0.1.json rename to data/models/Nexesenex_Nemotron_W_4b_MagLight_0.1.json diff --git a/data/Nexesenex_Qwen_2.5_3b_Smarteaz_0.01a.json b/data/models/Nexesenex_Qwen_2.5_3b_Smarteaz_0.01a.json similarity index 100% rename from data/Nexesenex_Qwen_2.5_3b_Smarteaz_0.01a.json rename to data/models/Nexesenex_Qwen_2.5_3b_Smarteaz_0.01a.json diff --git a/data/Nexesenex_pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL.json b/data/models/Nexesenex_pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL.json similarity index 100% rename from data/Nexesenex_pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL.json rename to data/models/Nexesenex_pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL.json diff --git a/data/Nexusflow_NexusRaven-V2-13B.json b/data/models/Nexusflow_NexusRaven-V2-13B.json similarity index 100% rename from data/Nexusflow_NexusRaven-V2-13B.json rename to data/models/Nexusflow_NexusRaven-V2-13B.json diff --git a/data/Nexusflow_Starling-RM-34B.json b/data/models/Nexusflow_Starling-RM-34B.json similarity index 100% rename from data/Nexusflow_Starling-RM-34B.json rename to data/models/Nexusflow_Starling-RM-34B.json index 0373bd963cc9795a2ad38fc9da30f8417d64ff6d..8ab4f392fbf3047e7bcb88c6ed9f3a2b6d8e5a37 100644 --- a/data/Nexusflow_Starling-RM-34B.json +++ b/data/models/Nexusflow_Starling-RM-34B.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/Nexusflow_Starling-RM-34B/1766412838.146816", + "evaluation_id": "reward-bench-2/Nexusflow_Starling-RM-34B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8133 + "score": 0.4553 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9693 + "score": 0.4589 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5724 + "score": 0.3187 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6175 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.877 + "score": 0.7556 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8845 + "score": 0.4808 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7137 + "score": 0.1004 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/Nexusflow_Starling-RM-34B/1766412838.146816", + "evaluation_id": "reward-bench/Nexusflow_Starling-RM-34B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4589 + "score": 0.8133 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3187 + "score": 0.9693 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6175 + "score": 0.5724 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7556 + "score": 0.877 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4808 + "score": 0.8845 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.1004 + "score": 0.7137 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/NikolaSigmoid_AceMath-1.5B-Instruct-1epoch.json b/data/models/NikolaSigmoid_AceMath-1.5B-Instruct-1epoch.json similarity index 100% rename from data/NikolaSigmoid_AceMath-1.5B-Instruct-1epoch.json rename to data/models/NikolaSigmoid_AceMath-1.5B-Instruct-1epoch.json diff --git a/data/NikolaSigmoid_AceMath-1.5B-Instruct-dolphin-r1-200.json b/data/models/NikolaSigmoid_AceMath-1.5B-Instruct-dolphin-r1-200.json similarity index 100% rename from data/NikolaSigmoid_AceMath-1.5B-Instruct-dolphin-r1-200.json rename to data/models/NikolaSigmoid_AceMath-1.5B-Instruct-dolphin-r1-200.json diff --git a/data/NikolaSigmoid_DeepSeek-R1-Distill-Qwen-1.5B-500.json b/data/models/NikolaSigmoid_DeepSeek-R1-Distill-Qwen-1.5B-500.json similarity index 100% rename from data/NikolaSigmoid_DeepSeek-R1-Distill-Qwen-1.5B-500.json rename to data/models/NikolaSigmoid_DeepSeek-R1-Distill-Qwen-1.5B-500.json diff --git a/data/NikolaSigmoid_acemath-200.json b/data/models/NikolaSigmoid_acemath-200.json similarity index 100% rename from data/NikolaSigmoid_acemath-200.json rename to data/models/NikolaSigmoid_acemath-200.json diff --git a/data/NikolaSigmoid_phi-4-14b.json b/data/models/NikolaSigmoid_phi-4-14b.json similarity index 100% rename from data/NikolaSigmoid_phi-4-14b.json rename to data/models/NikolaSigmoid_phi-4-14b.json diff --git a/data/NikolaSigmoid_phi-4-1steps.json b/data/models/NikolaSigmoid_phi-4-1steps.json similarity index 100% rename from data/NikolaSigmoid_phi-4-1steps.json rename to data/models/NikolaSigmoid_phi-4-1steps.json diff --git a/data/NikolaSigmoid_phi-4-300steps.json b/data/models/NikolaSigmoid_phi-4-300steps.json similarity index 100% rename from data/NikolaSigmoid_phi-4-300steps.json rename to data/models/NikolaSigmoid_phi-4-300steps.json diff --git a/data/Nitral-AI_Captain-Eris-BMO_Violent-GRPO-v0.420.json b/data/models/Nitral-AI_Captain-Eris-BMO_Violent-GRPO-v0.420.json similarity index 100% rename from data/Nitral-AI_Captain-Eris-BMO_Violent-GRPO-v0.420.json rename to data/models/Nitral-AI_Captain-Eris-BMO_Violent-GRPO-v0.420.json diff --git a/data/Nitral-AI_Captain-Eris_BMO-Violent-12B.json b/data/models/Nitral-AI_Captain-Eris_BMO-Violent-12B.json similarity index 100% rename from data/Nitral-AI_Captain-Eris_BMO-Violent-12B.json rename to data/models/Nitral-AI_Captain-Eris_BMO-Violent-12B.json diff --git a/data/Nitral-AI_Captain-Eris_Violet-GRPO-v0.420.json b/data/models/Nitral-AI_Captain-Eris_Violet-GRPO-v0.420.json similarity index 100% rename from data/Nitral-AI_Captain-Eris_Violet-GRPO-v0.420.json rename to data/models/Nitral-AI_Captain-Eris_Violet-GRPO-v0.420.json diff --git a/data/Nitral-AI_Captain-Eris_Violet-V0.420-12B.json b/data/models/Nitral-AI_Captain-Eris_Violet-V0.420-12B.json similarity index 100% rename from data/Nitral-AI_Captain-Eris_Violet-V0.420-12B.json rename to data/models/Nitral-AI_Captain-Eris_Violet-V0.420-12B.json diff --git a/data/Nitral-AI_Captain_BMO-12B.json b/data/models/Nitral-AI_Captain_BMO-12B.json similarity index 100% rename from data/Nitral-AI_Captain_BMO-12B.json rename to data/models/Nitral-AI_Captain_BMO-12B.json diff --git a/data/Nitral-AI_Hathor_Stable-v0.2-L3-8B.json b/data/models/Nitral-AI_Hathor_Stable-v0.2-L3-8B.json similarity index 100% rename from data/Nitral-AI_Hathor_Stable-v0.2-L3-8B.json rename to data/models/Nitral-AI_Hathor_Stable-v0.2-L3-8B.json diff --git a/data/Nitral-AI_Hathor_Tahsin-L3-8B-v0.85.json b/data/models/Nitral-AI_Hathor_Tahsin-L3-8B-v0.85.json similarity index 100% rename from data/Nitral-AI_Hathor_Tahsin-L3-8B-v0.85.json rename to data/models/Nitral-AI_Hathor_Tahsin-L3-8B-v0.85.json diff --git a/data/Nitral-AI_Nera_Noctis-12B.json b/data/models/Nitral-AI_Nera_Noctis-12B.json similarity index 100% rename from data/Nitral-AI_Nera_Noctis-12B.json rename to data/models/Nitral-AI_Nera_Noctis-12B.json diff --git a/data/Nohobby_MS-Schisandra-22B-v0.1.json b/data/models/Nohobby_MS-Schisandra-22B-v0.1.json similarity index 100% rename from data/Nohobby_MS-Schisandra-22B-v0.1.json rename to data/models/Nohobby_MS-Schisandra-22B-v0.1.json diff --git a/data/Nohobby_MS-Schisandra-22B-v0.2.json b/data/models/Nohobby_MS-Schisandra-22B-v0.2.json similarity index 100% rename from data/Nohobby_MS-Schisandra-22B-v0.2.json rename to data/models/Nohobby_MS-Schisandra-22B-v0.2.json diff --git a/data/Norquinal_Alpha.json b/data/models/Norquinal_Alpha.json similarity index 100% rename from data/Norquinal_Alpha.json rename to data/models/Norquinal_Alpha.json diff --git a/data/Norquinal_Bravo.json b/data/models/Norquinal_Bravo.json similarity index 100% rename from data/Norquinal_Bravo.json rename to data/models/Norquinal_Bravo.json diff --git a/data/Norquinal_Charlie.json b/data/models/Norquinal_Charlie.json similarity index 100% rename from data/Norquinal_Charlie.json rename to data/models/Norquinal_Charlie.json diff --git a/data/Norquinal_Delta.json b/data/models/Norquinal_Delta.json similarity index 100% rename from data/Norquinal_Delta.json rename to data/models/Norquinal_Delta.json diff --git a/data/Norquinal_Echo.json b/data/models/Norquinal_Echo.json similarity index 100% rename from data/Norquinal_Echo.json rename to data/models/Norquinal_Echo.json diff --git a/data/Norquinal_Foxtrot.json b/data/models/Norquinal_Foxtrot.json similarity index 100% rename from data/Norquinal_Foxtrot.json rename to data/models/Norquinal_Foxtrot.json diff --git a/data/Norquinal_Golf.json b/data/models/Norquinal_Golf.json similarity index 100% rename from data/Norquinal_Golf.json rename to data/models/Norquinal_Golf.json diff --git a/data/Norquinal_Hotel.json b/data/models/Norquinal_Hotel.json similarity index 100% rename from data/Norquinal_Hotel.json rename to data/models/Norquinal_Hotel.json diff --git a/data/NotASI_FineTome-Llama3.2-1B-0929.json b/data/models/NotASI_FineTome-Llama3.2-1B-0929.json similarity index 100% rename from data/NotASI_FineTome-Llama3.2-1B-0929.json rename to data/models/NotASI_FineTome-Llama3.2-1B-0929.json diff --git a/data/NotASI_FineTome-Llama3.2-3B-1002.json b/data/models/NotASI_FineTome-Llama3.2-3B-1002.json similarity index 100% rename from data/NotASI_FineTome-Llama3.2-3B-1002.json rename to data/models/NotASI_FineTome-Llama3.2-3B-1002.json diff --git a/data/NotASI_FineTome-v1.5-Llama3.2-1B-1007.json b/data/models/NotASI_FineTome-v1.5-Llama3.2-1B-1007.json similarity index 100% rename from data/NotASI_FineTome-v1.5-Llama3.2-1B-1007.json rename to data/models/NotASI_FineTome-v1.5-Llama3.2-1B-1007.json diff --git a/data/NotASI_FineTome-v1.5-Llama3.2-3B-1007.json b/data/models/NotASI_FineTome-v1.5-Llama3.2-3B-1007.json similarity index 100% rename from data/NotASI_FineTome-v1.5-Llama3.2-3B-1007.json rename to data/models/NotASI_FineTome-v1.5-Llama3.2-3B-1007.json diff --git a/data/NousResearch_DeepHermes-3-Mistral-24B-Preview.json b/data/models/NousResearch_DeepHermes-3-Mistral-24B-Preview.json similarity index 100% rename from data/NousResearch_DeepHermes-3-Mistral-24B-Preview.json rename to data/models/NousResearch_DeepHermes-3-Mistral-24B-Preview.json diff --git a/data/NousResearch_Hermes-2-Pro-Llama-3-8B.json b/data/models/NousResearch_Hermes-2-Pro-Llama-3-8B.json similarity index 100% rename from data/NousResearch_Hermes-2-Pro-Llama-3-8B.json rename to data/models/NousResearch_Hermes-2-Pro-Llama-3-8B.json diff --git a/data/NousResearch_Hermes-2-Pro-Mistral-7B.json b/data/models/NousResearch_Hermes-2-Pro-Mistral-7B.json similarity index 100% rename from data/NousResearch_Hermes-2-Pro-Mistral-7B.json rename to data/models/NousResearch_Hermes-2-Pro-Mistral-7B.json diff --git a/data/NousResearch_Hermes-2-Theta-Llama-3-8B.json b/data/models/NousResearch_Hermes-2-Theta-Llama-3-8B.json similarity index 100% rename from data/NousResearch_Hermes-2-Theta-Llama-3-8B.json rename to data/models/NousResearch_Hermes-2-Theta-Llama-3-8B.json diff --git a/data/NousResearch_Hermes-3-Llama-3.1-70B.json b/data/models/NousResearch_Hermes-3-Llama-3.1-70B.json similarity index 100% rename from data/NousResearch_Hermes-3-Llama-3.1-70B.json rename to data/models/NousResearch_Hermes-3-Llama-3.1-70B.json diff --git a/data/NousResearch_Hermes-3-Llama-3.1-8B.json b/data/models/NousResearch_Hermes-3-Llama-3.1-8B.json similarity index 100% rename from data/NousResearch_Hermes-3-Llama-3.1-8B.json rename to data/models/NousResearch_Hermes-3-Llama-3.1-8B.json diff --git a/data/NousResearch_Hermes-3-Llama-3.2-3B.json b/data/models/NousResearch_Hermes-3-Llama-3.2-3B.json similarity index 100% rename from data/NousResearch_Hermes-3-Llama-3.2-3B.json rename to data/models/NousResearch_Hermes-3-Llama-3.2-3B.json diff --git a/data/NousResearch_Nous-Hermes-2-Mistral-7B-DPO.json b/data/models/NousResearch_Nous-Hermes-2-Mistral-7B-DPO.json similarity index 100% rename from data/NousResearch_Nous-Hermes-2-Mistral-7B-DPO.json rename to data/models/NousResearch_Nous-Hermes-2-Mistral-7B-DPO.json diff --git a/data/NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO.json b/data/models/NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO.json similarity index 100% rename from data/NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO.json rename to data/models/NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO.json diff --git a/data/NousResearch_Nous-Hermes-2-Mixtral-8x7B-SFT.json b/data/models/NousResearch_Nous-Hermes-2-Mixtral-8x7B-SFT.json similarity index 100% rename from data/NousResearch_Nous-Hermes-2-Mixtral-8x7B-SFT.json rename to data/models/NousResearch_Nous-Hermes-2-Mixtral-8x7B-SFT.json diff --git a/data/NousResearch_Nous-Hermes-2-SOLAR-10.7B.json b/data/models/NousResearch_Nous-Hermes-2-SOLAR-10.7B.json similarity index 100% rename from data/NousResearch_Nous-Hermes-2-SOLAR-10.7B.json rename to data/models/NousResearch_Nous-Hermes-2-SOLAR-10.7B.json diff --git a/data/NousResearch_Nous-Hermes-llama-2-7b.json b/data/models/NousResearch_Nous-Hermes-llama-2-7b.json similarity index 100% rename from data/NousResearch_Nous-Hermes-llama-2-7b.json rename to data/models/NousResearch_Nous-Hermes-llama-2-7b.json diff --git a/data/NousResearch_Yarn-Llama-2-13b-128k.json b/data/models/NousResearch_Yarn-Llama-2-13b-128k.json similarity index 100% rename from data/NousResearch_Yarn-Llama-2-13b-128k.json rename to data/models/NousResearch_Yarn-Llama-2-13b-128k.json diff --git a/data/NousResearch_Yarn-Llama-2-7b-128k.json b/data/models/NousResearch_Yarn-Llama-2-7b-128k.json similarity index 100% rename from data/NousResearch_Yarn-Llama-2-7b-128k.json rename to data/models/NousResearch_Yarn-Llama-2-7b-128k.json diff --git a/data/NousResearch_Yarn-Llama-2-7b-64k.json b/data/models/NousResearch_Yarn-Llama-2-7b-64k.json similarity index 100% rename from data/NousResearch_Yarn-Llama-2-7b-64k.json rename to data/models/NousResearch_Yarn-Llama-2-7b-64k.json diff --git a/data/NousResearch_Yarn-Mistral-7b-128k.json b/data/models/NousResearch_Yarn-Mistral-7b-128k.json similarity index 100% rename from data/NousResearch_Yarn-Mistral-7b-128k.json rename to data/models/NousResearch_Yarn-Mistral-7b-128k.json diff --git a/data/NousResearch_Yarn-Mistral-7b-64k.json b/data/models/NousResearch_Yarn-Mistral-7b-64k.json similarity index 100% rename from data/NousResearch_Yarn-Mistral-7b-64k.json rename to data/models/NousResearch_Yarn-Mistral-7b-64k.json diff --git a/data/NousResearch_Yarn-Solar-10b-32k.json b/data/models/NousResearch_Yarn-Solar-10b-32k.json similarity index 100% rename from data/NousResearch_Yarn-Solar-10b-32k.json rename to data/models/NousResearch_Yarn-Solar-10b-32k.json diff --git a/data/NousResearch_Yarn-Solar-10b-64k.json b/data/models/NousResearch_Yarn-Solar-10b-64k.json similarity index 100% rename from data/NousResearch_Yarn-Solar-10b-64k.json rename to data/models/NousResearch_Yarn-Solar-10b-64k.json diff --git a/data/Novaciano_ASTAROTH-3.2-1B.json b/data/models/Novaciano_ASTAROTH-3.2-1B.json similarity index 100% rename from data/Novaciano_ASTAROTH-3.2-1B.json rename to data/models/Novaciano_ASTAROTH-3.2-1B.json diff --git a/data/Novaciano_BLAST_PROCESSING-3.2-1B.json b/data/models/Novaciano_BLAST_PROCESSING-3.2-1B.json similarity index 100% rename from data/Novaciano_BLAST_PROCESSING-3.2-1B.json rename to data/models/Novaciano_BLAST_PROCESSING-3.2-1B.json diff --git a/data/Novaciano_Cerberus-3.2-1B.json b/data/models/Novaciano_Cerberus-3.2-1B.json similarity index 100% rename from data/Novaciano_Cerberus-3.2-1B.json rename to data/models/Novaciano_Cerberus-3.2-1B.json diff --git a/data/Novaciano_Cultist-3.2-1B.json b/data/models/Novaciano_Cultist-3.2-1B.json similarity index 100% rename from data/Novaciano_Cultist-3.2-1B.json rename to data/models/Novaciano_Cultist-3.2-1B.json diff --git a/data/Novaciano_FuseChat-3.2-1B-GRPO_Creative_RP.json b/data/models/Novaciano_FuseChat-3.2-1B-GRPO_Creative_RP.json similarity index 100% rename from data/Novaciano_FuseChat-3.2-1B-GRPO_Creative_RP.json rename to data/models/Novaciano_FuseChat-3.2-1B-GRPO_Creative_RP.json diff --git a/data/Novaciano_Fusetrix-3.2-1B-GRPO_RP_Creative.json b/data/models/Novaciano_Fusetrix-3.2-1B-GRPO_RP_Creative.json similarity index 100% rename from data/Novaciano_Fusetrix-3.2-1B-GRPO_RP_Creative.json rename to data/models/Novaciano_Fusetrix-3.2-1B-GRPO_RP_Creative.json diff --git a/data/Novaciano_Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP.json b/data/models/Novaciano_Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP.json similarity index 100% rename from data/Novaciano_Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP.json rename to data/models/Novaciano_Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP.json diff --git a/data/Novaciano_HarmfulProject-3.2-1B.json b/data/models/Novaciano_HarmfulProject-3.2-1B.json similarity index 100% rename from data/Novaciano_HarmfulProject-3.2-1B.json rename to data/models/Novaciano_HarmfulProject-3.2-1B.json diff --git a/data/Novaciano_LEWD-Mental-Cultist-3.2-1B.json b/data/models/Novaciano_LEWD-Mental-Cultist-3.2-1B.json similarity index 100% rename from data/Novaciano_LEWD-Mental-Cultist-3.2-1B.json rename to data/models/Novaciano_LEWD-Mental-Cultist-3.2-1B.json diff --git a/data/Novaciano_La_Mejor_Mezcla-3.2-1B.json b/data/models/Novaciano_La_Mejor_Mezcla-3.2-1B.json similarity index 100% rename from data/Novaciano_La_Mejor_Mezcla-3.2-1B.json rename to data/models/Novaciano_La_Mejor_Mezcla-3.2-1B.json diff --git a/data/Novaciano_Sigil-Of-Satan-3.2-1B.json b/data/models/Novaciano_Sigil-Of-Satan-3.2-1B.json similarity index 100% rename from data/Novaciano_Sigil-Of-Satan-3.2-1B.json rename to data/models/Novaciano_Sigil-Of-Satan-3.2-1B.json diff --git a/data/NucleusAI_nucleus-22B-token-500B.json b/data/models/NucleusAI_nucleus-22B-token-500B.json similarity index 100% rename from data/NucleusAI_nucleus-22B-token-500B.json rename to data/models/NucleusAI_nucleus-22B-token-500B.json diff --git a/data/NyxKrage_Microsoft_Phi-4.json b/data/models/NyxKrage_Microsoft_Phi-4.json similarity index 100% rename from data/NyxKrage_Microsoft_Phi-4.json rename to data/models/NyxKrage_Microsoft_Phi-4.json diff --git a/data/OEvortex_Emotional-llama-8B.json b/data/models/OEvortex_Emotional-llama-8B.json similarity index 100% rename from data/OEvortex_Emotional-llama-8B.json rename to data/models/OEvortex_Emotional-llama-8B.json diff --git a/data/OEvortex_HelpingAI-15B.json b/data/models/OEvortex_HelpingAI-15B.json similarity index 100% rename from data/OEvortex_HelpingAI-15B.json rename to data/models/OEvortex_HelpingAI-15B.json diff --git a/data/OEvortex_HelpingAI-3B-reloaded.json b/data/models/OEvortex_HelpingAI-3B-reloaded.json similarity index 100% rename from data/OEvortex_HelpingAI-3B-reloaded.json rename to data/models/OEvortex_HelpingAI-3B-reloaded.json diff --git a/data/OEvortex_HelpingAI2-9B.json b/data/models/OEvortex_HelpingAI2-9B.json similarity index 100% rename from data/OEvortex_HelpingAI2-9B.json rename to data/models/OEvortex_HelpingAI2-9B.json diff --git a/data/OEvortex_HelpingAI2.5-10B.json b/data/models/OEvortex_HelpingAI2.5-10B.json similarity index 100% rename from data/OEvortex_HelpingAI2.5-10B.json rename to data/models/OEvortex_HelpingAI2.5-10B.json diff --git a/data/OliveiraJLT_Sagui-7B-Instruct-v0.1.json b/data/models/OliveiraJLT_Sagui-7B-Instruct-v0.1.json similarity index 100% rename from data/OliveiraJLT_Sagui-7B-Instruct-v0.1.json rename to data/models/OliveiraJLT_Sagui-7B-Instruct-v0.1.json diff --git a/data/Omkar1102_code-yi.json b/data/models/Omkar1102_code-yi.json similarity index 100% rename from data/Omkar1102_code-yi.json rename to data/models/Omkar1102_code-yi.json diff --git a/data/OmnicromsBrain_NeuralStar_FusionWriter_4x7b.json b/data/models/OmnicromsBrain_NeuralStar_FusionWriter_4x7b.json similarity index 100% rename from data/OmnicromsBrain_NeuralStar_FusionWriter_4x7b.json rename to data/models/OmnicromsBrain_NeuralStar_FusionWriter_4x7b.json diff --git a/data/OnlyCheeini_greesychat-turbo.json b/data/models/OnlyCheeini_greesychat-turbo.json similarity index 100% rename from data/OnlyCheeini_greesychat-turbo.json rename to data/models/OnlyCheeini_greesychat-turbo.json diff --git a/data/Open-Orca_Mistral-7B-OpenOrca.json b/data/models/Open-Orca_Mistral-7B-OpenOrca.json similarity index 100% rename from data/Open-Orca_Mistral-7B-OpenOrca.json rename to data/models/Open-Orca_Mistral-7B-OpenOrca.json diff --git a/data/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1.json b/data/models/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1.json similarity index 100% rename from data/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1.json rename to data/models/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1.json diff --git a/data/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5.json b/data/models/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5.json similarity index 100% rename from data/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5.json rename to data/models/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5.json diff --git a/data/OpenAssistant_oasst-sft-1-pythia-12b.json b/data/models/OpenAssistant_oasst-sft-1-pythia-12b.json similarity index 100% rename from data/OpenAssistant_oasst-sft-1-pythia-12b.json rename to data/models/OpenAssistant_oasst-sft-1-pythia-12b.json diff --git a/data/OpenAssistant_reward-model-deberta-v3-large-v2.json b/data/models/OpenAssistant_reward-model-deberta-v3-large-v2.json similarity index 100% rename from data/OpenAssistant_reward-model-deberta-v3-large-v2.json rename to data/models/OpenAssistant_reward-model-deberta-v3-large-v2.json index cf1ba02f6dd573bf3e0770ff614660b73e92dbb8..b28ca5c5700af0f5cf22d77dcb1c4fea033ecc2d 100644 --- a/data/OpenAssistant_reward-model-deberta-v3-large-v2.json +++ b/data/models/OpenAssistant_reward-model-deberta-v3-large-v2.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/OpenAssistant_reward-model-deberta-v3-large-v2/1766412838.146816", + "evaluation_id": "reward-bench/OpenAssistant_reward-model-deberta-v3-large-v2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,127 +31,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.32 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3853 + "score": 0.6126 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2687 + "score": 0.8939 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5027 + "score": 0.4518 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3667 + "score": 0.7338 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2768 + "score": 0.3855 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.12 + "score": 0.5836 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], @@ -159,10 +141,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench/OpenAssistant_reward-model-deberta-v3-large-v2/1766412838.146816", + "evaluation_id": "reward-bench-2/OpenAssistant_reward-model-deberta-v3-large-v2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -181,109 +163,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6126 + "score": 0.32 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8939 + "score": 0.3853 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4518 + "score": 0.2687 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5027 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7338 + "score": 0.3667 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3855 + "score": 0.2768 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5836 + "score": 0.12 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/OpenBuddy_openbuddy-falcon3-10b-v24.2-131k.json b/data/models/OpenBuddy_openbuddy-falcon3-10b-v24.2-131k.json similarity index 100% rename from data/OpenBuddy_openbuddy-falcon3-10b-v24.2-131k.json rename to data/models/OpenBuddy_openbuddy-falcon3-10b-v24.2-131k.json diff --git a/data/OpenBuddy_openbuddy-llama3-70b-v21.2-32k.json b/data/models/OpenBuddy_openbuddy-llama3-70b-v21.2-32k.json similarity index 100% rename from data/OpenBuddy_openbuddy-llama3-70b-v21.2-32k.json rename to data/models/OpenBuddy_openbuddy-llama3-70b-v21.2-32k.json diff --git a/data/OpenBuddy_openbuddy-llama3-8b-v21.1-8k.json b/data/models/OpenBuddy_openbuddy-llama3-8b-v21.1-8k.json similarity index 100% rename from data/OpenBuddy_openbuddy-llama3-8b-v21.1-8k.json rename to data/models/OpenBuddy_openbuddy-llama3-8b-v21.1-8k.json diff --git a/data/OpenBuddy_openbuddy-llama3-8b-v21.2-32k.json b/data/models/OpenBuddy_openbuddy-llama3-8b-v21.2-32k.json similarity index 100% rename from data/OpenBuddy_openbuddy-llama3-8b-v21.2-32k.json rename to data/models/OpenBuddy_openbuddy-llama3-8b-v21.2-32k.json diff --git a/data/OpenBuddy_openbuddy-llama3.1-70b-v22.1-131k.json b/data/models/OpenBuddy_openbuddy-llama3.1-70b-v22.1-131k.json similarity index 100% rename from data/OpenBuddy_openbuddy-llama3.1-70b-v22.1-131k.json rename to data/models/OpenBuddy_openbuddy-llama3.1-70b-v22.1-131k.json diff --git a/data/OpenBuddy_openbuddy-llama3.1-8b-v22.2-131k.json b/data/models/OpenBuddy_openbuddy-llama3.1-8b-v22.2-131k.json similarity index 100% rename from data/OpenBuddy_openbuddy-llama3.1-8b-v22.2-131k.json rename to data/models/OpenBuddy_openbuddy-llama3.1-8b-v22.2-131k.json diff --git a/data/OpenBuddy_openbuddy-llama3.1-8b-v22.3-131k.json b/data/models/OpenBuddy_openbuddy-llama3.1-8b-v22.3-131k.json similarity index 100% rename from data/OpenBuddy_openbuddy-llama3.1-8b-v22.3-131k.json rename to data/models/OpenBuddy_openbuddy-llama3.1-8b-v22.3-131k.json diff --git a/data/OpenBuddy_openbuddy-llama3.2-1b-v23.1-131k.json b/data/models/OpenBuddy_openbuddy-llama3.2-1b-v23.1-131k.json similarity index 100% rename from data/OpenBuddy_openbuddy-llama3.2-1b-v23.1-131k.json rename to data/models/OpenBuddy_openbuddy-llama3.2-1b-v23.1-131k.json diff --git a/data/OpenBuddy_openbuddy-llama3.2-3b-v23.2-131k.json b/data/models/OpenBuddy_openbuddy-llama3.2-3b-v23.2-131k.json similarity index 100% rename from data/OpenBuddy_openbuddy-llama3.2-3b-v23.2-131k.json rename to data/models/OpenBuddy_openbuddy-llama3.2-3b-v23.2-131k.json diff --git a/data/OpenBuddy_openbuddy-llama3.3-70b-v24.1-131k.json b/data/models/OpenBuddy_openbuddy-llama3.3-70b-v24.1-131k.json similarity index 100% rename from data/OpenBuddy_openbuddy-llama3.3-70b-v24.1-131k.json rename to data/models/OpenBuddy_openbuddy-llama3.3-70b-v24.1-131k.json diff --git a/data/OpenBuddy_openbuddy-mixtral-7bx8-v18.1-32k.json b/data/models/OpenBuddy_openbuddy-mixtral-7bx8-v18.1-32k.json similarity index 100% rename from data/OpenBuddy_openbuddy-mixtral-7bx8-v18.1-32k.json rename to data/models/OpenBuddy_openbuddy-mixtral-7bx8-v18.1-32k.json diff --git a/data/OpenBuddy_openbuddy-nemotron-70b-v23.1-131k.json b/data/models/OpenBuddy_openbuddy-nemotron-70b-v23.1-131k.json similarity index 100% rename from data/OpenBuddy_openbuddy-nemotron-70b-v23.1-131k.json rename to data/models/OpenBuddy_openbuddy-nemotron-70b-v23.1-131k.json diff --git a/data/OpenBuddy_openbuddy-nemotron-70b-v23.2-131k.json b/data/models/OpenBuddy_openbuddy-nemotron-70b-v23.2-131k.json similarity index 100% rename from data/OpenBuddy_openbuddy-nemotron-70b-v23.2-131k.json rename to data/models/OpenBuddy_openbuddy-nemotron-70b-v23.2-131k.json diff --git a/data/OpenBuddy_openbuddy-qwen2.5llamaify-14b-v23.1-200k.json b/data/models/OpenBuddy_openbuddy-qwen2.5llamaify-14b-v23.1-200k.json similarity index 100% rename from data/OpenBuddy_openbuddy-qwen2.5llamaify-14b-v23.1-200k.json rename to data/models/OpenBuddy_openbuddy-qwen2.5llamaify-14b-v23.1-200k.json diff --git a/data/OpenBuddy_openbuddy-qwen2.5llamaify-14b-v23.3-200k.json b/data/models/OpenBuddy_openbuddy-qwen2.5llamaify-14b-v23.3-200k.json similarity index 100% rename from data/OpenBuddy_openbuddy-qwen2.5llamaify-14b-v23.3-200k.json rename to data/models/OpenBuddy_openbuddy-qwen2.5llamaify-14b-v23.3-200k.json diff --git a/data/OpenBuddy_openbuddy-qwen2.5llamaify-7b-v23.1-200k.json b/data/models/OpenBuddy_openbuddy-qwen2.5llamaify-7b-v23.1-200k.json similarity index 100% rename from data/OpenBuddy_openbuddy-qwen2.5llamaify-7b-v23.1-200k.json rename to data/models/OpenBuddy_openbuddy-qwen2.5llamaify-7b-v23.1-200k.json diff --git a/data/OpenBuddy_openbuddy-qwq-32b-v24.1-200k.json b/data/models/OpenBuddy_openbuddy-qwq-32b-v24.1-200k.json similarity index 100% rename from data/OpenBuddy_openbuddy-qwq-32b-v24.1-200k.json rename to data/models/OpenBuddy_openbuddy-qwq-32b-v24.1-200k.json diff --git a/data/OpenBuddy_openbuddy-qwq-32b-v24.2-200k.json b/data/models/OpenBuddy_openbuddy-qwq-32b-v24.2-200k.json similarity index 100% rename from data/OpenBuddy_openbuddy-qwq-32b-v24.2-200k.json rename to data/models/OpenBuddy_openbuddy-qwq-32b-v24.2-200k.json diff --git a/data/OpenBuddy_openbuddy-yi1.5-34b-v21.3-32k.json b/data/models/OpenBuddy_openbuddy-yi1.5-34b-v21.3-32k.json similarity index 100% rename from data/OpenBuddy_openbuddy-yi1.5-34b-v21.3-32k.json rename to data/models/OpenBuddy_openbuddy-yi1.5-34b-v21.3-32k.json diff --git a/data/OpenBuddy_openbuddy-zero-14b-v22.3-32k.json b/data/models/OpenBuddy_openbuddy-zero-14b-v22.3-32k.json similarity index 100% rename from data/OpenBuddy_openbuddy-zero-14b-v22.3-32k.json rename to data/models/OpenBuddy_openbuddy-zero-14b-v22.3-32k.json diff --git a/data/OpenBuddy_openbuddy-zero-3b-v21.2-32k.json b/data/models/OpenBuddy_openbuddy-zero-3b-v21.2-32k.json similarity index 100% rename from data/OpenBuddy_openbuddy-zero-3b-v21.2-32k.json rename to data/models/OpenBuddy_openbuddy-zero-3b-v21.2-32k.json diff --git a/data/OpenBuddy_openbuddy-zero-56b-v21.2-32k.json b/data/models/OpenBuddy_openbuddy-zero-56b-v21.2-32k.json similarity index 100% rename from data/OpenBuddy_openbuddy-zero-56b-v21.2-32k.json rename to data/models/OpenBuddy_openbuddy-zero-56b-v21.2-32k.json diff --git a/data/OpenGenerativeAI_Bifrost-14B.json b/data/models/OpenGenerativeAI_Bifrost-14B.json similarity index 100% rename from data/OpenGenerativeAI_Bifrost-14B.json rename to data/models/OpenGenerativeAI_Bifrost-14B.json diff --git a/data/OpenGenerativeAI_Bifrost.json b/data/models/OpenGenerativeAI_Bifrost.json similarity index 100% rename from data/OpenGenerativeAI_Bifrost.json rename to data/models/OpenGenerativeAI_Bifrost.json diff --git a/data/OpenLLM-France_Lucie-7B-Instruct-human-data.json b/data/models/OpenLLM-France_Lucie-7B-Instruct-human-data.json similarity index 100% rename from data/OpenLLM-France_Lucie-7B-Instruct-human-data.json rename to data/models/OpenLLM-France_Lucie-7B-Instruct-human-data.json diff --git a/data/OpenLLM-France_Lucie-7B-Instruct-v1.1.json b/data/models/OpenLLM-France_Lucie-7B-Instruct-v1.1.json similarity index 100% rename from data/OpenLLM-France_Lucie-7B-Instruct-v1.1.json rename to data/models/OpenLLM-France_Lucie-7B-Instruct-v1.1.json diff --git a/data/OpenLLM-France_Lucie-7B-Instruct.json b/data/models/OpenLLM-France_Lucie-7B-Instruct.json similarity index 100% rename from data/OpenLLM-France_Lucie-7B-Instruct.json rename to data/models/OpenLLM-France_Lucie-7B-Instruct.json diff --git a/data/OpenLLM-France_Lucie-7B.json b/data/models/OpenLLM-France_Lucie-7B.json similarity index 100% rename from data/OpenLLM-France_Lucie-7B.json rename to data/models/OpenLLM-France_Lucie-7B.json diff --git a/data/OpenLeecher_llama3-8b-lima.json b/data/models/OpenLeecher_llama3-8b-lima.json similarity index 100% rename from data/OpenLeecher_llama3-8b-lima.json rename to data/models/OpenLeecher_llama3-8b-lima.json diff --git a/data/OpenScholar_Llama-3.1_OpenScholar-8B.json b/data/models/OpenScholar_Llama-3.1_OpenScholar-8B.json similarity index 100% rename from data/OpenScholar_Llama-3.1_OpenScholar-8B.json rename to data/models/OpenScholar_Llama-3.1_OpenScholar-8B.json diff --git a/data/Orenguteng_Llama-3.1-8B-Lexi-Uncensored-V2.json b/data/models/Orenguteng_Llama-3.1-8B-Lexi-Uncensored-V2.json similarity index 100% rename from data/Orenguteng_Llama-3.1-8B-Lexi-Uncensored-V2.json rename to data/models/Orenguteng_Llama-3.1-8B-Lexi-Uncensored-V2.json diff --git a/data/Orenguteng_Llama-3.1-8B-Lexi-Uncensored.json b/data/models/Orenguteng_Llama-3.1-8B-Lexi-Uncensored.json similarity index 100% rename from data/Orenguteng_Llama-3.1-8B-Lexi-Uncensored.json rename to data/models/Orenguteng_Llama-3.1-8B-Lexi-Uncensored.json diff --git a/data/Orion-zhen_Qwen2.5-7B-Instruct-Uncensored.json b/data/models/Orion-zhen_Qwen2.5-7B-Instruct-Uncensored.json similarity index 100% rename from data/Orion-zhen_Qwen2.5-7B-Instruct-Uncensored.json rename to data/models/Orion-zhen_Qwen2.5-7B-Instruct-Uncensored.json diff --git a/data/Orion-zhen_phi-4-abliterated.json b/data/models/Orion-zhen_phi-4-abliterated.json similarity index 100% rename from data/Orion-zhen_phi-4-abliterated.json rename to data/models/Orion-zhen_phi-4-abliterated.json diff --git a/data/P0x0_Astra-v1-12B.json b/data/models/P0x0_Astra-v1-12B.json similarity index 100% rename from data/P0x0_Astra-v1-12B.json rename to data/models/P0x0_Astra-v1-12B.json diff --git a/data/PJMixers-Dev_L3.2-Instruct-Thinking-v0.1-1B.json b/data/models/PJMixers-Dev_L3.2-Instruct-Thinking-v0.1-1B.json similarity index 100% rename from data/PJMixers-Dev_L3.2-Instruct-Thinking-v0.1-1B.json rename to data/models/PJMixers-Dev_L3.2-Instruct-Thinking-v0.1-1B.json diff --git a/data/PJMixers-Dev_LLaMa-3.1-Instruct-Interleaved-Zeroed-13B.json b/data/models/PJMixers-Dev_LLaMa-3.1-Instruct-Interleaved-Zeroed-13B.json similarity index 100% rename from data/PJMixers-Dev_LLaMa-3.1-Instruct-Interleaved-Zeroed-13B.json rename to data/models/PJMixers-Dev_LLaMa-3.1-Instruct-Interleaved-Zeroed-13B.json diff --git a/data/PJMixers-Dev_LLaMa-3.1-RomboTiesTest-8B.json b/data/models/PJMixers-Dev_LLaMa-3.1-RomboTiesTest-8B.json similarity index 100% rename from data/PJMixers-Dev_LLaMa-3.1-RomboTiesTest-8B.json rename to data/models/PJMixers-Dev_LLaMa-3.1-RomboTiesTest-8B.json diff --git a/data/PJMixers-Dev_LLaMa-3.1-RomboTiesTest2-8B.json b/data/models/PJMixers-Dev_LLaMa-3.1-RomboTiesTest2-8B.json similarity index 100% rename from data/PJMixers-Dev_LLaMa-3.1-RomboTiesTest2-8B.json rename to data/models/PJMixers-Dev_LLaMa-3.1-RomboTiesTest2-8B.json diff --git a/data/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B.json b/data/models/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B.json similarity index 100% rename from data/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B.json rename to data/models/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B.json diff --git a/data/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B.json b/data/models/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B.json similarity index 100% rename from data/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B.json rename to data/models/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B.json diff --git a/data/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B.json b/data/models/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B.json similarity index 100% rename from data/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B.json rename to data/models/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B.json diff --git a/data/PJMixers-Dev_LLaMa-3.2-Instruct-JankMixBread-v0.1-3B.json b/data/models/PJMixers-Dev_LLaMa-3.2-Instruct-JankMixBread-v0.1-3B.json similarity index 100% rename from data/PJMixers-Dev_LLaMa-3.2-Instruct-JankMixBread-v0.1-3B.json rename to data/models/PJMixers-Dev_LLaMa-3.2-Instruct-JankMixBread-v0.1-3B.json diff --git a/data/PJMixers-Dev_Qwen2.5-RomboTiesTest-7B.json b/data/models/PJMixers-Dev_Qwen2.5-RomboTiesTest-7B.json similarity index 100% rename from data/PJMixers-Dev_Qwen2.5-RomboTiesTest-7B.json rename to data/models/PJMixers-Dev_Qwen2.5-RomboTiesTest-7B.json diff --git a/data/PJMixers_LLaMa-3-CursedStock-v2.0-8B.json b/data/models/PJMixers_LLaMa-3-CursedStock-v2.0-8B.json similarity index 100% rename from data/PJMixers_LLaMa-3-CursedStock-v2.0-8B.json rename to data/models/PJMixers_LLaMa-3-CursedStock-v2.0-8B.json diff --git a/data/PKU-Alignment_beaver-7b-v1.0-cost.json b/data/models/PKU-Alignment_beaver-7b-v1.0-cost.json similarity index 100% rename from data/PKU-Alignment_beaver-7b-v1.0-cost.json rename to data/models/PKU-Alignment_beaver-7b-v1.0-cost.json diff --git a/data/PKU-Alignment_beaver-7b-v1.0-reward.json b/data/models/PKU-Alignment_beaver-7b-v1.0-reward.json similarity index 100% rename from data/PKU-Alignment_beaver-7b-v1.0-reward.json rename to data/models/PKU-Alignment_beaver-7b-v1.0-reward.json diff --git a/data/PKU-Alignment_beaver-7b-v2.0-cost.json b/data/models/PKU-Alignment_beaver-7b-v2.0-cost.json similarity index 100% rename from data/PKU-Alignment_beaver-7b-v2.0-cost.json rename to data/models/PKU-Alignment_beaver-7b-v2.0-cost.json index 5314c675193b80f86e094f04e503749d966c642b..81393cc7dd76e676c977d4915c4171a2a2ea4f5e 100644 --- a/data/PKU-Alignment_beaver-7b-v2.0-cost.json +++ b/data/models/PKU-Alignment_beaver-7b-v2.0-cost.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v2.0-cost/1766412838.146816", + "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v2.0-cost/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,127 +31,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3789 + "score": 0.5957 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.275 + "score": 0.5726 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3333 + "score": 0.4561 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7356 + "score": 0.7608 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2828 + "score": 0.6211 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": -0.01 + "score": 0.5397 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], @@ -159,10 +141,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v2.0-cost/1766412838.146816", + "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v2.0-cost/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -181,109 +163,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5957 + "score": 0.3326 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5726 + "score": 0.3789 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4561 + "score": 0.275 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3333 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7608 + "score": 0.7356 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6211 + "score": 0.2828 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5397 + "score": -0.01 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/PKU-Alignment_beaver-7b-v2.0-reward.json b/data/models/PKU-Alignment_beaver-7b-v2.0-reward.json similarity index 100% rename from data/PKU-Alignment_beaver-7b-v2.0-reward.json rename to data/models/PKU-Alignment_beaver-7b-v2.0-reward.json index e6954fa8026e0ae406f243f530096227275cb53f..4c7c399de00819c04f2b08eddfc902f45548a34f 100644 --- a/data/PKU-Alignment_beaver-7b-v2.0-reward.json +++ b/data/models/PKU-Alignment_beaver-7b-v2.0-reward.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816", + "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,127 +31,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2544 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2168 + "score": 0.6366 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2562 + "score": 0.8994 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3825 + "score": 0.364 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3156 + "score": 0.6041 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2606 + "score": 0.6887 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0944 + "score": 0.6171 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], @@ -159,10 +141,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816", + "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -181,109 +163,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6366 + "score": 0.2544 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8994 + "score": 0.2168 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.364 + "score": 0.2562 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3825 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6041 + "score": 0.3156 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6887 + "score": 0.2606 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6171 + "score": 0.0944 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/Parissa3_test-model.json b/data/models/Parissa3_test-model.json similarity index 100% rename from data/Parissa3_test-model.json rename to data/models/Parissa3_test-model.json diff --git a/data/Pinkstack_PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B.json b/data/models/Pinkstack_PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B.json similarity index 100% rename from data/Pinkstack_PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B.json rename to data/models/Pinkstack_PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B.json diff --git a/data/Pinkstack_SuperThoughts-CoT-14B-16k-o1-QwQ.json b/data/models/Pinkstack_SuperThoughts-CoT-14B-16k-o1-QwQ.json similarity index 100% rename from data/Pinkstack_SuperThoughts-CoT-14B-16k-o1-QwQ.json rename to data/models/Pinkstack_SuperThoughts-CoT-14B-16k-o1-QwQ.json diff --git a/data/Pinkstack_Superthoughts-lite-1.8B-experimental-o1.json b/data/models/Pinkstack_Superthoughts-lite-1.8B-experimental-o1.json similarity index 100% rename from data/Pinkstack_Superthoughts-lite-1.8B-experimental-o1.json rename to data/models/Pinkstack_Superthoughts-lite-1.8B-experimental-o1.json diff --git a/data/Pinkstack_Superthoughts-lite-v1.json b/data/models/Pinkstack_Superthoughts-lite-v1.json similarity index 100% rename from data/Pinkstack_Superthoughts-lite-v1.json rename to data/models/Pinkstack_Superthoughts-lite-v1.json diff --git a/data/PoLL_gpt-3.5-turbo-0125_claude-3-sonnet-2024022....json b/data/models/PoLL_gpt-3.5-turbo-0125_claude-3-sonnet-2024022....json similarity index 100% rename from data/PoLL_gpt-3.5-turbo-0125_claude-3-sonnet-2024022....json rename to data/models/PoLL_gpt-3.5-turbo-0125_claude-3-sonnet-2024022....json diff --git a/data/PocketDoc_Dans-Instruct-CoreCurriculum-12b.json b/data/models/PocketDoc_Dans-Instruct-CoreCurriculum-12b.json similarity index 100% rename from data/PocketDoc_Dans-Instruct-CoreCurriculum-12b.json rename to data/models/PocketDoc_Dans-Instruct-CoreCurriculum-12b.json diff --git a/data/PocketDoc_Dans-PersonalityEngine-V1.1.0-12b.json b/data/models/PocketDoc_Dans-PersonalityEngine-V1.1.0-12b.json similarity index 100% rename from data/PocketDoc_Dans-PersonalityEngine-V1.1.0-12b.json rename to data/models/PocketDoc_Dans-PersonalityEngine-V1.1.0-12b.json diff --git a/data/PocketDoc_Dans-PersonalityEngine-V1.2.0-24b.json b/data/models/PocketDoc_Dans-PersonalityEngine-V1.2.0-24b.json similarity index 100% rename from data/PocketDoc_Dans-PersonalityEngine-V1.2.0-24b.json rename to data/models/PocketDoc_Dans-PersonalityEngine-V1.2.0-24b.json diff --git a/data/PocketDoc_Dans-PersonalityEngine-v1.0.0-8b.json b/data/models/PocketDoc_Dans-PersonalityEngine-v1.0.0-8b.json similarity index 100% rename from data/PocketDoc_Dans-PersonalityEngine-v1.0.0-8b.json rename to data/models/PocketDoc_Dans-PersonalityEngine-v1.0.0-8b.json diff --git a/data/PocketDoc_Dans-SakuraKaze-V1.0.0-12b.json b/data/models/PocketDoc_Dans-SakuraKaze-V1.0.0-12b.json similarity index 100% rename from data/PocketDoc_Dans-SakuraKaze-V1.0.0-12b.json rename to data/models/PocketDoc_Dans-SakuraKaze-V1.0.0-12b.json diff --git a/data/PowerInfer_SmallThinker-3B-Preview.json b/data/models/PowerInfer_SmallThinker-3B-Preview.json similarity index 100% rename from data/PowerInfer_SmallThinker-3B-Preview.json rename to data/models/PowerInfer_SmallThinker-3B-Preview.json diff --git a/data/PranavHarshan_LaMistral-V4.json b/data/models/PranavHarshan_LaMistral-V4.json similarity index 100% rename from data/PranavHarshan_LaMistral-V4.json rename to data/models/PranavHarshan_LaMistral-V4.json diff --git a/data/PranavHarshan_MedNarra-X1.json b/data/models/PranavHarshan_MedNarra-X1.json similarity index 100% rename from data/PranavHarshan_MedNarra-X1.json rename to data/models/PranavHarshan_MedNarra-X1.json diff --git a/data/Pretergeek_OpenChat-3.5-0106_10.7B_48Layers-Appended.json b/data/models/Pretergeek_OpenChat-3.5-0106_10.7B_48Layers-Appended.json similarity index 100% rename from data/Pretergeek_OpenChat-3.5-0106_10.7B_48Layers-Appended.json rename to data/models/Pretergeek_OpenChat-3.5-0106_10.7B_48Layers-Appended.json diff --git a/data/Pretergeek_OpenChat-3.5-0106_10.7B_48Layers-Interleaved.json b/data/models/Pretergeek_OpenChat-3.5-0106_10.7B_48Layers-Interleaved.json similarity index 100% rename from data/Pretergeek_OpenChat-3.5-0106_10.7B_48Layers-Interleaved.json rename to data/models/Pretergeek_OpenChat-3.5-0106_10.7B_48Layers-Interleaved.json diff --git a/data/Pretergeek_OpenChat-3.5-0106_32K-PoSE.json b/data/models/Pretergeek_OpenChat-3.5-0106_32K-PoSE.json similarity index 100% rename from data/Pretergeek_OpenChat-3.5-0106_32K-PoSE.json rename to data/models/Pretergeek_OpenChat-3.5-0106_32K-PoSE.json diff --git a/data/Pretergeek_OpenChat-3.5-0106_8.11B_36Layers-Appended.json b/data/models/Pretergeek_OpenChat-3.5-0106_8.11B_36Layers-Appended.json similarity index 100% rename from data/Pretergeek_OpenChat-3.5-0106_8.11B_36Layers-Appended.json rename to data/models/Pretergeek_OpenChat-3.5-0106_8.11B_36Layers-Appended.json diff --git a/data/Pretergeek_OpenChat-3.5-0106_8.11B_36Layers-Interleaved.json b/data/models/Pretergeek_OpenChat-3.5-0106_8.11B_36Layers-Interleaved.json similarity index 100% rename from data/Pretergeek_OpenChat-3.5-0106_8.11B_36Layers-Interleaved.json rename to data/models/Pretergeek_OpenChat-3.5-0106_8.11B_36Layers-Interleaved.json diff --git a/data/Pretergeek_OpenChat-3.5-0106_8.99B_40Layers-Appended.json b/data/models/Pretergeek_OpenChat-3.5-0106_8.99B_40Layers-Appended.json similarity index 100% rename from data/Pretergeek_OpenChat-3.5-0106_8.99B_40Layers-Appended.json rename to data/models/Pretergeek_OpenChat-3.5-0106_8.99B_40Layers-Appended.json diff --git a/data/Pretergeek_OpenChat-3.5-0106_8.99B_40Layers-Interleaved.json b/data/models/Pretergeek_OpenChat-3.5-0106_8.99B_40Layers-Interleaved.json similarity index 100% rename from data/Pretergeek_OpenChat-3.5-0106_8.99B_40Layers-Interleaved.json rename to data/models/Pretergeek_OpenChat-3.5-0106_8.99B_40Layers-Interleaved.json diff --git a/data/Pretergeek_OpenChat-3.5-0106_9.86B_44Layers-Appended.json b/data/models/Pretergeek_OpenChat-3.5-0106_9.86B_44Layers-Appended.json similarity index 100% rename from data/Pretergeek_OpenChat-3.5-0106_9.86B_44Layers-Appended.json rename to data/models/Pretergeek_OpenChat-3.5-0106_9.86B_44Layers-Appended.json diff --git a/data/Pretergeek_openchat-3.5-0106_Rebased_Mistral-7B-v0.2.json b/data/models/Pretergeek_openchat-3.5-0106_Rebased_Mistral-7B-v0.2.json similarity index 100% rename from data/Pretergeek_openchat-3.5-0106_Rebased_Mistral-7B-v0.2.json rename to data/models/Pretergeek_openchat-3.5-0106_Rebased_Mistral-7B-v0.2.json diff --git a/data/PrimeIntellect_INTELLECT-1-Instruct.json b/data/models/PrimeIntellect_INTELLECT-1-Instruct.json similarity index 100% rename from data/PrimeIntellect_INTELLECT-1-Instruct.json rename to data/models/PrimeIntellect_INTELLECT-1-Instruct.json diff --git a/data/PrimeIntellect_INTELLECT-1.json b/data/models/PrimeIntellect_INTELLECT-1.json similarity index 100% rename from data/PrimeIntellect_INTELLECT-1.json rename to data/models/PrimeIntellect_INTELLECT-1.json diff --git a/data/PuxAI_LUA_model.json b/data/models/PuxAI_LUA_model.json similarity index 100% rename from data/PuxAI_LUA_model.json rename to data/models/PuxAI_LUA_model.json diff --git a/data/PygmalionAI_pygmalion-6b.json b/data/models/PygmalionAI_pygmalion-6b.json similarity index 100% rename from data/PygmalionAI_pygmalion-6b.json rename to data/models/PygmalionAI_pygmalion-6b.json diff --git a/data/Q-bert_MetaMath-1B.json b/data/models/Q-bert_MetaMath-1B.json similarity index 100% rename from data/Q-bert_MetaMath-1B.json rename to data/models/Q-bert_MetaMath-1B.json diff --git a/data/Quazim0t0_1up-14b.json b/data/models/Quazim0t0_1up-14b.json similarity index 100% rename from data/Quazim0t0_1up-14b.json rename to data/models/Quazim0t0_1up-14b.json diff --git a/data/Quazim0t0_Adamant-14B-sce.json b/data/models/Quazim0t0_Adamant-14B-sce.json similarity index 100% rename from data/Quazim0t0_Adamant-14B-sce.json rename to data/models/Quazim0t0_Adamant-14B-sce.json diff --git a/data/Quazim0t0_Alice-14B.json b/data/models/Quazim0t0_Alice-14B.json similarity index 100% rename from data/Quazim0t0_Alice-14B.json rename to data/models/Quazim0t0_Alice-14B.json diff --git a/data/Quazim0t0_Alien-CoT-14B-sce.json b/data/models/Quazim0t0_Alien-CoT-14B-sce.json similarity index 100% rename from data/Quazim0t0_Alien-CoT-14B-sce.json rename to data/models/Quazim0t0_Alien-CoT-14B-sce.json diff --git a/data/Quazim0t0_Aura-8B-Linear.json b/data/models/Quazim0t0_Aura-8B-Linear.json similarity index 100% rename from data/Quazim0t0_Aura-8B-Linear.json rename to data/models/Quazim0t0_Aura-8B-Linear.json diff --git a/data/Quazim0t0_Casa-14b-sce.json b/data/models/Quazim0t0_Casa-14b-sce.json similarity index 100% rename from data/Quazim0t0_Casa-14b-sce.json rename to data/models/Quazim0t0_Casa-14b-sce.json diff --git a/data/Quazim0t0_Charlie-8B-Linear.json b/data/models/Quazim0t0_Charlie-8B-Linear.json similarity index 100% rename from data/Quazim0t0_Charlie-8B-Linear.json rename to data/models/Quazim0t0_Charlie-8B-Linear.json diff --git a/data/Quazim0t0_Chromatic-8b-sce.json b/data/models/Quazim0t0_Chromatic-8b-sce.json similarity index 100% rename from data/Quazim0t0_Chromatic-8b-sce.json rename to data/models/Quazim0t0_Chromatic-8b-sce.json diff --git a/data/Quazim0t0_CoT_Phi.json b/data/models/Quazim0t0_CoT_Phi.json similarity index 100% rename from data/Quazim0t0_CoT_Phi.json rename to data/models/Quazim0t0_CoT_Phi.json diff --git a/data/Quazim0t0_Dyson-14b.json b/data/models/Quazim0t0_Dyson-14b.json similarity index 100% rename from data/Quazim0t0_Dyson-14b.json rename to data/models/Quazim0t0_Dyson-14b.json diff --git a/data/Quazim0t0_Edu-14B-Linear.json b/data/models/Quazim0t0_Edu-14B-Linear.json similarity index 100% rename from data/Quazim0t0_Edu-14B-Linear.json rename to data/models/Quazim0t0_Edu-14B-Linear.json diff --git a/data/Quazim0t0_Fugazi14b.json b/data/models/Quazim0t0_Fugazi14b.json similarity index 100% rename from data/Quazim0t0_Fugazi14b.json rename to data/models/Quazim0t0_Fugazi14b.json diff --git a/data/Quazim0t0_GZA-14B-sce.json b/data/models/Quazim0t0_GZA-14B-sce.json similarity index 100% rename from data/Quazim0t0_GZA-14B-sce.json rename to data/models/Quazim0t0_GZA-14B-sce.json diff --git a/data/Quazim0t0_Geedorah-14B.json b/data/models/Quazim0t0_Geedorah-14B.json similarity index 100% rename from data/Quazim0t0_Geedorah-14B.json rename to data/models/Quazim0t0_Geedorah-14B.json diff --git a/data/Quazim0t0_GivingTree-8b-sce.json b/data/models/Quazim0t0_GivingTree-8b-sce.json similarity index 100% rename from data/Quazim0t0_GivingTree-8b-sce.json rename to data/models/Quazim0t0_GivingTree-8b-sce.json diff --git a/data/Quazim0t0_GuiltySpark-14B-ties.json b/data/models/Quazim0t0_GuiltySpark-14B-ties.json similarity index 100% rename from data/Quazim0t0_GuiltySpark-14B-ties.json rename to data/models/Quazim0t0_GuiltySpark-14B-ties.json diff --git a/data/Quazim0t0_Halo-14B-sce.json b/data/models/Quazim0t0_Halo-14B-sce.json similarity index 100% rename from data/Quazim0t0_Halo-14B-sce.json rename to data/models/Quazim0t0_Halo-14B-sce.json diff --git a/data/Quazim0t0_Heretic1.5b.json b/data/models/Quazim0t0_Heretic1.5b.json similarity index 100% rename from data/Quazim0t0_Heretic1.5b.json rename to data/models/Quazim0t0_Heretic1.5b.json diff --git a/data/Quazim0t0_Hyde-14b-sce.json b/data/models/Quazim0t0_Hyde-14b-sce.json similarity index 100% rename from data/Quazim0t0_Hyde-14b-sce.json rename to data/models/Quazim0t0_Hyde-14b-sce.json diff --git a/data/Quazim0t0_Imagine-v0.5-16bit.json b/data/models/Quazim0t0_Imagine-v0.5-16bit.json similarity index 100% rename from data/Quazim0t0_Imagine-v0.5-16bit.json rename to data/models/Quazim0t0_Imagine-v0.5-16bit.json diff --git a/data/Quazim0t0_Imbue-14b.json b/data/models/Quazim0t0_Imbue-14b.json similarity index 100% rename from data/Quazim0t0_Imbue-14b.json rename to data/models/Quazim0t0_Imbue-14b.json diff --git a/data/Quazim0t0_Insom.json b/data/models/Quazim0t0_Insom.json similarity index 100% rename from data/Quazim0t0_Insom.json rename to data/models/Quazim0t0_Insom.json diff --git a/data/Quazim0t0_InspectorDeck-14B-sce.json b/data/models/Quazim0t0_InspectorDeck-14B-sce.json similarity index 100% rename from data/Quazim0t0_InspectorDeck-14B-sce.json rename to data/models/Quazim0t0_InspectorDeck-14B-sce.json diff --git a/data/Quazim0t0_Jekyl-8b-sce.json b/data/models/Quazim0t0_Jekyl-8b-sce.json similarity index 100% rename from data/Quazim0t0_Jekyl-8b-sce.json rename to data/models/Quazim0t0_Jekyl-8b-sce.json diff --git a/data/Quazim0t0_Jigsaw-14B-Linear.json b/data/models/Quazim0t0_Jigsaw-14B-Linear.json similarity index 100% rename from data/Quazim0t0_Jigsaw-14B-Linear.json rename to data/models/Quazim0t0_Jigsaw-14B-Linear.json diff --git a/data/Quazim0t0_Katana-8b-sce.json b/data/models/Quazim0t0_Katana-8b-sce.json similarity index 100% rename from data/Quazim0t0_Katana-8b-sce.json rename to data/models/Quazim0t0_Katana-8b-sce.json diff --git a/data/Quazim0t0_Knot-CoT-14B-sce.json b/data/models/Quazim0t0_Knot-CoT-14B-sce.json similarity index 100% rename from data/Quazim0t0_Knot-CoT-14B-sce.json rename to data/models/Quazim0t0_Knot-CoT-14B-sce.json diff --git a/data/Quazim0t0_Lineage-14B.json b/data/models/Quazim0t0_Lineage-14B.json similarity index 100% rename from data/Quazim0t0_Lineage-14B.json rename to data/models/Quazim0t0_Lineage-14B.json diff --git a/data/Quazim0t0_Lo-Phi-14b.json b/data/models/Quazim0t0_Lo-Phi-14b.json similarity index 100% rename from data/Quazim0t0_Lo-Phi-14b.json rename to data/models/Quazim0t0_Lo-Phi-14b.json diff --git a/data/Quazim0t0_Loke-14B-sce.json b/data/models/Quazim0t0_Loke-14B-sce.json similarity index 100% rename from data/Quazim0t0_Loke-14B-sce.json rename to data/models/Quazim0t0_Loke-14B-sce.json diff --git a/data/Quazim0t0_MFDOOM-14B.json b/data/models/Quazim0t0_MFDOOM-14B.json similarity index 100% rename from data/Quazim0t0_MFDOOM-14B.json rename to data/models/Quazim0t0_MFDOOM-14B.json diff --git a/data/Quazim0t0_MFGRIMM-14B.json b/data/models/Quazim0t0_MFGRIMM-14B.json similarity index 100% rename from data/Quazim0t0_MFGRIMM-14B.json rename to data/models/Quazim0t0_MFGRIMM-14B.json diff --git a/data/Quazim0t0_Math_Phi4_Reason.json b/data/models/Quazim0t0_Math_Phi4_Reason.json similarity index 100% rename from data/Quazim0t0_Math_Phi4_Reason.json rename to data/models/Quazim0t0_Math_Phi4_Reason.json diff --git a/data/Quazim0t0_Mithril-14B-sce.json b/data/models/Quazim0t0_Mithril-14B-sce.json similarity index 100% rename from data/Quazim0t0_Mithril-14B-sce.json rename to data/models/Quazim0t0_Mithril-14B-sce.json diff --git a/data/Quazim0t0_Mononoke-14B-sce.json b/data/models/Quazim0t0_Mononoke-14B-sce.json similarity index 100% rename from data/Quazim0t0_Mononoke-14B-sce.json rename to data/models/Quazim0t0_Mononoke-14B-sce.json diff --git a/data/Quazim0t0_Motion-8B-Linear.json b/data/models/Quazim0t0_Motion-8B-Linear.json similarity index 100% rename from data/Quazim0t0_Motion-8B-Linear.json rename to data/models/Quazim0t0_Motion-8B-Linear.json diff --git a/data/Quazim0t0_Mouse-9B.json b/data/models/Quazim0t0_Mouse-9B.json similarity index 100% rename from data/Quazim0t0_Mouse-9B.json rename to data/models/Quazim0t0_Mouse-9B.json diff --git a/data/Quazim0t0_Nova-14b-sce.json b/data/models/Quazim0t0_Nova-14b-sce.json similarity index 100% rename from data/Quazim0t0_Nova-14b-sce.json rename to data/models/Quazim0t0_Nova-14b-sce.json diff --git a/data/Quazim0t0_NovaScotia-14b-stock.json b/data/models/Quazim0t0_NovaScotia-14b-stock.json similarity index 100% rename from data/Quazim0t0_NovaScotia-14b-stock.json rename to data/models/Quazim0t0_NovaScotia-14b-stock.json diff --git a/data/models/Quazim0t0_ODB-14B-sce.json b/data/models/Quazim0t0_ODB-14B-sce.json new file mode 100644 index 0000000000000000000000000000000000000000..247bc5e16960cf2329de952f28590c74176a7ba4 --- /dev/null +++ b/data/models/Quazim0t0_ODB-14B-sce.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "ODB-14B-sce", + "id": "Quazim0t0/ODB-14B-sce", + "developer": "Quazim0t0", + "inference_platform": "unknown", + "additional_details": { + "precision": "bfloat16", + "architecture": "Unknown", + "params_billions": "0.0" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/Quazim0t0_ODB-14B-sce/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2922 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6559 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2545 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2659 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3929 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5207 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/Quazim0t0_ODB-14b-sce.json b/data/models/Quazim0t0_ODB-14b-sce.json similarity index 100% rename from data/Quazim0t0_ODB-14b-sce.json rename to data/models/Quazim0t0_ODB-14b-sce.json diff --git a/data/Quazim0t0_Oasis-14B-ties.json b/data/models/Quazim0t0_Oasis-14B-ties.json similarity index 100% rename from data/Quazim0t0_Oasis-14B-ties.json rename to data/models/Quazim0t0_Oasis-14B-ties.json diff --git a/data/Quazim0t0_Origami-14B-sce.json b/data/models/Quazim0t0_Origami-14B-sce.json similarity index 100% rename from data/Quazim0t0_Origami-14B-sce.json rename to data/models/Quazim0t0_Origami-14B-sce.json diff --git a/data/Quazim0t0_Phi4.Turn.R1Distill.16bit.json b/data/models/Quazim0t0_Phi4.Turn.R1Distill.16bit.json similarity index 100% rename from data/Quazim0t0_Phi4.Turn.R1Distill.16bit.json rename to data/models/Quazim0t0_Phi4.Turn.R1Distill.16bit.json diff --git a/data/Quazim0t0_Phi4.Turn.R1Distill_v1.5.1-Tensors.json b/data/models/Quazim0t0_Phi4.Turn.R1Distill_v1.5.1-Tensors.json similarity index 100% rename from data/Quazim0t0_Phi4.Turn.R1Distill_v1.5.1-Tensors.json rename to data/models/Quazim0t0_Phi4.Turn.R1Distill_v1.5.1-Tensors.json diff --git a/data/Quazim0t0_Phi4Basis-14B-sce.json b/data/models/Quazim0t0_Phi4Basis-14B-sce.json similarity index 100% rename from data/Quazim0t0_Phi4Basis-14B-sce.json rename to data/models/Quazim0t0_Phi4Basis-14B-sce.json diff --git a/data/Quazim0t0_Ponder-14B-linear.json b/data/models/Quazim0t0_Ponder-14B-linear.json similarity index 100% rename from data/Quazim0t0_Ponder-14B-linear.json rename to data/models/Quazim0t0_Ponder-14B-linear.json diff --git a/data/Quazim0t0_RZA-14B-sce.json b/data/models/Quazim0t0_RZA-14B-sce.json similarity index 100% rename from data/Quazim0t0_RZA-14B-sce.json rename to data/models/Quazim0t0_RZA-14B-sce.json diff --git a/data/Quazim0t0_Rosemary-14b.json b/data/models/Quazim0t0_Rosemary-14b.json similarity index 100% rename from data/Quazim0t0_Rosemary-14b.json rename to data/models/Quazim0t0_Rosemary-14b.json diff --git a/data/Quazim0t0_Rune-14b.json b/data/models/Quazim0t0_Rune-14b.json similarity index 100% rename from data/Quazim0t0_Rune-14b.json rename to data/models/Quazim0t0_Rune-14b.json diff --git a/data/Quazim0t0_SZA-14B-sce.json b/data/models/Quazim0t0_SZA-14B-sce.json similarity index 100% rename from data/Quazim0t0_SZA-14B-sce.json rename to data/models/Quazim0t0_SZA-14B-sce.json diff --git a/data/Quazim0t0_Sake-20b.json b/data/models/Quazim0t0_Sake-20b.json similarity index 100% rename from data/Quazim0t0_Sake-20b.json rename to data/models/Quazim0t0_Sake-20b.json diff --git a/data/Quazim0t0_Spok-14b-sce.json b/data/models/Quazim0t0_Spok-14b-sce.json similarity index 100% rename from data/Quazim0t0_Spok-14b-sce.json rename to data/models/Quazim0t0_Spok-14b-sce.json diff --git a/data/Quazim0t0_Sumatra-20b.json b/data/models/Quazim0t0_Sumatra-20b.json similarity index 100% rename from data/Quazim0t0_Sumatra-20b.json rename to data/models/Quazim0t0_Sumatra-20b.json diff --git a/data/Quazim0t0_SuperNova14b.json b/data/models/Quazim0t0_SuperNova14b.json similarity index 100% rename from data/Quazim0t0_SuperNova14b.json rename to data/models/Quazim0t0_SuperNova14b.json diff --git a/data/Quazim0t0_TB0-8B-sce.json b/data/models/Quazim0t0_TB0-8B-sce.json similarity index 100% rename from data/Quazim0t0_TB0-8B-sce.json rename to data/models/Quazim0t0_TB0-8B-sce.json diff --git a/data/Quazim0t0_TBL-8B-sce.json b/data/models/Quazim0t0_TBL-8B-sce.json similarity index 100% rename from data/Quazim0t0_TBL-8B-sce.json rename to data/models/Quazim0t0_TBL-8B-sce.json diff --git a/data/Quazim0t0_ThinkPhi1.1-Tensors.json b/data/models/Quazim0t0_ThinkPhi1.1-Tensors.json similarity index 100% rename from data/Quazim0t0_ThinkPhi1.1-Tensors.json rename to data/models/Quazim0t0_ThinkPhi1.1-Tensors.json diff --git a/data/Quazim0t0_Venti-20b.json b/data/models/Quazim0t0_Venti-20b.json similarity index 100% rename from data/Quazim0t0_Venti-20b.json rename to data/models/Quazim0t0_Venti-20b.json diff --git a/data/Quazim0t0_Venti-Blend-sce.json b/data/models/Quazim0t0_Venti-Blend-sce.json similarity index 100% rename from data/Quazim0t0_Venti-Blend-sce.json rename to data/models/Quazim0t0_Venti-Blend-sce.json diff --git a/data/Quazim0t0_Vine-14b-sce.json b/data/models/Quazim0t0_Vine-14b-sce.json similarity index 100% rename from data/Quazim0t0_Vine-14b-sce.json rename to data/models/Quazim0t0_Vine-14b-sce.json diff --git a/data/Quazim0t0_Wendy-14B.json b/data/models/Quazim0t0_Wendy-14B.json similarity index 100% rename from data/Quazim0t0_Wendy-14B.json rename to data/models/Quazim0t0_Wendy-14B.json diff --git a/data/Quazim0t0_Wu-14b-sce.json b/data/models/Quazim0t0_Wu-14b-sce.json similarity index 100% rename from data/Quazim0t0_Wu-14b-sce.json rename to data/models/Quazim0t0_Wu-14b-sce.json diff --git a/data/Quazim0t0_bloom-14b-stock.json b/data/models/Quazim0t0_bloom-14b-stock.json similarity index 100% rename from data/Quazim0t0_bloom-14b-stock.json rename to data/models/Quazim0t0_bloom-14b-stock.json diff --git a/data/Quazim0t0_caramel-14B.json b/data/models/Quazim0t0_caramel-14B.json similarity index 100% rename from data/Quazim0t0_caramel-14B.json rename to data/models/Quazim0t0_caramel-14B.json diff --git a/data/Quazim0t0_graphite-14b-sce.json b/data/models/Quazim0t0_graphite-14b-sce.json similarity index 100% rename from data/Quazim0t0_graphite-14b-sce.json rename to data/models/Quazim0t0_graphite-14b-sce.json diff --git a/data/Quazim0t0_mocha-14B.json b/data/models/Quazim0t0_mocha-14B.json similarity index 100% rename from data/Quazim0t0_mocha-14B.json rename to data/models/Quazim0t0_mocha-14B.json diff --git a/data/Quazim0t0_mosaic-14b-sce.json b/data/models/Quazim0t0_mosaic-14b-sce.json similarity index 100% rename from data/Quazim0t0_mosaic-14b-sce.json rename to data/models/Quazim0t0_mosaic-14b-sce.json diff --git a/data/Quazim0t0_tesseract-14b-stock.json b/data/models/Quazim0t0_tesseract-14b-stock.json similarity index 100% rename from data/Quazim0t0_tesseract-14b-stock.json rename to data/models/Quazim0t0_tesseract-14b-stock.json diff --git a/data/Quazim0t0_time-14b-stock.json b/data/models/Quazim0t0_time-14b-stock.json similarity index 100% rename from data/Quazim0t0_time-14b-stock.json rename to data/models/Quazim0t0_time-14b-stock.json diff --git a/data/Qwen_QwQ-32B-Preview.json b/data/models/Qwen_QwQ-32B-Preview.json similarity index 100% rename from data/Qwen_QwQ-32B-Preview.json rename to data/models/Qwen_QwQ-32B-Preview.json diff --git a/data/Qwen_QwQ-32B.json b/data/models/Qwen_QwQ-32B.json similarity index 100% rename from data/Qwen_QwQ-32B.json rename to data/models/Qwen_QwQ-32B.json diff --git a/data/Qwen_Qwen1.5-0.5B-Chat.json b/data/models/Qwen_Qwen1.5-0.5B-Chat.json similarity index 100% rename from data/Qwen_Qwen1.5-0.5B-Chat.json rename to data/models/Qwen_Qwen1.5-0.5B-Chat.json diff --git a/data/Qwen_Qwen1.5-0.5B.json b/data/models/Qwen_Qwen1.5-0.5B.json similarity index 100% rename from data/Qwen_Qwen1.5-0.5B.json rename to data/models/Qwen_Qwen1.5-0.5B.json diff --git a/data/Qwen_Qwen1.5-1.8B-Chat.json b/data/models/Qwen_Qwen1.5-1.8B-Chat.json similarity index 100% rename from data/Qwen_Qwen1.5-1.8B-Chat.json rename to data/models/Qwen_Qwen1.5-1.8B-Chat.json diff --git a/data/Qwen_Qwen1.5-1.8B.json b/data/models/Qwen_Qwen1.5-1.8B.json similarity index 100% rename from data/Qwen_Qwen1.5-1.8B.json rename to data/models/Qwen_Qwen1.5-1.8B.json diff --git a/data/models/Qwen_Qwen1.5-110B-Chat.json b/data/models/Qwen_Qwen1.5-110B-Chat.json new file mode 100644 index 0000000000000000000000000000000000000000..e05f5e26382a2a530d3ef3f1c4954ebd4070fed8 --- /dev/null +++ b/data/models/Qwen_Qwen1.5-110B-Chat.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "Qwen1.5-110B-Chat", + "id": "Qwen/Qwen1.5-110B-Chat", + "developer": "Qwen", + "inference_platform": "unknown", + "additional_details": { + "precision": "bfloat16", + "architecture": "Qwen2ForCausalLM", + "params_billions": "111.21" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-110B-Chat/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5939 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6184 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2341 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3414 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4522 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4825 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/Qwen_Qwen1.5-110B.json b/data/models/Qwen_Qwen1.5-110B.json similarity index 100% rename from data/Qwen_Qwen1.5-110B.json rename to data/models/Qwen_Qwen1.5-110B.json diff --git a/data/Qwen_Qwen1.5-14B-Chat.json b/data/models/Qwen_Qwen1.5-14B-Chat.json similarity index 100% rename from data/Qwen_Qwen1.5-14B-Chat.json rename to data/models/Qwen_Qwen1.5-14B-Chat.json diff --git a/data/models/Qwen_Qwen1.5-14B.json b/data/models/Qwen_Qwen1.5-14B.json new file mode 100644 index 0000000000000000000000000000000000000000..58869e644ae1e53ef1a435db9789c63aa24ed826 --- /dev/null +++ b/data/models/Qwen_Qwen1.5-14B.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "Qwen1.5-14B", + "id": "Qwen/Qwen1.5-14B", + "developer": "Qwen", + "inference_platform": "unknown", + "additional_details": { + "precision": "bfloat16", + "architecture": "Qwen2ForCausalLM", + "params_billions": "14.167" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-14B/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2905 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.508 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2024 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2945 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4186 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3644 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/Qwen_Qwen1.5-32B-Chat.json b/data/models/Qwen_Qwen1.5-32B-Chat.json similarity index 100% rename from data/Qwen_Qwen1.5-32B-Chat.json rename to data/models/Qwen_Qwen1.5-32B-Chat.json diff --git a/data/models/Qwen_Qwen1.5-32B.json b/data/models/Qwen_Qwen1.5-32B.json new file mode 100644 index 0000000000000000000000000000000000000000..3ac7aafaf7a00a4a638f0ad334e2dcb6423b7625 --- /dev/null +++ b/data/models/Qwen_Qwen1.5-32B.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "Qwen1.5-32B", + "id": "Qwen/Qwen1.5-32B", + "developer": "Qwen", + "inference_platform": "unknown", + "additional_details": { + "precision": "bfloat16", + "architecture": "Qwen2ForCausalLM", + "params_billions": "32.512" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-32B/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3297 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5715 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3029 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3297 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4278 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.45 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/Qwen_Qwen1.5-4B-Chat.json b/data/models/Qwen_Qwen1.5-4B-Chat.json similarity index 100% rename from data/Qwen_Qwen1.5-4B-Chat.json rename to data/models/Qwen_Qwen1.5-4B-Chat.json diff --git a/data/Qwen_Qwen1.5-4B.json b/data/models/Qwen_Qwen1.5-4B.json similarity index 100% rename from data/Qwen_Qwen1.5-4B.json rename to data/models/Qwen_Qwen1.5-4B.json diff --git a/data/Qwen_Qwen1.5-72B-Chat.json b/data/models/Qwen_Qwen1.5-72B-Chat.json similarity index 100% rename from data/Qwen_Qwen1.5-72B-Chat.json rename to data/models/Qwen_Qwen1.5-72B-Chat.json diff --git a/data/Qwen_Qwen1.5-7B-Chat.json b/data/models/Qwen_Qwen1.5-7B-Chat.json similarity index 100% rename from data/Qwen_Qwen1.5-7B-Chat.json rename to data/models/Qwen_Qwen1.5-7B-Chat.json diff --git a/data/models/Qwen_Qwen1.5-7B.json b/data/models/Qwen_Qwen1.5-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..6a387ecf5261bc09fb8b0e56d542f0bef1a971b6 --- /dev/null +++ b/data/models/Qwen_Qwen1.5-7B.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "Qwen1.5-7B", + "id": "Qwen/Qwen1.5-7B", + "developer": "Qwen", + "inference_platform": "unknown", + "additional_details": { + "precision": "bfloat16", + "architecture": "Qwen2ForCausalLM", + "params_billions": "7.721" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-7B/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2684 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.456 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.0929 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2987 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4103 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2916 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/Qwen_Qwen1.5-MoE-A2.7B-Chat.json b/data/models/Qwen_Qwen1.5-MoE-A2.7B-Chat.json similarity index 100% rename from data/Qwen_Qwen1.5-MoE-A2.7B-Chat.json rename to data/models/Qwen_Qwen1.5-MoE-A2.7B-Chat.json diff --git a/data/Qwen_Qwen1.5-MoE-A2.7B.json b/data/models/Qwen_Qwen1.5-MoE-A2.7B.json similarity index 100% rename from data/Qwen_Qwen1.5-MoE-A2.7B.json rename to data/models/Qwen_Qwen1.5-MoE-A2.7B.json diff --git a/data/Qwen_Qwen2-0.5B-Instruct.json b/data/models/Qwen_Qwen2-0.5B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2-0.5B-Instruct.json rename to data/models/Qwen_Qwen2-0.5B-Instruct.json diff --git a/data/Qwen_Qwen2-0.5B.json b/data/models/Qwen_Qwen2-0.5B.json similarity index 100% rename from data/Qwen_Qwen2-0.5B.json rename to data/models/Qwen_Qwen2-0.5B.json diff --git a/data/Qwen_Qwen2-1.5B-Instruct.json b/data/models/Qwen_Qwen2-1.5B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2-1.5B-Instruct.json rename to data/models/Qwen_Qwen2-1.5B-Instruct.json diff --git a/data/Qwen_Qwen2-1.5B.json b/data/models/Qwen_Qwen2-1.5B.json similarity index 100% rename from data/Qwen_Qwen2-1.5B.json rename to data/models/Qwen_Qwen2-1.5B.json diff --git a/data/Qwen_Qwen2-57B-A14B-Instruct.json b/data/models/Qwen_Qwen2-57B-A14B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2-57B-A14B-Instruct.json rename to data/models/Qwen_Qwen2-57B-A14B-Instruct.json diff --git a/data/Qwen_Qwen2-57B-A14B.json b/data/models/Qwen_Qwen2-57B-A14B.json similarity index 100% rename from data/Qwen_Qwen2-57B-A14B.json rename to data/models/Qwen_Qwen2-57B-A14B.json diff --git a/data/models/Qwen_Qwen2-72B-Instruct.json b/data/models/Qwen_Qwen2-72B-Instruct.json new file mode 100644 index 0000000000000000000000000000000000000000..59cbdde5c09a70d0d527e9dfa773a4a99ebe3801 --- /dev/null +++ b/data/models/Qwen_Qwen2-72B-Instruct.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "Qwen2-72B-Instruct", + "id": "Qwen/Qwen2-72B-Instruct", + "developer": "Qwen", + "inference_platform": "unknown", + "additional_details": { + "precision": "bfloat16", + "architecture": "Qwen2ForCausalLM", + "params_billions": "72.706" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-72B-Instruct/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7989 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6977 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4177 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3725 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.456 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5403 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/Qwen_Qwen2-72B.json b/data/models/Qwen_Qwen2-72B.json similarity index 100% rename from data/Qwen_Qwen2-72B.json rename to data/models/Qwen_Qwen2-72B.json diff --git a/data/Qwen_Qwen2-7B-Instruct.json b/data/models/Qwen_Qwen2-7B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2-7B-Instruct.json rename to data/models/Qwen_Qwen2-7B-Instruct.json diff --git a/data/Qwen_Qwen2-7B.json b/data/models/Qwen_Qwen2-7B.json similarity index 100% rename from data/Qwen_Qwen2-7B.json rename to data/models/Qwen_Qwen2-7B.json diff --git a/data/Qwen_Qwen2-Math-72B-Instruct.json b/data/models/Qwen_Qwen2-Math-72B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2-Math-72B-Instruct.json rename to data/models/Qwen_Qwen2-Math-72B-Instruct.json diff --git a/data/Qwen_Qwen2-Math-7B.json b/data/models/Qwen_Qwen2-Math-7B.json similarity index 100% rename from data/Qwen_Qwen2-Math-7B.json rename to data/models/Qwen_Qwen2-Math-7B.json diff --git a/data/Qwen_Qwen2-VL-72B-Instruct.json b/data/models/Qwen_Qwen2-VL-72B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2-VL-72B-Instruct.json rename to data/models/Qwen_Qwen2-VL-72B-Instruct.json diff --git a/data/Qwen_Qwen2-VL-7B-Instruct.json b/data/models/Qwen_Qwen2-VL-7B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2-VL-7B-Instruct.json rename to data/models/Qwen_Qwen2-VL-7B-Instruct.json diff --git a/data/Qwen_Qwen2.5-0.5B-Instruct.json b/data/models/Qwen_Qwen2.5-0.5B-Instruct.json similarity index 99% rename from data/Qwen_Qwen2.5-0.5B-Instruct.json rename to data/models/Qwen_Qwen2.5-0.5B-Instruct.json index ff626bd43f5a0267c85cd8e28dde66f183db6457..3d0aedd0be81cb65537fa8994b8b471d9f8ba0cf 100644 --- a/data/Qwen_Qwen2.5-0.5B-Instruct.json +++ b/data/models/Qwen_Qwen2.5-0.5B-Instruct.json @@ -5,9 +5,9 @@ "developer": "Qwen", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "Qwen2ForCausalLM", - "params_billions": "0.494" + "params_billions": "0.5" } }, "evaluations": [ @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3071 + "score": 0.3153 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3341 + "score": 0.3322 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.1035 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2576 + "score": 0.2592 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3329 + "score": 0.3342 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1697 + "score": 0.172 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3153 + "score": 0.3071 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3322 + "score": 0.3341 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1035 + "score": 0.0 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2592 + "score": 0.2576 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3342 + "score": 0.3329 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.172 + "score": 0.1697 } } ], diff --git a/data/Qwen_Qwen2.5-0.5B.json b/data/models/Qwen_Qwen2.5-0.5B.json similarity index 100% rename from data/Qwen_Qwen2.5-0.5B.json rename to data/models/Qwen_Qwen2.5-0.5B.json diff --git a/data/Qwen_Qwen2.5-1.5B-Instruct.json b/data/models/Qwen_Qwen2.5-1.5B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2.5-1.5B-Instruct.json rename to data/models/Qwen_Qwen2.5-1.5B-Instruct.json diff --git a/data/Qwen_Qwen2.5-1.5B.json b/data/models/Qwen_Qwen2.5-1.5B.json similarity index 100% rename from data/Qwen_Qwen2.5-1.5B.json rename to data/models/Qwen_Qwen2.5-1.5B.json diff --git a/data/Qwen_Qwen2.5-14B-Instruct-1M.json b/data/models/Qwen_Qwen2.5-14B-Instruct-1M.json similarity index 100% rename from data/Qwen_Qwen2.5-14B-Instruct-1M.json rename to data/models/Qwen_Qwen2.5-14B-Instruct-1M.json diff --git a/data/Qwen_Qwen2.5-14B-Instruct.json b/data/models/Qwen_Qwen2.5-14B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2.5-14B-Instruct.json rename to data/models/Qwen_Qwen2.5-14B-Instruct.json diff --git a/data/Qwen_Qwen2.5-14B.json b/data/models/Qwen_Qwen2.5-14B.json similarity index 100% rename from data/Qwen_Qwen2.5-14B.json rename to data/models/Qwen_Qwen2.5-14B.json diff --git a/data/Qwen_Qwen2.5-32B-Instruct.json b/data/models/Qwen_Qwen2.5-32B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2.5-32B-Instruct.json rename to data/models/Qwen_Qwen2.5-32B-Instruct.json diff --git a/data/Qwen_Qwen2.5-32B.json b/data/models/Qwen_Qwen2.5-32B.json similarity index 100% rename from data/Qwen_Qwen2.5-32B.json rename to data/models/Qwen_Qwen2.5-32B.json diff --git a/data/Qwen_Qwen2.5-3B-Instruct.json b/data/models/Qwen_Qwen2.5-3B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2.5-3B-Instruct.json rename to data/models/Qwen_Qwen2.5-3B-Instruct.json diff --git a/data/Qwen_Qwen2.5-3B.json b/data/models/Qwen_Qwen2.5-3B.json similarity index 100% rename from data/Qwen_Qwen2.5-3B.json rename to data/models/Qwen_Qwen2.5-3B.json diff --git a/data/Qwen_Qwen2.5-72B-Instruct.json b/data/models/Qwen_Qwen2.5-72B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2.5-72B-Instruct.json rename to data/models/Qwen_Qwen2.5-72B-Instruct.json diff --git a/data/Qwen_Qwen2.5-72B.json b/data/models/Qwen_Qwen2.5-72B.json similarity index 100% rename from data/Qwen_Qwen2.5-72B.json rename to data/models/Qwen_Qwen2.5-72B.json diff --git a/data/Qwen_Qwen2.5-7B-Instruct-1M.json b/data/models/Qwen_Qwen2.5-7B-Instruct-1M.json similarity index 100% rename from data/Qwen_Qwen2.5-7B-Instruct-1M.json rename to data/models/Qwen_Qwen2.5-7B-Instruct-1M.json diff --git a/data/Qwen_Qwen2.5-7B-Instruct.json b/data/models/Qwen_Qwen2.5-7B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2.5-7B-Instruct.json rename to data/models/Qwen_Qwen2.5-7B-Instruct.json diff --git a/data/Qwen_Qwen2.5-7B.json b/data/models/Qwen_Qwen2.5-7B.json similarity index 100% rename from data/Qwen_Qwen2.5-7B.json rename to data/models/Qwen_Qwen2.5-7B.json diff --git a/data/Qwen_Qwen2.5-Coder-14B-Instruct.json b/data/models/Qwen_Qwen2.5-Coder-14B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2.5-Coder-14B-Instruct.json rename to data/models/Qwen_Qwen2.5-Coder-14B-Instruct.json diff --git a/data/Qwen_Qwen2.5-Coder-14B.json b/data/models/Qwen_Qwen2.5-Coder-14B.json similarity index 100% rename from data/Qwen_Qwen2.5-Coder-14B.json rename to data/models/Qwen_Qwen2.5-Coder-14B.json diff --git a/data/Qwen_Qwen2.5-Coder-32B-Instruct.json b/data/models/Qwen_Qwen2.5-Coder-32B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2.5-Coder-32B-Instruct.json rename to data/models/Qwen_Qwen2.5-Coder-32B-Instruct.json diff --git a/data/Qwen_Qwen2.5-Coder-32B.json b/data/models/Qwen_Qwen2.5-Coder-32B.json similarity index 100% rename from data/Qwen_Qwen2.5-Coder-32B.json rename to data/models/Qwen_Qwen2.5-Coder-32B.json diff --git a/data/Qwen_Qwen2.5-Coder-7B-Instruct.json b/data/models/Qwen_Qwen2.5-Coder-7B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2.5-Coder-7B-Instruct.json rename to data/models/Qwen_Qwen2.5-Coder-7B-Instruct.json diff --git a/data/Qwen_Qwen2.5-Coder-7B.json b/data/models/Qwen_Qwen2.5-Coder-7B.json similarity index 100% rename from data/Qwen_Qwen2.5-Coder-7B.json rename to data/models/Qwen_Qwen2.5-Coder-7B.json diff --git a/data/Qwen_Qwen2.5-Math-1.5B-Instruct.json b/data/models/Qwen_Qwen2.5-Math-1.5B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2.5-Math-1.5B-Instruct.json rename to data/models/Qwen_Qwen2.5-Math-1.5B-Instruct.json diff --git a/data/Qwen_Qwen2.5-Math-72B-Instruct.json b/data/models/Qwen_Qwen2.5-Math-72B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2.5-Math-72B-Instruct.json rename to data/models/Qwen_Qwen2.5-Math-72B-Instruct.json diff --git a/data/Qwen_Qwen2.5-Math-7B-Instruct.json b/data/models/Qwen_Qwen2.5-Math-7B-Instruct.json similarity index 100% rename from data/Qwen_Qwen2.5-Math-7B-Instruct.json rename to data/models/Qwen_Qwen2.5-Math-7B-Instruct.json diff --git a/data/Qwen_Qwen2.5-Math-7B.json b/data/models/Qwen_Qwen2.5-Math-7B.json similarity index 100% rename from data/Qwen_Qwen2.5-Math-7B.json rename to data/models/Qwen_Qwen2.5-Math-7B.json diff --git a/data/Qwen_WorldPM-72B.json b/data/models/Qwen_WorldPM-72B.json similarity index 100% rename from data/Qwen_WorldPM-72B.json rename to data/models/Qwen_WorldPM-72B.json diff --git a/data/R-I-S-E_RISE-Judge-Qwen2.5-32B.json b/data/models/R-I-S-E_RISE-Judge-Qwen2.5-32B.json similarity index 100% rename from data/R-I-S-E_RISE-Judge-Qwen2.5-32B.json rename to data/models/R-I-S-E_RISE-Judge-Qwen2.5-32B.json diff --git a/data/R-I-S-E_RISE-Judge-Qwen2.5-7B.json b/data/models/R-I-S-E_RISE-Judge-Qwen2.5-7B.json similarity index 100% rename from data/R-I-S-E_RISE-Judge-Qwen2.5-7B.json rename to data/models/R-I-S-E_RISE-Judge-Qwen2.5-7B.json diff --git a/data/RDson_WomboCombo-R1-Coder-14B-Preview.json b/data/models/RDson_WomboCombo-R1-Coder-14B-Preview.json similarity index 100% rename from data/RDson_WomboCombo-R1-Coder-14B-Preview.json rename to data/models/RDson_WomboCombo-R1-Coder-14B-Preview.json diff --git a/data/RESMPDEV_EVA-Qwen2.5-1.5B-FRFR.json b/data/models/RESMPDEV_EVA-Qwen2.5-1.5B-FRFR.json similarity index 100% rename from data/RESMPDEV_EVA-Qwen2.5-1.5B-FRFR.json rename to data/models/RESMPDEV_EVA-Qwen2.5-1.5B-FRFR.json diff --git a/data/RESMPDEV_Qwen2-Wukong-0.5B.json b/data/models/RESMPDEV_Qwen2-Wukong-0.5B.json similarity index 100% rename from data/RESMPDEV_Qwen2-Wukong-0.5B.json rename to data/models/RESMPDEV_Qwen2-Wukong-0.5B.json diff --git a/data/RLHFlow_ArmoRM-Llama3-8B-v0.1.json b/data/models/RLHFlow_ArmoRM-Llama3-8B-v0.1.json similarity index 100% rename from data/RLHFlow_ArmoRM-Llama3-8B-v0.1.json rename to data/models/RLHFlow_ArmoRM-Llama3-8B-v0.1.json diff --git a/data/RLHFlow_LLaMA3-iterative-DPO-final.json b/data/models/RLHFlow_LLaMA3-iterative-DPO-final.json similarity index 100% rename from data/RLHFlow_LLaMA3-iterative-DPO-final.json rename to data/models/RLHFlow_LLaMA3-iterative-DPO-final.json diff --git a/data/RLHFlow_RewardModel-Mistral-7B-for-DPA-v1.json b/data/models/RLHFlow_RewardModel-Mistral-7B-for-DPA-v1.json similarity index 100% rename from data/RLHFlow_RewardModel-Mistral-7B-for-DPA-v1.json rename to data/models/RLHFlow_RewardModel-Mistral-7B-for-DPA-v1.json diff --git a/data/RLHFlow_pair-preference-model-LLaMA3-8B.json b/data/models/RLHFlow_pair-preference-model-LLaMA3-8B.json similarity index 100% rename from data/RLHFlow_pair-preference-model-LLaMA3-8B.json rename to data/models/RLHFlow_pair-preference-model-LLaMA3-8B.json diff --git a/data/RWKV_rwkv-raven-14b.json b/data/models/RWKV_rwkv-raven-14b.json similarity index 100% rename from data/RWKV_rwkv-raven-14b.json rename to data/models/RWKV_rwkv-raven-14b.json diff --git a/data/Rakuten_RakutenAI-2.0-mini-instruct.json b/data/models/Rakuten_RakutenAI-2.0-mini-instruct.json similarity index 100% rename from data/Rakuten_RakutenAI-2.0-mini-instruct.json rename to data/models/Rakuten_RakutenAI-2.0-mini-instruct.json diff --git a/data/Rakuten_RakutenAI-7B-chat.json b/data/models/Rakuten_RakutenAI-7B-chat.json similarity index 100% rename from data/Rakuten_RakutenAI-7B-chat.json rename to data/models/Rakuten_RakutenAI-7B-chat.json diff --git a/data/Rakuten_RakutenAI-7B.json b/data/models/Rakuten_RakutenAI-7B.json similarity index 100% rename from data/Rakuten_RakutenAI-7B.json rename to data/models/Rakuten_RakutenAI-7B.json diff --git a/data/Ray2333_GRM-Gemma-2B-rewardmodel-ft.json b/data/models/Ray2333_GRM-Gemma-2B-rewardmodel-ft.json similarity index 100% rename from data/Ray2333_GRM-Gemma-2B-rewardmodel-ft.json rename to data/models/Ray2333_GRM-Gemma-2B-rewardmodel-ft.json diff --git a/data/Ray2333_GRM-Gemma-2B-sftreg.json b/data/models/Ray2333_GRM-Gemma-2B-sftreg.json similarity index 100% rename from data/Ray2333_GRM-Gemma-2B-sftreg.json rename to data/models/Ray2333_GRM-Gemma-2B-sftreg.json diff --git a/data/Ray2333_GRM-Llama3-8B-rewardmodel-ft.json b/data/models/Ray2333_GRM-Llama3-8B-rewardmodel-ft.json similarity index 100% rename from data/Ray2333_GRM-Llama3-8B-rewardmodel-ft.json rename to data/models/Ray2333_GRM-Llama3-8B-rewardmodel-ft.json diff --git a/data/Ray2333_GRM-gemma2-2B-rewardmodel-ft.json b/data/models/Ray2333_GRM-gemma2-2B-rewardmodel-ft.json similarity index 100% rename from data/Ray2333_GRM-gemma2-2B-rewardmodel-ft.json rename to data/models/Ray2333_GRM-gemma2-2B-rewardmodel-ft.json index cd8835eaf989dbd622c51262fef9b72b10534d67..24dd55ac376e51965493e81885df295559db7184 100644 --- a/data/Ray2333_GRM-gemma2-2B-rewardmodel-ft.json +++ b/data/models/Ray2333_GRM-gemma2-2B-rewardmodel-ft.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816", + "evaluation_id": "reward-bench-2/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,128 +31,104 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8839 + "score": 0.5966 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9302 + "score": 0.5305 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7719 + "score": 0.3125 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9216 + "score": 0.5902 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.912 + "score": 0.9222 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench-2/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5966 + "score": 0.7455 }, "source_data": { "dataset_name": "RewardBench 2", @@ -161,111 +137,135 @@ } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5305 + "score": 0.4788 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Precise IF", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3125 + "score": 0.8839 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5902 + "score": 0.9302 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9222 + "score": 0.7719 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7455 + "score": 0.9216 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4788 + "score": 0.912 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/Ray2333_GRM-llama3-8B-distill.json b/data/models/Ray2333_GRM-llama3-8B-distill.json similarity index 100% rename from data/Ray2333_GRM-llama3-8B-distill.json rename to data/models/Ray2333_GRM-llama3-8B-distill.json index 28bee941f1d39ce464ca73210ca8cbef67a5d082..a698e2dc0120e472c95dbd375c2ce72c243acf20 100644 --- a/data/Ray2333_GRM-llama3-8B-distill.json +++ b/data/models/Ray2333_GRM-llama3-8B-distill.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/Ray2333_GRM-llama3-8B-distill/1766412838.146816", + "evaluation_id": "reward-bench/Ray2333_GRM-llama3-8B-distill/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,127 +31,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5874 + "score": 0.8464 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3875 + "score": 0.9832 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5902 + "score": 0.6842 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7222 + "score": 0.8676 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6727 + "score": 0.9133 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5743 + "score": 0.7209 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], @@ -159,10 +141,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench/Ray2333_GRM-llama3-8B-distill/1766412838.146816", + "evaluation_id": "reward-bench-2/Ray2333_GRM-llama3-8B-distill/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -181,109 +163,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8464 + "score": 0.589 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9832 + "score": 0.5874 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6842 + "score": 0.3875 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5902 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8676 + "score": 0.7222 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9133 + "score": 0.6727 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7209 + "score": 0.5743 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/Ray2333_GRM-llama3-8B-sftreg.json b/data/models/Ray2333_GRM-llama3-8B-sftreg.json similarity index 100% rename from data/Ray2333_GRM-llama3-8B-sftreg.json rename to data/models/Ray2333_GRM-llama3-8B-sftreg.json index 15f35c842dfd5169bfd85bae53f76a90d5850421..bd70639489991e3e8880cad041284119bd35ecc9 100644 --- a/data/Ray2333_GRM-llama3-8B-sftreg.json +++ b/data/models/Ray2333_GRM-llama3-8B-sftreg.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816", + "evaluation_id": "reward-bench/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,127 +31,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6089 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6189 + "score": 0.8542 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3875 + "score": 0.986 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5792 + "score": 0.6776 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7867 + "score": 0.8919 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6828 + "score": 0.9229 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5981 + "score": 0.7309 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], @@ -159,10 +141,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816", + "evaluation_id": "reward-bench-2/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -181,109 +163,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8542 + "score": 0.6089 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.986 + "score": 0.6189 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6776 + "score": 0.3875 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5792 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8919 + "score": 0.7867 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9229 + "score": 0.6828 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7309 + "score": 0.5981 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/Ray2333_GRM-llama3.2-3B-rewardmodel-ft.json b/data/models/Ray2333_GRM-llama3.2-3B-rewardmodel-ft.json similarity index 100% rename from data/Ray2333_GRM-llama3.2-3B-rewardmodel-ft.json rename to data/models/Ray2333_GRM-llama3.2-3B-rewardmodel-ft.json diff --git a/data/Ray2333_Gemma-2B-rewardmodel-baseline.json b/data/models/Ray2333_Gemma-2B-rewardmodel-baseline.json similarity index 100% rename from data/Ray2333_Gemma-2B-rewardmodel-baseline.json rename to data/models/Ray2333_Gemma-2B-rewardmodel-baseline.json diff --git a/data/Ray2333_Gemma-2B-rewardmodel-ft.json b/data/models/Ray2333_Gemma-2B-rewardmodel-ft.json similarity index 100% rename from data/Ray2333_Gemma-2B-rewardmodel-ft.json rename to data/models/Ray2333_Gemma-2B-rewardmodel-ft.json diff --git a/data/Ray2333_reward-model-Mistral-7B-instruct-Unifie....json b/data/models/Ray2333_reward-model-Mistral-7B-instruct-Unifie....json similarity index 100% rename from data/Ray2333_reward-model-Mistral-7B-instruct-Unifie....json rename to data/models/Ray2333_reward-model-Mistral-7B-instruct-Unifie....json diff --git a/data/Replete-AI_L3-Pneuma-8B.json b/data/models/Replete-AI_L3-Pneuma-8B.json similarity index 100% rename from data/Replete-AI_L3-Pneuma-8B.json rename to data/models/Replete-AI_L3-Pneuma-8B.json diff --git a/data/Replete-AI_L3.1-Pneuma-8B.json b/data/models/Replete-AI_L3.1-Pneuma-8B.json similarity index 100% rename from data/Replete-AI_L3.1-Pneuma-8B.json rename to data/models/Replete-AI_L3.1-Pneuma-8B.json diff --git a/data/Replete-AI_Llama3-8B-Instruct-Replete-Adapted.json b/data/models/Replete-AI_Llama3-8B-Instruct-Replete-Adapted.json similarity index 100% rename from data/Replete-AI_Llama3-8B-Instruct-Replete-Adapted.json rename to data/models/Replete-AI_Llama3-8B-Instruct-Replete-Adapted.json diff --git a/data/Replete-AI_Replete-Coder-Instruct-8b-Merged.json b/data/models/Replete-AI_Replete-Coder-Instruct-8b-Merged.json similarity index 100% rename from data/Replete-AI_Replete-Coder-Instruct-8b-Merged.json rename to data/models/Replete-AI_Replete-Coder-Instruct-8b-Merged.json diff --git a/data/Replete-AI_Replete-Coder-Llama3-8B.json b/data/models/Replete-AI_Replete-Coder-Llama3-8B.json similarity index 100% rename from data/Replete-AI_Replete-Coder-Llama3-8B.json rename to data/models/Replete-AI_Replete-Coder-Llama3-8B.json diff --git a/data/Replete-AI_Replete-Coder-Qwen2-1.5b.json b/data/models/Replete-AI_Replete-Coder-Qwen2-1.5b.json similarity index 100% rename from data/Replete-AI_Replete-Coder-Qwen2-1.5b.json rename to data/models/Replete-AI_Replete-Coder-Qwen2-1.5b.json diff --git a/data/Replete-AI_Replete-LLM-Qwen2-7b.json b/data/models/Replete-AI_Replete-LLM-Qwen2-7b.json similarity index 100% rename from data/Replete-AI_Replete-LLM-Qwen2-7b.json rename to data/models/Replete-AI_Replete-LLM-Qwen2-7b.json diff --git a/data/Replete-AI_Replete-LLM-Qwen2-7b_Beta-Preview.json b/data/models/Replete-AI_Replete-LLM-Qwen2-7b_Beta-Preview.json similarity index 100% rename from data/Replete-AI_Replete-LLM-Qwen2-7b_Beta-Preview.json rename to data/models/Replete-AI_Replete-LLM-Qwen2-7b_Beta-Preview.json diff --git a/data/Replete-AI_Replete-LLM-V2-Llama-3.1-8b.json b/data/models/Replete-AI_Replete-LLM-V2-Llama-3.1-8b.json similarity index 100% rename from data/Replete-AI_Replete-LLM-V2-Llama-3.1-8b.json rename to data/models/Replete-AI_Replete-LLM-V2-Llama-3.1-8b.json diff --git a/data/RezVortex_JAJUKA-WEWILLNEVERFORGETYOU-3B.json b/data/models/RezVortex_JAJUKA-WEWILLNEVERFORGETYOU-3B.json similarity index 100% rename from data/RezVortex_JAJUKA-WEWILLNEVERFORGETYOU-3B.json rename to data/models/RezVortex_JAJUKA-WEWILLNEVERFORGETYOU-3B.json diff --git a/data/RezVortex_Jajuka-3b.json b/data/models/RezVortex_Jajuka-3b.json similarity index 100% rename from data/RezVortex_Jajuka-3b.json rename to data/models/RezVortex_Jajuka-3b.json diff --git a/data/Ro-xe_FMixIA-7B-DARE-0.json b/data/models/Ro-xe_FMixIA-7B-DARE-0.json similarity index 100% rename from data/Ro-xe_FMixIA-7B-DARE-0.json rename to data/models/Ro-xe_FMixIA-7B-DARE-0.json diff --git a/data/Ro-xe_FMixIA-7B-SLERP-27.json b/data/models/Ro-xe_FMixIA-7B-SLERP-27.json similarity index 100% rename from data/Ro-xe_FMixIA-7B-SLERP-27.json rename to data/models/Ro-xe_FMixIA-7B-SLERP-27.json diff --git a/data/Ro-xe_FMixIA-7B-TIES-1.json b/data/models/Ro-xe_FMixIA-7B-TIES-1.json similarity index 100% rename from data/Ro-xe_FMixIA-7B-TIES-1.json rename to data/models/Ro-xe_FMixIA-7B-TIES-1.json diff --git a/data/Ro-xe_FMixIA-FrankenMerge-9.5B-PT-9.json b/data/models/Ro-xe_FMixIA-FrankenMerge-9.5B-PT-9.json similarity index 100% rename from data/Ro-xe_FMixIA-FrankenMerge-9.5B-PT-9.json rename to data/models/Ro-xe_FMixIA-FrankenMerge-9.5B-PT-9.json diff --git a/data/Rombo-Org_Rombo-LLM-V2.5-Qwen-7b.json b/data/models/Rombo-Org_Rombo-LLM-V2.5-Qwen-7b.json similarity index 100% rename from data/Rombo-Org_Rombo-LLM-V2.5-Qwen-7b.json rename to data/models/Rombo-Org_Rombo-LLM-V2.5-Qwen-7b.json diff --git a/data/RubielLabarta_LogoS-7Bx2-MoE-13B-v0.2.json b/data/models/RubielLabarta_LogoS-7Bx2-MoE-13B-v0.2.json similarity index 100% rename from data/RubielLabarta_LogoS-7Bx2-MoE-13B-v0.2.json rename to data/models/RubielLabarta_LogoS-7Bx2-MoE-13B-v0.2.json diff --git a/data/SF-Foundation_TextEval-Llama3.1-70B.json b/data/models/SF-Foundation_TextEval-Llama3.1-70B.json similarity index 100% rename from data/SF-Foundation_TextEval-Llama3.1-70B.json rename to data/models/SF-Foundation_TextEval-Llama3.1-70B.json diff --git a/data/SF-Foundation_TextEval-OffsetBias-12B.json b/data/models/SF-Foundation_TextEval-OffsetBias-12B.json similarity index 100% rename from data/SF-Foundation_TextEval-OffsetBias-12B.json rename to data/models/SF-Foundation_TextEval-OffsetBias-12B.json diff --git a/data/SaisExperiments_Evil-Alpaca-3B-L3.2.json b/data/models/SaisExperiments_Evil-Alpaca-3B-L3.2.json similarity index 100% rename from data/SaisExperiments_Evil-Alpaca-3B-L3.2.json rename to data/models/SaisExperiments_Evil-Alpaca-3B-L3.2.json diff --git a/data/SaisExperiments_Gemma-2-2B-Opus-Instruct.json b/data/models/SaisExperiments_Gemma-2-2B-Opus-Instruct.json similarity index 100% rename from data/SaisExperiments_Gemma-2-2B-Opus-Instruct.json rename to data/models/SaisExperiments_Gemma-2-2B-Opus-Instruct.json diff --git a/data/SaisExperiments_Gemma-2-2B-Stheno-Filtered.json b/data/models/SaisExperiments_Gemma-2-2B-Stheno-Filtered.json similarity index 100% rename from data/SaisExperiments_Gemma-2-2B-Stheno-Filtered.json rename to data/models/SaisExperiments_Gemma-2-2B-Stheno-Filtered.json diff --git a/data/SaisExperiments_Not-So-Small-Alpaca-24B.json b/data/models/SaisExperiments_Not-So-Small-Alpaca-24B.json similarity index 100% rename from data/SaisExperiments_Not-So-Small-Alpaca-24B.json rename to data/models/SaisExperiments_Not-So-Small-Alpaca-24B.json diff --git a/data/SaisExperiments_QwOwO-7B-V1.json b/data/models/SaisExperiments_QwOwO-7B-V1.json similarity index 100% rename from data/SaisExperiments_QwOwO-7B-V1.json rename to data/models/SaisExperiments_QwOwO-7B-V1.json diff --git a/data/SaisExperiments_RightSheep-Llama3.2-3B.json b/data/models/SaisExperiments_RightSheep-Llama3.2-3B.json similarity index 100% rename from data/SaisExperiments_RightSheep-Llama3.2-3B.json rename to data/models/SaisExperiments_RightSheep-Llama3.2-3B.json diff --git a/data/Sakalti_Anemoi-3B.json b/data/models/Sakalti_Anemoi-3B.json similarity index 100% rename from data/Sakalti_Anemoi-3B.json rename to data/models/Sakalti_Anemoi-3B.json diff --git a/data/Sakalti_Euphrates-14B.json b/data/models/Sakalti_Euphrates-14B.json similarity index 100% rename from data/Sakalti_Euphrates-14B.json rename to data/models/Sakalti_Euphrates-14B.json diff --git a/data/Sakalti_Llama3.2-3B-Uranus-1.json b/data/models/Sakalti_Llama3.2-3B-Uranus-1.json similarity index 100% rename from data/Sakalti_Llama3.2-3B-Uranus-1.json rename to data/models/Sakalti_Llama3.2-3B-Uranus-1.json diff --git a/data/Sakalti_Magro-7B-v1.1.json b/data/models/Sakalti_Magro-7B-v1.1.json similarity index 100% rename from data/Sakalti_Magro-7B-v1.1.json rename to data/models/Sakalti_Magro-7B-v1.1.json diff --git a/data/Sakalti_Neptuno-3B.json b/data/models/Sakalti_Neptuno-3B.json similarity index 100% rename from data/Sakalti_Neptuno-3B.json rename to data/models/Sakalti_Neptuno-3B.json diff --git a/data/Sakalti_Neptuno-Alpha.json b/data/models/Sakalti_Neptuno-Alpha.json similarity index 100% rename from data/Sakalti_Neptuno-Alpha.json rename to data/models/Sakalti_Neptuno-Alpha.json diff --git a/data/Sakalti_Oxyge1-33B.json b/data/models/Sakalti_Oxyge1-33B.json similarity index 100% rename from data/Sakalti_Oxyge1-33B.json rename to data/models/Sakalti_Oxyge1-33B.json diff --git a/data/Sakalti_Phi3.5-Comets-3.8B.json b/data/models/Sakalti_Phi3.5-Comets-3.8B.json similarity index 100% rename from data/Sakalti_Phi3.5-Comets-3.8B.json rename to data/models/Sakalti_Phi3.5-Comets-3.8B.json diff --git a/data/Sakalti_Qwen2.5-1B-Instruct.json b/data/models/Sakalti_Qwen2.5-1B-Instruct.json similarity index 100% rename from data/Sakalti_Qwen2.5-1B-Instruct.json rename to data/models/Sakalti_Qwen2.5-1B-Instruct.json diff --git a/data/Sakalti_QwenTest-7.json b/data/models/Sakalti_QwenTest-7.json similarity index 100% rename from data/Sakalti_QwenTest-7.json rename to data/models/Sakalti_QwenTest-7.json diff --git a/data/Sakalti_SJT-0.5B.json b/data/models/Sakalti_SJT-0.5B.json similarity index 100% rename from data/Sakalti_SJT-0.5B.json rename to data/models/Sakalti_SJT-0.5B.json diff --git a/data/Sakalti_SJT-1.5B-Alpha-1.1.json b/data/models/Sakalti_SJT-1.5B-Alpha-1.1.json similarity index 100% rename from data/Sakalti_SJT-1.5B-Alpha-1.1.json rename to data/models/Sakalti_SJT-1.5B-Alpha-1.1.json diff --git a/data/Sakalti_SJT-1.5B-Alpha.json b/data/models/Sakalti_SJT-1.5B-Alpha.json similarity index 100% rename from data/Sakalti_SJT-1.5B-Alpha.json rename to data/models/Sakalti_SJT-1.5B-Alpha.json diff --git a/data/Sakalti_SJT-1.7B.json b/data/models/Sakalti_SJT-1.7B.json similarity index 100% rename from data/Sakalti_SJT-1.7B.json rename to data/models/Sakalti_SJT-1.7B.json diff --git a/data/Sakalti_SJT-14B.json b/data/models/Sakalti_SJT-14B.json similarity index 100% rename from data/Sakalti_SJT-14B.json rename to data/models/Sakalti_SJT-14B.json diff --git a/data/Sakalti_SJT-2.4B.json b/data/models/Sakalti_SJT-2.4B.json similarity index 100% rename from data/Sakalti_SJT-2.4B.json rename to data/models/Sakalti_SJT-2.4B.json diff --git a/data/Sakalti_SJT-24B-Alpha.json b/data/models/Sakalti_SJT-24B-Alpha.json similarity index 100% rename from data/Sakalti_SJT-24B-Alpha.json rename to data/models/Sakalti_SJT-24B-Alpha.json diff --git a/data/Sakalti_SJT-2B-V1.1.json b/data/models/Sakalti_SJT-2B-V1.1.json similarity index 100% rename from data/Sakalti_SJT-2B-V1.1.json rename to data/models/Sakalti_SJT-2B-V1.1.json diff --git a/data/Sakalti_SJT-2B.json b/data/models/Sakalti_SJT-2B.json similarity index 100% rename from data/Sakalti_SJT-2B.json rename to data/models/Sakalti_SJT-2B.json diff --git a/data/Sakalti_SJT-3.7B.json b/data/models/Sakalti_SJT-3.7B.json similarity index 100% rename from data/Sakalti_SJT-3.7B.json rename to data/models/Sakalti_SJT-3.7B.json diff --git a/data/Sakalti_SJT-4B.json b/data/models/Sakalti_SJT-4B.json similarity index 100% rename from data/Sakalti_SJT-4B.json rename to data/models/Sakalti_SJT-4B.json diff --git a/data/Sakalti_SJT-7.5B.json b/data/models/Sakalti_SJT-7.5B.json similarity index 100% rename from data/Sakalti_SJT-7.5B.json rename to data/models/Sakalti_SJT-7.5B.json diff --git a/data/Sakalti_SJT-7B-V1.1-Multilingal.json b/data/models/Sakalti_SJT-7B-V1.1-Multilingal.json similarity index 100% rename from data/Sakalti_SJT-7B-V1.1-Multilingal.json rename to data/models/Sakalti_SJT-7B-V1.1-Multilingal.json diff --git a/data/Sakalti_SJT-7B-V1.1.json b/data/models/Sakalti_SJT-7B-V1.1.json similarity index 100% rename from data/Sakalti_SJT-7B-V1.1.json rename to data/models/Sakalti_SJT-7B-V1.1.json diff --git a/data/Sakalti_SJT-8B-V1.1.json b/data/models/Sakalti_SJT-8B-V1.1.json similarity index 100% rename from data/Sakalti_SJT-8B-V1.1.json rename to data/models/Sakalti_SJT-8B-V1.1.json diff --git a/data/Sakalti_SJT-8B.json b/data/models/Sakalti_SJT-8B.json similarity index 100% rename from data/Sakalti_SJT-8B.json rename to data/models/Sakalti_SJT-8B.json diff --git a/data/Sakalti_SJT-900M.json b/data/models/Sakalti_SJT-900M.json similarity index 100% rename from data/Sakalti_SJT-900M.json rename to data/models/Sakalti_SJT-900M.json diff --git a/data/Sakalti_SJT-Moe2x7.5B.json b/data/models/Sakalti_SJT-Moe2x7.5B.json similarity index 100% rename from data/Sakalti_SJT-Moe2x7.5B.json rename to data/models/Sakalti_SJT-Moe2x7.5B.json diff --git a/data/Sakalti_SJTPass-2.json b/data/models/Sakalti_SJTPass-2.json similarity index 100% rename from data/Sakalti_SJTPass-2.json rename to data/models/Sakalti_SJTPass-2.json diff --git a/data/Sakalti_SJTPass-4.json b/data/models/Sakalti_SJTPass-4.json similarity index 100% rename from data/Sakalti_SJTPass-4.json rename to data/models/Sakalti_SJTPass-4.json diff --git a/data/Sakalti_SJTPass-5.json b/data/models/Sakalti_SJTPass-5.json similarity index 100% rename from data/Sakalti_SJTPass-5.json rename to data/models/Sakalti_SJTPass-5.json diff --git a/data/Sakalti_Saba-Passthrough-2.json b/data/models/Sakalti_Saba-Passthrough-2.json similarity index 100% rename from data/Sakalti_Saba-Passthrough-2.json rename to data/models/Sakalti_Saba-Passthrough-2.json diff --git a/data/Sakalti_Saba1-1.8B.json b/data/models/Sakalti_Saba1-1.8B.json similarity index 100% rename from data/Sakalti_Saba1-1.8B.json rename to data/models/Sakalti_Saba1-1.8B.json diff --git a/data/Sakalti_Saba1-7B.json b/data/models/Sakalti_Saba1-7B.json similarity index 100% rename from data/Sakalti_Saba1-7B.json rename to data/models/Sakalti_Saba1-7B.json diff --git a/data/Sakalti_Saba1.5-1.5B.json b/data/models/Sakalti_Saba1.5-1.5B.json similarity index 100% rename from data/Sakalti_Saba1.5-1.5B.json rename to data/models/Sakalti_Saba1.5-1.5B.json diff --git a/data/Sakalti_Saba1.5-Pro-3B.json b/data/models/Sakalti_Saba1.5-Pro-3B.json similarity index 100% rename from data/Sakalti_Saba1.5-Pro-3B.json rename to data/models/Sakalti_Saba1.5-Pro-3B.json diff --git a/data/Sakalti_Saba2-14B-Preview.json b/data/models/Sakalti_Saba2-14B-Preview.json similarity index 100% rename from data/Sakalti_Saba2-14B-Preview.json rename to data/models/Sakalti_Saba2-14B-Preview.json diff --git a/data/Sakalti_Saba2-3B.json b/data/models/Sakalti_Saba2-3B.json similarity index 100% rename from data/Sakalti_Saba2-3B.json rename to data/models/Sakalti_Saba2-3B.json diff --git a/data/Sakalti_Sailor-japanese.json b/data/models/Sakalti_Sailor-japanese.json similarity index 100% rename from data/Sakalti_Sailor-japanese.json rename to data/models/Sakalti_Sailor-japanese.json diff --git a/data/Sakalti_Saka-1.5B.json b/data/models/Sakalti_Saka-1.5B.json similarity index 100% rename from data/Sakalti_Saka-1.5B.json rename to data/models/Sakalti_Saka-1.5B.json diff --git a/data/Sakalti_Saka-14B.json b/data/models/Sakalti_Saka-14B.json similarity index 100% rename from data/Sakalti_Saka-14B.json rename to data/models/Sakalti_Saka-14B.json diff --git a/data/Sakalti_Saka-24B.json b/data/models/Sakalti_Saka-24B.json similarity index 100% rename from data/Sakalti_Saka-24B.json rename to data/models/Sakalti_Saka-24B.json diff --git a/data/Sakalti_Saka-7.2B.json b/data/models/Sakalti_Saka-7.2B.json similarity index 100% rename from data/Sakalti_Saka-7.2B.json rename to data/models/Sakalti_Saka-7.2B.json diff --git a/data/Sakalti_Saka-7.6B.json b/data/models/Sakalti_Saka-7.6B.json similarity index 100% rename from data/Sakalti_Saka-7.6B.json rename to data/models/Sakalti_Saka-7.6B.json diff --git a/data/Sakalti_SakaMoe-3x1.6B-Instruct.json b/data/models/Sakalti_SakaMoe-3x1.6B-Instruct.json similarity index 100% rename from data/Sakalti_SakaMoe-3x1.6B-Instruct.json rename to data/models/Sakalti_SakaMoe-3x1.6B-Instruct.json diff --git a/data/Sakalti_SakalFusion-7B-Alpha.json b/data/models/Sakalti_SakalFusion-7B-Alpha.json similarity index 100% rename from data/Sakalti_SakalFusion-7B-Alpha.json rename to data/models/Sakalti_SakalFusion-7B-Alpha.json diff --git a/data/Sakalti_SakalFusion-7B-Beta.json b/data/models/Sakalti_SakalFusion-7B-Beta.json similarity index 100% rename from data/Sakalti_SakalFusion-7B-Beta.json rename to data/models/Sakalti_SakalFusion-7B-Beta.json diff --git a/data/Sakalti_Tara-3.8B-v1.1.json b/data/models/Sakalti_Tara-3.8B-v1.1.json similarity index 100% rename from data/Sakalti_Tara-3.8B-v1.1.json rename to data/models/Sakalti_Tara-3.8B-v1.1.json diff --git a/data/Sakalti_light-1.1-3B.json b/data/models/Sakalti_light-1.1-3B.json similarity index 100% rename from data/Sakalti_light-1.1-3B.json rename to data/models/Sakalti_light-1.1-3B.json diff --git a/data/Sakalti_light-3B.json b/data/models/Sakalti_light-3B.json similarity index 100% rename from data/Sakalti_light-3B.json rename to data/models/Sakalti_light-3B.json diff --git a/data/Sakalti_light-3b-beta.json b/data/models/Sakalti_light-3b-beta.json similarity index 100% rename from data/Sakalti_light-3b-beta.json rename to data/models/Sakalti_light-3b-beta.json diff --git a/data/Sakalti_light-7b-beta.json b/data/models/Sakalti_light-7b-beta.json similarity index 100% rename from data/Sakalti_light-7b-beta.json rename to data/models/Sakalti_light-7b-beta.json diff --git a/data/Sakalti_llama-3-yanyuedao-8b-instruct.json b/data/models/Sakalti_llama-3-yanyuedao-8b-instruct.json similarity index 100% rename from data/Sakalti_llama-3-yanyuedao-8b-instruct.json rename to data/models/Sakalti_llama-3-yanyuedao-8b-instruct.json diff --git a/data/Sakalti_magro-7B.json b/data/models/Sakalti_magro-7B.json similarity index 100% rename from data/Sakalti_magro-7B.json rename to data/models/Sakalti_magro-7B.json diff --git a/data/Sakalti_mergekit-01.json b/data/models/Sakalti_mergekit-01.json similarity index 100% rename from data/Sakalti_mergekit-01.json rename to data/models/Sakalti_mergekit-01.json diff --git a/data/Sakalti_mergekit-della_linear-vmeykci.json b/data/models/Sakalti_mergekit-della_linear-vmeykci.json similarity index 100% rename from data/Sakalti_mergekit-della_linear-vmeykci.json rename to data/models/Sakalti_mergekit-della_linear-vmeykci.json diff --git a/data/Sakalti_model-3.json b/data/models/Sakalti_model-3.json similarity index 100% rename from data/Sakalti_model-3.json rename to data/models/Sakalti_model-3.json diff --git a/data/Sakalti_qwen2.5-2.3B.json b/data/models/Sakalti_qwen2.5-2.3B.json similarity index 100% rename from data/Sakalti_qwen2.5-2.3B.json rename to data/models/Sakalti_qwen2.5-2.3B.json diff --git a/data/Sakalti_tara-3.8B.json b/data/models/Sakalti_tara-3.8B.json similarity index 100% rename from data/Sakalti_tara-3.8B.json rename to data/models/Sakalti_tara-3.8B.json diff --git a/data/Sakalti_ultiima-14B-v0.2.json b/data/models/Sakalti_ultiima-14B-v0.2.json similarity index 100% rename from data/Sakalti_ultiima-14B-v0.2.json rename to data/models/Sakalti_ultiima-14B-v0.2.json diff --git a/data/Sakalti_ultiima-14B-v0.3.json b/data/models/Sakalti_ultiima-14B-v0.3.json similarity index 100% rename from data/Sakalti_ultiima-14B-v0.3.json rename to data/models/Sakalti_ultiima-14B-v0.3.json diff --git a/data/Sakalti_ultiima-14B-v0.4.json b/data/models/Sakalti_ultiima-14B-v0.4.json similarity index 100% rename from data/Sakalti_ultiima-14B-v0.4.json rename to data/models/Sakalti_ultiima-14B-v0.4.json diff --git a/data/Sakalti_ultiima-14B.json b/data/models/Sakalti_ultiima-14B.json similarity index 100% rename from data/Sakalti_ultiima-14B.json rename to data/models/Sakalti_ultiima-14B.json diff --git a/data/Sakalti_ultiima-32B.json b/data/models/Sakalti_ultiima-32B.json similarity index 100% rename from data/Sakalti_ultiima-32B.json rename to data/models/Sakalti_ultiima-32B.json diff --git a/data/Sakalti_ultiima-72B-v1.5.json b/data/models/Sakalti_ultiima-72B-v1.5.json similarity index 100% rename from data/Sakalti_ultiima-72B-v1.5.json rename to data/models/Sakalti_ultiima-72B-v1.5.json diff --git a/data/Sakalti_ultiima-72B.json b/data/models/Sakalti_ultiima-72B.json similarity index 100% rename from data/Sakalti_ultiima-72B.json rename to data/models/Sakalti_ultiima-72B.json diff --git a/data/Salesforce_LLaMA-3-8B-SFR-Iterative-DPO-R.json b/data/models/Salesforce_LLaMA-3-8B-SFR-Iterative-DPO-R.json similarity index 100% rename from data/Salesforce_LLaMA-3-8B-SFR-Iterative-DPO-R.json rename to data/models/Salesforce_LLaMA-3-8B-SFR-Iterative-DPO-R.json diff --git a/data/Salesforce_SFR-LLaMa-3.1-70B-Judge-r.json b/data/models/Salesforce_SFR-LLaMa-3.1-70B-Judge-r.json similarity index 100% rename from data/Salesforce_SFR-LLaMa-3.1-70B-Judge-r.json rename to data/models/Salesforce_SFR-LLaMa-3.1-70B-Judge-r.json diff --git a/data/Salesforce_SFR-LLaMa-3.1-8B-Judge-r.json b/data/models/Salesforce_SFR-LLaMa-3.1-8B-Judge-r.json similarity index 100% rename from data/Salesforce_SFR-LLaMa-3.1-8B-Judge-r.json rename to data/models/Salesforce_SFR-LLaMa-3.1-8B-Judge-r.json diff --git a/data/Salesforce_SFR-nemo-12B-Judge-r.json b/data/models/Salesforce_SFR-nemo-12B-Judge-r.json similarity index 100% rename from data/Salesforce_SFR-nemo-12B-Judge-r.json rename to data/models/Salesforce_SFR-nemo-12B-Judge-r.json diff --git a/data/SanjiWatsuki_Kunoichi-DPO-v2-7B.json b/data/models/SanjiWatsuki_Kunoichi-DPO-v2-7B.json similarity index 100% rename from data/SanjiWatsuki_Kunoichi-DPO-v2-7B.json rename to data/models/SanjiWatsuki_Kunoichi-DPO-v2-7B.json diff --git a/data/SanjiWatsuki_Silicon-Maid-7B.json b/data/models/SanjiWatsuki_Silicon-Maid-7B.json similarity index 100% rename from data/SanjiWatsuki_Silicon-Maid-7B.json rename to data/models/SanjiWatsuki_Silicon-Maid-7B.json diff --git a/data/Sao10K_70B-L3.3-Cirrus-x1.json b/data/models/Sao10K_70B-L3.3-Cirrus-x1.json similarity index 100% rename from data/Sao10K_70B-L3.3-Cirrus-x1.json rename to data/models/Sao10K_70B-L3.3-Cirrus-x1.json diff --git a/data/Sao10K_Fimbulvetr-11B-v2.json b/data/models/Sao10K_Fimbulvetr-11B-v2.json similarity index 100% rename from data/Sao10K_Fimbulvetr-11B-v2.json rename to data/models/Sao10K_Fimbulvetr-11B-v2.json diff --git a/data/Sao10K_L3-70B-Euryale-v2.1.json b/data/models/Sao10K_L3-70B-Euryale-v2.1.json similarity index 99% rename from data/Sao10K_L3-70B-Euryale-v2.1.json rename to data/models/Sao10K_L3-70B-Euryale-v2.1.json index ec8a48e9b72becd7fee9984e54fa0ccd6756908a..cc2db283b642f8d244eb3c9854fd6c321275f927 100644 --- a/data/Sao10K_L3-70B-Euryale-v2.1.json +++ b/data/models/Sao10K_L3-70B-Euryale-v2.1.json @@ -5,7 +5,7 @@ "developer": "Sao10K", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "LlamaForCausalLM", "params_billions": "70.554" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7281 + "score": 0.7384 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6503 + "score": 0.6471 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2243 + "score": 0.2137 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4196 + "score": 0.4209 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5096 + "score": 0.5104 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7384 + "score": 0.7281 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6471 + "score": 0.6503 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2137 + "score": 0.2243 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4209 + "score": 0.4196 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5104 + "score": 0.5096 } } ], diff --git a/data/Sao10K_L3-8B-Lunaris-v1.json b/data/models/Sao10K_L3-8B-Lunaris-v1.json similarity index 100% rename from data/Sao10K_L3-8B-Lunaris-v1.json rename to data/models/Sao10K_L3-8B-Lunaris-v1.json diff --git a/data/Sao10K_L3-8B-Niitama-v1.json b/data/models/Sao10K_L3-8B-Niitama-v1.json similarity index 100% rename from data/Sao10K_L3-8B-Niitama-v1.json rename to data/models/Sao10K_L3-8B-Niitama-v1.json diff --git a/data/Sao10K_L3-8B-Stheno-v3.2.json b/data/models/Sao10K_L3-8B-Stheno-v3.2.json similarity index 100% rename from data/Sao10K_L3-8B-Stheno-v3.2.json rename to data/models/Sao10K_L3-8B-Stheno-v3.2.json diff --git a/data/Sao10K_L3-8B-Stheno-v3.3-32K.json b/data/models/Sao10K_L3-8B-Stheno-v3.3-32K.json similarity index 100% rename from data/Sao10K_L3-8B-Stheno-v3.3-32K.json rename to data/models/Sao10K_L3-8B-Stheno-v3.3-32K.json diff --git a/data/Sao10K_MN-12B-Lyra-v3.json b/data/models/Sao10K_MN-12B-Lyra-v3.json similarity index 100% rename from data/Sao10K_MN-12B-Lyra-v3.json rename to data/models/Sao10K_MN-12B-Lyra-v3.json diff --git a/data/Saxo_Linkbricks-Horizon-AI-Avengers-V1-32B.json b/data/models/Saxo_Linkbricks-Horizon-AI-Avengers-V1-32B.json similarity index 100% rename from data/Saxo_Linkbricks-Horizon-AI-Avengers-V1-32B.json rename to data/models/Saxo_Linkbricks-Horizon-AI-Avengers-V1-32B.json diff --git a/data/Saxo_Linkbricks-Horizon-AI-Avengers-V2-32B.json b/data/models/Saxo_Linkbricks-Horizon-AI-Avengers-V2-32B.json similarity index 100% rename from data/Saxo_Linkbricks-Horizon-AI-Avengers-V2-32B.json rename to data/models/Saxo_Linkbricks-Horizon-AI-Avengers-V2-32B.json diff --git a/data/Saxo_Linkbricks-Horizon-AI-Avengers-V3-32B.json b/data/models/Saxo_Linkbricks-Horizon-AI-Avengers-V3-32B.json similarity index 100% rename from data/Saxo_Linkbricks-Horizon-AI-Avengers-V3-32B.json rename to data/models/Saxo_Linkbricks-Horizon-AI-Avengers-V3-32B.json diff --git a/data/Saxo_Linkbricks-Horizon-AI-Avengers-V4-32B.json b/data/models/Saxo_Linkbricks-Horizon-AI-Avengers-V4-32B.json similarity index 100% rename from data/Saxo_Linkbricks-Horizon-AI-Avengers-V4-32B.json rename to data/models/Saxo_Linkbricks-Horizon-AI-Avengers-V4-32B.json diff --git a/data/Saxo_Linkbricks-Horizon-AI-Avengers-V5-32B.json b/data/models/Saxo_Linkbricks-Horizon-AI-Avengers-V5-32B.json similarity index 100% rename from data/Saxo_Linkbricks-Horizon-AI-Avengers-V5-32B.json rename to data/models/Saxo_Linkbricks-Horizon-AI-Avengers-V5-32B.json diff --git a/data/Saxo_Linkbricks-Horizon-AI-Avengers-V6-32B.json b/data/models/Saxo_Linkbricks-Horizon-AI-Avengers-V6-32B.json similarity index 100% rename from data/Saxo_Linkbricks-Horizon-AI-Avengers-V6-32B.json rename to data/models/Saxo_Linkbricks-Horizon-AI-Avengers-V6-32B.json diff --git a/data/Saxo_Linkbricks-Horizon-AI-Korean-Avengers-V2-27B.json b/data/models/Saxo_Linkbricks-Horizon-AI-Korean-Avengers-V2-27B.json similarity index 100% rename from data/Saxo_Linkbricks-Horizon-AI-Korean-Avengers-V2-27B.json rename to data/models/Saxo_Linkbricks-Horizon-AI-Korean-Avengers-V2-27B.json diff --git a/data/Saxo_Linkbricks-Horizon-AI-Korean-Avengers-V3-27B.json b/data/models/Saxo_Linkbricks-Horizon-AI-Korean-Avengers-V3-27B.json similarity index 100% rename from data/Saxo_Linkbricks-Horizon-AI-Korean-Avengers-V3-27B.json rename to data/models/Saxo_Linkbricks-Horizon-AI-Korean-Avengers-V3-27B.json diff --git a/data/Saxo_Linkbricks-Horizon-AI-Korean-Superb-22B.json b/data/models/Saxo_Linkbricks-Horizon-AI-Korean-Superb-22B.json similarity index 100% rename from data/Saxo_Linkbricks-Horizon-AI-Korean-Superb-22B.json rename to data/models/Saxo_Linkbricks-Horizon-AI-Korean-Superb-22B.json diff --git a/data/Saxo_Linkbricks-Horizon-AI-Korean-Superb-27B.json b/data/models/Saxo_Linkbricks-Horizon-AI-Korean-Superb-27B.json similarity index 100% rename from data/Saxo_Linkbricks-Horizon-AI-Korean-Superb-27B.json rename to data/models/Saxo_Linkbricks-Horizon-AI-Korean-Superb-27B.json diff --git a/data/Saxo_Linkbricks-Horizon-AI-Superb-27B.json b/data/models/Saxo_Linkbricks-Horizon-AI-Superb-27B.json similarity index 100% rename from data/Saxo_Linkbricks-Horizon-AI-Superb-27B.json rename to data/models/Saxo_Linkbricks-Horizon-AI-Superb-27B.json diff --git a/data/Schrieffer_Llama-SARM-4B.json b/data/models/Schrieffer_Llama-SARM-4B.json similarity index 100% rename from data/Schrieffer_Llama-SARM-4B.json rename to data/models/Schrieffer_Llama-SARM-4B.json diff --git a/data/SeaLLMs_SeaLLM-7B-v2.5.json b/data/models/SeaLLMs_SeaLLM-7B-v2.5.json similarity index 100% rename from data/SeaLLMs_SeaLLM-7B-v2.5.json rename to data/models/SeaLLMs_SeaLLM-7B-v2.5.json diff --git a/data/SeaLLMs_SeaLLM-7B-v2.json b/data/models/SeaLLMs_SeaLLM-7B-v2.json similarity index 100% rename from data/SeaLLMs_SeaLLM-7B-v2.json rename to data/models/SeaLLMs_SeaLLM-7B-v2.json diff --git a/data/SeaLLMs_SeaLLMs-v3-7B-Chat.json b/data/models/SeaLLMs_SeaLLMs-v3-7B-Chat.json similarity index 100% rename from data/SeaLLMs_SeaLLMs-v3-7B-Chat.json rename to data/models/SeaLLMs_SeaLLMs-v3-7B-Chat.json diff --git a/data/SenseLLM_ReflectionCoder-CL-34B.json b/data/models/SenseLLM_ReflectionCoder-CL-34B.json similarity index 100% rename from data/SenseLLM_ReflectionCoder-CL-34B.json rename to data/models/SenseLLM_ReflectionCoder-CL-34B.json diff --git a/data/SenseLLM_ReflectionCoder-DS-33B.json b/data/models/SenseLLM_ReflectionCoder-DS-33B.json similarity index 100% rename from data/SenseLLM_ReflectionCoder-DS-33B.json rename to data/models/SenseLLM_ReflectionCoder-DS-33B.json diff --git a/data/SentientAGI_Dobby-Mini-Leashed-Llama-3.1-8B.json b/data/models/SentientAGI_Dobby-Mini-Leashed-Llama-3.1-8B.json similarity index 100% rename from data/SentientAGI_Dobby-Mini-Leashed-Llama-3.1-8B.json rename to data/models/SentientAGI_Dobby-Mini-Leashed-Llama-3.1-8B.json diff --git a/data/SentientAGI_Dobby-Mini-Unhinged-Llama-3.1-8B.json b/data/models/SentientAGI_Dobby-Mini-Unhinged-Llama-3.1-8B.json similarity index 100% rename from data/SentientAGI_Dobby-Mini-Unhinged-Llama-3.1-8B.json rename to data/models/SentientAGI_Dobby-Mini-Unhinged-Llama-3.1-8B.json diff --git a/data/SeppeV_SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo.json b/data/models/SeppeV_SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo.json similarity index 100% rename from data/SeppeV_SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo.json rename to data/models/SeppeV_SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo.json diff --git a/data/Sharathhebbar24_SSH_355M.json b/data/models/Sharathhebbar24_SSH_355M.json similarity index 100% rename from data/Sharathhebbar24_SSH_355M.json rename to data/models/Sharathhebbar24_SSH_355M.json diff --git a/data/Sharathhebbar24_chat_gpt2_dpo.json b/data/models/Sharathhebbar24_chat_gpt2_dpo.json similarity index 100% rename from data/Sharathhebbar24_chat_gpt2_dpo.json rename to data/models/Sharathhebbar24_chat_gpt2_dpo.json diff --git a/data/ShikaiChen_LDL-Reward-Gemma-2-27B-v0.1.json b/data/models/ShikaiChen_LDL-Reward-Gemma-2-27B-v0.1.json similarity index 100% rename from data/ShikaiChen_LDL-Reward-Gemma-2-27B-v0.1.json rename to data/models/ShikaiChen_LDL-Reward-Gemma-2-27B-v0.1.json diff --git a/data/Shreyash2010_Uma-4x4B-Instruct-v0.1.json b/data/models/Shreyash2010_Uma-4x4B-Instruct-v0.1.json similarity index 100% rename from data/Shreyash2010_Uma-4x4B-Instruct-v0.1.json rename to data/models/Shreyash2010_Uma-4x4B-Instruct-v0.1.json diff --git a/data/Sicarius-Prototyping_Brainy_LLAMA.json b/data/models/Sicarius-Prototyping_Brainy_LLAMA.json similarity index 100% rename from data/Sicarius-Prototyping_Brainy_LLAMA.json rename to data/models/Sicarius-Prototyping_Brainy_LLAMA.json diff --git a/data/Sicarius-Prototyping_Micropenis_1B.json b/data/models/Sicarius-Prototyping_Micropenis_1B.json similarity index 100% rename from data/Sicarius-Prototyping_Micropenis_1B.json rename to data/models/Sicarius-Prototyping_Micropenis_1B.json diff --git a/data/Sicarius-Prototyping_bacon_and_food.json b/data/models/Sicarius-Prototyping_bacon_and_food.json similarity index 100% rename from data/Sicarius-Prototyping_bacon_and_food.json rename to data/models/Sicarius-Prototyping_bacon_and_food.json diff --git a/data/SicariusSicariiStuff_2B-ad.json b/data/models/SicariusSicariiStuff_2B-ad.json similarity index 100% rename from data/SicariusSicariiStuff_2B-ad.json rename to data/models/SicariusSicariiStuff_2B-ad.json diff --git a/data/SicariusSicariiStuff_2B_or_not_2B.json b/data/models/SicariusSicariiStuff_2B_or_not_2B.json similarity index 100% rename from data/SicariusSicariiStuff_2B_or_not_2B.json rename to data/models/SicariusSicariiStuff_2B_or_not_2B.json diff --git a/data/SicariusSicariiStuff_Dusk_Rainbow.json b/data/models/SicariusSicariiStuff_Dusk_Rainbow.json similarity index 100% rename from data/SicariusSicariiStuff_Dusk_Rainbow.json rename to data/models/SicariusSicariiStuff_Dusk_Rainbow.json diff --git a/data/SicariusSicariiStuff_Eximius_Persona_5B.json b/data/models/SicariusSicariiStuff_Eximius_Persona_5B.json similarity index 100% rename from data/SicariusSicariiStuff_Eximius_Persona_5B.json rename to data/models/SicariusSicariiStuff_Eximius_Persona_5B.json diff --git a/data/SicariusSicariiStuff_Impish_LLAMA_3B.json b/data/models/SicariusSicariiStuff_Impish_LLAMA_3B.json similarity index 100% rename from data/SicariusSicariiStuff_Impish_LLAMA_3B.json rename to data/models/SicariusSicariiStuff_Impish_LLAMA_3B.json diff --git a/data/SicariusSicariiStuff_Impish_Mind_8B.json b/data/models/SicariusSicariiStuff_Impish_Mind_8B.json similarity index 100% rename from data/SicariusSicariiStuff_Impish_Mind_8B.json rename to data/models/SicariusSicariiStuff_Impish_Mind_8B.json diff --git a/data/SicariusSicariiStuff_Impish_QWEN_14B-1M.json b/data/models/SicariusSicariiStuff_Impish_QWEN_14B-1M.json similarity index 100% rename from data/SicariusSicariiStuff_Impish_QWEN_14B-1M.json rename to data/models/SicariusSicariiStuff_Impish_QWEN_14B-1M.json diff --git a/data/SicariusSicariiStuff_Impish_QWEN_7B-1M.json b/data/models/SicariusSicariiStuff_Impish_QWEN_7B-1M.json similarity index 100% rename from data/SicariusSicariiStuff_Impish_QWEN_7B-1M.json rename to data/models/SicariusSicariiStuff_Impish_QWEN_7B-1M.json diff --git a/data/SicariusSicariiStuff_LLAMA-3_8B_Unaligned_BETA.json b/data/models/SicariusSicariiStuff_LLAMA-3_8B_Unaligned_BETA.json similarity index 100% rename from data/SicariusSicariiStuff_LLAMA-3_8B_Unaligned_BETA.json rename to data/models/SicariusSicariiStuff_LLAMA-3_8B_Unaligned_BETA.json diff --git a/data/SicariusSicariiStuff_Phi-Line_14B.json b/data/models/SicariusSicariiStuff_Phi-Line_14B.json similarity index 100% rename from data/SicariusSicariiStuff_Phi-Line_14B.json rename to data/models/SicariusSicariiStuff_Phi-Line_14B.json diff --git a/data/SicariusSicariiStuff_Phi-lthy4.json b/data/models/SicariusSicariiStuff_Phi-lthy4.json similarity index 100% rename from data/SicariusSicariiStuff_Phi-lthy4.json rename to data/models/SicariusSicariiStuff_Phi-lthy4.json diff --git a/data/SicariusSicariiStuff_Qwen2.5-14B_Uncencored.json b/data/models/SicariusSicariiStuff_Qwen2.5-14B_Uncencored.json similarity index 100% rename from data/SicariusSicariiStuff_Qwen2.5-14B_Uncencored.json rename to data/models/SicariusSicariiStuff_Qwen2.5-14B_Uncencored.json diff --git a/data/SicariusSicariiStuff_Qwen2.5-14B_Uncensored.json b/data/models/SicariusSicariiStuff_Qwen2.5-14B_Uncensored.json similarity index 100% rename from data/SicariusSicariiStuff_Qwen2.5-14B_Uncensored.json rename to data/models/SicariusSicariiStuff_Qwen2.5-14B_Uncensored.json diff --git a/data/SicariusSicariiStuff_Qwen2.5-14B_Uncensored_Instruct.json b/data/models/SicariusSicariiStuff_Qwen2.5-14B_Uncensored_Instruct.json similarity index 100% rename from data/SicariusSicariiStuff_Qwen2.5-14B_Uncensored_Instruct.json rename to data/models/SicariusSicariiStuff_Qwen2.5-14B_Uncensored_Instruct.json diff --git a/data/SicariusSicariiStuff_Redemption_Wind_24B.json b/data/models/SicariusSicariiStuff_Redemption_Wind_24B.json similarity index 100% rename from data/SicariusSicariiStuff_Redemption_Wind_24B.json rename to data/models/SicariusSicariiStuff_Redemption_Wind_24B.json diff --git a/data/SicariusSicariiStuff_Winged_Imp_8B.json b/data/models/SicariusSicariiStuff_Winged_Imp_8B.json similarity index 100% rename from data/SicariusSicariiStuff_Winged_Imp_8B.json rename to data/models/SicariusSicariiStuff_Winged_Imp_8B.json diff --git a/data/SicariusSicariiStuff_Wingless_Imp_8B.json b/data/models/SicariusSicariiStuff_Wingless_Imp_8B.json similarity index 100% rename from data/SicariusSicariiStuff_Wingless_Imp_8B.json rename to data/models/SicariusSicariiStuff_Wingless_Imp_8B.json diff --git a/data/SicariusSicariiStuff_Zion_Alpha.json b/data/models/SicariusSicariiStuff_Zion_Alpha.json similarity index 100% rename from data/SicariusSicariiStuff_Zion_Alpha.json rename to data/models/SicariusSicariiStuff_Zion_Alpha.json diff --git a/data/SicariusSicariiStuff_dn_ep02.json b/data/models/SicariusSicariiStuff_dn_ep02.json similarity index 100% rename from data/SicariusSicariiStuff_dn_ep02.json rename to data/models/SicariusSicariiStuff_dn_ep02.json diff --git a/data/SkyOrbis_SKY-Ko-Llama3.1-8B-lora-epoch1.json b/data/models/SkyOrbis_SKY-Ko-Llama3.1-8B-lora-epoch1.json similarity index 100% rename from data/SkyOrbis_SKY-Ko-Llama3.1-8B-lora-epoch1.json rename to data/models/SkyOrbis_SKY-Ko-Llama3.1-8B-lora-epoch1.json diff --git a/data/SkyOrbis_SKY-Ko-Llama3.1-8B-lora.json b/data/models/SkyOrbis_SKY-Ko-Llama3.1-8B-lora.json similarity index 100% rename from data/SkyOrbis_SKY-Ko-Llama3.1-8B-lora.json rename to data/models/SkyOrbis_SKY-Ko-Llama3.1-8B-lora.json diff --git a/data/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-epoch3.json b/data/models/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-epoch3.json similarity index 100% rename from data/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-epoch3.json rename to data/models/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-epoch3.json diff --git a/data/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-epoch5.json b/data/models/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-epoch5.json similarity index 100% rename from data/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-epoch5.json rename to data/models/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-epoch5.json diff --git a/data/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-v2-epoch3.json b/data/models/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-v2-epoch3.json similarity index 100% rename from data/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-v2-epoch3.json rename to data/models/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-v2-epoch3.json diff --git a/data/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-v2-epoch5.json b/data/models/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-v2-epoch5.json similarity index 100% rename from data/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-v2-epoch5.json rename to data/models/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-v2-epoch5.json diff --git a/data/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch1.json b/data/models/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch1.json similarity index 100% rename from data/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch1.json rename to data/models/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch1.json diff --git a/data/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch2.json b/data/models/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch2.json similarity index 100% rename from data/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch2.json rename to data/models/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch2.json diff --git a/data/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch3.json b/data/models/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch3.json similarity index 100% rename from data/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch3.json rename to data/models/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch3.json diff --git a/data/SkyOrbis_SKY-Ko-Qwen2.5-3B-Instruct.json b/data/models/SkyOrbis_SKY-Ko-Qwen2.5-3B-Instruct.json similarity index 100% rename from data/SkyOrbis_SKY-Ko-Qwen2.5-3B-Instruct.json rename to data/models/SkyOrbis_SKY-Ko-Qwen2.5-3B-Instruct.json diff --git a/data/SkyOrbis_SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000.json b/data/models/SkyOrbis_SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000.json similarity index 100% rename from data/SkyOrbis_SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000.json rename to data/models/SkyOrbis_SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000.json diff --git a/data/SkyOrbis_SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000.json b/data/models/SkyOrbis_SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000.json similarity index 100% rename from data/SkyOrbis_SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000.json rename to data/models/SkyOrbis_SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000.json diff --git a/data/Skywork_Skywork-Critic-Llama-3.1-70B.json b/data/models/Skywork_Skywork-Critic-Llama-3.1-70B.json similarity index 100% rename from data/Skywork_Skywork-Critic-Llama-3.1-70B.json rename to data/models/Skywork_Skywork-Critic-Llama-3.1-70B.json diff --git a/data/Skywork_Skywork-Critic-Llama-3.1-8B.json b/data/models/Skywork_Skywork-Critic-Llama-3.1-8B.json similarity index 100% rename from data/Skywork_Skywork-Critic-Llama-3.1-8B.json rename to data/models/Skywork_Skywork-Critic-Llama-3.1-8B.json diff --git a/data/Skywork_Skywork-Reward-Gemma-2-27B-v0.2.json b/data/models/Skywork_Skywork-Reward-Gemma-2-27B-v0.2.json similarity index 100% rename from data/Skywork_Skywork-Reward-Gemma-2-27B-v0.2.json rename to data/models/Skywork_Skywork-Reward-Gemma-2-27B-v0.2.json index b7762e6bd49fb3c54b1d66a2e387f2b61e207352..c44d72de9680d22f1055239d1d93cea4a8b3db11 100644 --- a/data/Skywork_Skywork-Reward-Gemma-2-27B-v0.2.json +++ b/data/models/Skywork_Skywork-Reward-Gemma-2-27B-v0.2.json @@ -139,10 +139,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816", + "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -161,128 +161,104 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9426 + "score": 0.7531 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9609 + "score": 0.7674 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8991 + "score": 0.375 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9297 + "score": 0.6721 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9807 + "score": 0.9689 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7531 + "score": 0.9172 }, "source_data": { "dataset_name": "RewardBench 2", @@ -291,111 +267,135 @@ } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7674 + "score": 0.8182 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Precise IF", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.375 + "score": 0.9426 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6721 + "score": 0.9609 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9689 + "score": 0.8991 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9172 + "score": 0.9297 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8182 + "score": 0.9807 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/Skywork_Skywork-Reward-Gemma-2-27B.json b/data/models/Skywork_Skywork-Reward-Gemma-2-27B.json similarity index 100% rename from data/Skywork_Skywork-Reward-Gemma-2-27B.json rename to data/models/Skywork_Skywork-Reward-Gemma-2-27B.json diff --git a/data/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2.json b/data/models/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2.json similarity index 100% rename from data/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2.json rename to data/models/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2.json index 257e1711290adf0666a2c139ab61e68721a44b75..326953a05f3530c26d156ae4dffb5aa47e2a0336 100644 --- a/data/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2.json +++ b/data/models/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/1766412838.146816", + "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,128 +31,104 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9313 + "score": 0.7175 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9469 + "score": 0.6968 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8838 + "score": 0.4062 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.927 + "score": 0.6011 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9675 + "score": 0.9422 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7175 + "score": 0.9414 }, "source_data": { "dataset_name": "RewardBench 2", @@ -161,111 +137,135 @@ } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6968 + "score": 0.7169 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Precise IF", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4062 + "score": 0.9313 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6011 + "score": 0.9469 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9422 + "score": 0.8838 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9414 + "score": 0.927 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7169 + "score": 0.9675 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/Skywork_Skywork-Reward-Llama-3.1-8B.json b/data/models/Skywork_Skywork-Reward-Llama-3.1-8B.json similarity index 100% rename from data/Skywork_Skywork-Reward-Llama-3.1-8B.json rename to data/models/Skywork_Skywork-Reward-Llama-3.1-8B.json diff --git a/data/Skywork_Skywork-Reward-V2-Llama-3.1-8B.json b/data/models/Skywork_Skywork-Reward-V2-Llama-3.1-8B.json similarity index 100% rename from data/Skywork_Skywork-Reward-V2-Llama-3.1-8B.json rename to data/models/Skywork_Skywork-Reward-V2-Llama-3.1-8B.json diff --git a/data/Skywork_Skywork-Reward-V2-Llama-3.2-1B.json b/data/models/Skywork_Skywork-Reward-V2-Llama-3.2-1B.json similarity index 100% rename from data/Skywork_Skywork-Reward-V2-Llama-3.2-1B.json rename to data/models/Skywork_Skywork-Reward-V2-Llama-3.2-1B.json diff --git a/data/Skywork_Skywork-Reward-V2-Llama-3.2-3B.json b/data/models/Skywork_Skywork-Reward-V2-Llama-3.2-3B.json similarity index 100% rename from data/Skywork_Skywork-Reward-V2-Llama-3.2-3B.json rename to data/models/Skywork_Skywork-Reward-V2-Llama-3.2-3B.json diff --git a/data/Skywork_Skywork-Reward-V2-Qwen3-0.6B.json b/data/models/Skywork_Skywork-Reward-V2-Qwen3-0.6B.json similarity index 100% rename from data/Skywork_Skywork-Reward-V2-Qwen3-0.6B.json rename to data/models/Skywork_Skywork-Reward-V2-Qwen3-0.6B.json diff --git a/data/Skywork_Skywork-Reward-V2-Qwen3-1.7B.json b/data/models/Skywork_Skywork-Reward-V2-Qwen3-1.7B.json similarity index 100% rename from data/Skywork_Skywork-Reward-V2-Qwen3-1.7B.json rename to data/models/Skywork_Skywork-Reward-V2-Qwen3-1.7B.json diff --git a/data/Skywork_Skywork-Reward-V2-Qwen3-4B.json b/data/models/Skywork_Skywork-Reward-V2-Qwen3-4B.json similarity index 100% rename from data/Skywork_Skywork-Reward-V2-Qwen3-4B.json rename to data/models/Skywork_Skywork-Reward-V2-Qwen3-4B.json diff --git a/data/Skywork_Skywork-Reward-V2-Qwen3-8B.json b/data/models/Skywork_Skywork-Reward-V2-Qwen3-8B.json similarity index 100% rename from data/Skywork_Skywork-Reward-V2-Qwen3-8B.json rename to data/models/Skywork_Skywork-Reward-V2-Qwen3-8B.json diff --git a/data/Skywork_Skywork-VL-Reward-7B.json b/data/models/Skywork_Skywork-VL-Reward-7B.json similarity index 100% rename from data/Skywork_Skywork-VL-Reward-7B.json rename to data/models/Skywork_Skywork-VL-Reward-7B.json index d1caca7afd32adac0ce3eaccc5894c6b1d1db99d..651d1416fd84d9618565234fc2f23befa272cb51 100644 --- a/data/Skywork_Skywork-VL-Reward-7B.json +++ b/data/models/Skywork_Skywork-VL-Reward-7B.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/Skywork_Skywork-VL-Reward-7B/1766412838.146816", + "evaluation_id": "reward-bench-2/Skywork_Skywork-VL-Reward-7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,128 +31,104 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9007 + "score": 0.6885 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8994 + "score": 0.6063 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.875 + "score": 0.35 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9108 + "score": 0.6339 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9176 + "score": 0.8911 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench-2/Skywork_Skywork-VL-Reward-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6885 + "score": 0.8909 }, "source_data": { "dataset_name": "RewardBench 2", @@ -161,111 +137,135 @@ } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6063 + "score": 0.7586 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench/Skywork_Skywork-VL-Reward-7B/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Precise IF", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.35 + "score": 0.9007 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6339 + "score": 0.8994 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8911 + "score": 0.875 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8909 + "score": 0.9108 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7586 + "score": 0.9176 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/Skywork_Skywork-o1-Open-Llama-3.1-8B.json b/data/models/Skywork_Skywork-o1-Open-Llama-3.1-8B.json similarity index 100% rename from data/Skywork_Skywork-o1-Open-Llama-3.1-8B.json rename to data/models/Skywork_Skywork-o1-Open-Llama-3.1-8B.json diff --git a/data/Solshine_Brimful-merged-replete.json b/data/models/Solshine_Brimful-merged-replete.json similarity index 100% rename from data/Solshine_Brimful-merged-replete.json rename to data/models/Solshine_Brimful-merged-replete.json diff --git a/data/Solshine_Llama-3-1-big-thoughtful-passthrough-merge-2.json b/data/models/Solshine_Llama-3-1-big-thoughtful-passthrough-merge-2.json similarity index 100% rename from data/Solshine_Llama-3-1-big-thoughtful-passthrough-merge-2.json rename to data/models/Solshine_Llama-3-1-big-thoughtful-passthrough-merge-2.json diff --git a/data/Sorawiz_Gemma-9B-Base.json b/data/models/Sorawiz_Gemma-9B-Base.json similarity index 100% rename from data/Sorawiz_Gemma-9B-Base.json rename to data/models/Sorawiz_Gemma-9B-Base.json diff --git a/data/Sorawiz_Gemma-Creative-9B-Base.json b/data/models/Sorawiz_Gemma-Creative-9B-Base.json similarity index 100% rename from data/Sorawiz_Gemma-Creative-9B-Base.json rename to data/models/Sorawiz_Gemma-Creative-9B-Base.json diff --git a/data/Sourjayon_DeepSeek-R1-8b-Sify.json b/data/models/Sourjayon_DeepSeek-R1-8b-Sify.json similarity index 100% rename from data/Sourjayon_DeepSeek-R1-8b-Sify.json rename to data/models/Sourjayon_DeepSeek-R1-8b-Sify.json diff --git a/data/Sourjayon_DeepSeek-R1-ForumNXT.json b/data/models/Sourjayon_DeepSeek-R1-ForumNXT.json similarity index 100% rename from data/Sourjayon_DeepSeek-R1-ForumNXT.json rename to data/models/Sourjayon_DeepSeek-R1-ForumNXT.json diff --git a/data/SpaceYL_ECE_Poirot.json b/data/models/SpaceYL_ECE_Poirot.json similarity index 100% rename from data/SpaceYL_ECE_Poirot.json rename to data/models/SpaceYL_ECE_Poirot.json diff --git a/data/Spestly_Athena-1-3B.json b/data/models/Spestly_Athena-1-3B.json similarity index 100% rename from data/Spestly_Athena-1-3B.json rename to data/models/Spestly_Athena-1-3B.json diff --git a/data/Spestly_Atlas-Pro-1.5B-Preview.json b/data/models/Spestly_Atlas-Pro-1.5B-Preview.json similarity index 100% rename from data/Spestly_Atlas-Pro-1.5B-Preview.json rename to data/models/Spestly_Atlas-Pro-1.5B-Preview.json diff --git a/data/Spestly_Atlas-Pro-7B-Preview.json b/data/models/Spestly_Atlas-Pro-7B-Preview.json similarity index 100% rename from data/Spestly_Atlas-Pro-7B-Preview.json rename to data/models/Spestly_Atlas-Pro-7B-Preview.json diff --git a/data/Stark2008_GutenLaserPi.json b/data/models/Stark2008_GutenLaserPi.json similarity index 100% rename from data/Stark2008_GutenLaserPi.json rename to data/models/Stark2008_GutenLaserPi.json diff --git a/data/Stark2008_LayleleFlamPi.json b/data/models/Stark2008_LayleleFlamPi.json similarity index 100% rename from data/Stark2008_LayleleFlamPi.json rename to data/models/Stark2008_LayleleFlamPi.json diff --git a/data/Stark2008_VisFlamCat.json b/data/models/Stark2008_VisFlamCat.json similarity index 100% rename from data/Stark2008_VisFlamCat.json rename to data/models/Stark2008_VisFlamCat.json diff --git a/data/Steelskull_L3.3-MS-Nevoria-70b.json b/data/models/Steelskull_L3.3-MS-Nevoria-70b.json similarity index 100% rename from data/Steelskull_L3.3-MS-Nevoria-70b.json rename to data/models/Steelskull_L3.3-MS-Nevoria-70b.json diff --git a/data/Steelskull_L3.3-Nevoria-R1-70b.json b/data/models/Steelskull_L3.3-Nevoria-R1-70b.json similarity index 100% rename from data/Steelskull_L3.3-Nevoria-R1-70b.json rename to data/models/Steelskull_L3.3-Nevoria-R1-70b.json diff --git a/data/StelleX_Qwen2.5_Math_7B_Cot.json b/data/models/StelleX_Qwen2.5_Math_7B_Cot.json similarity index 100% rename from data/StelleX_Qwen2.5_Math_7B_Cot.json rename to data/models/StelleX_Qwen2.5_Math_7B_Cot.json diff --git a/data/StelleX_Vorisatex-7B-preview.json b/data/models/StelleX_Vorisatex-7B-preview.json similarity index 100% rename from data/StelleX_Vorisatex-7B-preview.json rename to data/models/StelleX_Vorisatex-7B-preview.json diff --git a/data/SultanR_SmolTulu-1.7b-Instruct.json b/data/models/SultanR_SmolTulu-1.7b-Instruct.json similarity index 100% rename from data/SultanR_SmolTulu-1.7b-Instruct.json rename to data/models/SultanR_SmolTulu-1.7b-Instruct.json diff --git a/data/SultanR_SmolTulu-1.7b-RM.json b/data/models/SultanR_SmolTulu-1.7b-RM.json similarity index 100% rename from data/SultanR_SmolTulu-1.7b-RM.json rename to data/models/SultanR_SmolTulu-1.7b-RM.json diff --git a/data/SultanR_SmolTulu-1.7b-Reinforced.json b/data/models/SultanR_SmolTulu-1.7b-Reinforced.json similarity index 100% rename from data/SultanR_SmolTulu-1.7b-Reinforced.json rename to data/models/SultanR_SmolTulu-1.7b-Reinforced.json diff --git a/data/SultanR_SmolTulu-1.7b-it-v0.json b/data/models/SultanR_SmolTulu-1.7b-it-v0.json similarity index 100% rename from data/SultanR_SmolTulu-1.7b-it-v0.json rename to data/models/SultanR_SmolTulu-1.7b-it-v0.json diff --git a/data/Supichi_BBA-123.json b/data/models/Supichi_BBA-123.json similarity index 100% rename from data/Supichi_BBA-123.json rename to data/models/Supichi_BBA-123.json diff --git a/data/Supichi_BBA99.json b/data/models/Supichi_BBA99.json similarity index 100% rename from data/Supichi_BBA99.json rename to data/models/Supichi_BBA99.json diff --git a/data/Supichi_BBAIK29.json b/data/models/Supichi_BBAIK29.json similarity index 100% rename from data/Supichi_BBAIK29.json rename to data/models/Supichi_BBAIK29.json diff --git a/data/Supichi_BBAI_135_Gemma.json b/data/models/Supichi_BBAI_135_Gemma.json similarity index 100% rename from data/Supichi_BBAI_135_Gemma.json rename to data/models/Supichi_BBAI_135_Gemma.json diff --git a/data/Supichi_BBAI_250_Xia0_gZ.json b/data/models/Supichi_BBAI_250_Xia0_gZ.json similarity index 100% rename from data/Supichi_BBAI_250_Xia0_gZ.json rename to data/models/Supichi_BBAI_250_Xia0_gZ.json diff --git a/data/Supichi_BBAI_275_Tsunami_gZ.json b/data/models/Supichi_BBAI_275_Tsunami_gZ.json similarity index 100% rename from data/Supichi_BBAI_275_Tsunami_gZ.json rename to data/models/Supichi_BBAI_275_Tsunami_gZ.json diff --git a/data/Supichi_BBAI_525_Tsu_gZ_Xia0.json b/data/models/Supichi_BBAI_525_Tsu_gZ_Xia0.json similarity index 100% rename from data/Supichi_BBAI_525_Tsu_gZ_Xia0.json rename to data/models/Supichi_BBAI_525_Tsu_gZ_Xia0.json diff --git a/data/Supichi_BBAI_78B_Calme_3_1_Ties.json b/data/models/Supichi_BBAI_78B_Calme_3_1_Ties.json similarity index 100% rename from data/Supichi_BBAI_78B_Calme_3_1_Ties.json rename to data/models/Supichi_BBAI_78B_Calme_3_1_Ties.json diff --git a/data/Supichi_BBAI_QWEEN_V000000_LUMEN_14B.json b/data/models/Supichi_BBAI_QWEEN_V000000_LUMEN_14B.json similarity index 100% rename from data/Supichi_BBAI_QWEEN_V000000_LUMEN_14B.json rename to data/models/Supichi_BBAI_QWEEN_V000000_LUMEN_14B.json diff --git a/data/Supichi_HF_TOKEN.json b/data/models/Supichi_HF_TOKEN.json similarity index 100% rename from data/Supichi_HF_TOKEN.json rename to data/models/Supichi_HF_TOKEN.json diff --git a/data/Supichi_NJS26.json b/data/models/Supichi_NJS26.json similarity index 100% rename from data/Supichi_NJS26.json rename to data/models/Supichi_NJS26.json diff --git a/data/Svak_MN-12B-Inferor-v0.0.json b/data/models/Svak_MN-12B-Inferor-v0.0.json similarity index 100% rename from data/Svak_MN-12B-Inferor-v0.0.json rename to data/models/Svak_MN-12B-Inferor-v0.0.json diff --git a/data/Svak_MN-12B-Inferor-v0.1.json b/data/models/Svak_MN-12B-Inferor-v0.1.json similarity index 100% rename from data/Svak_MN-12B-Inferor-v0.1.json rename to data/models/Svak_MN-12B-Inferor-v0.1.json diff --git a/data/Syed-Hasan-8503_Phi-3-mini-4K-instruct-cpo-simpo.json b/data/models/Syed-Hasan-8503_Phi-3-mini-4K-instruct-cpo-simpo.json similarity index 100% rename from data/Syed-Hasan-8503_Phi-3-mini-4K-instruct-cpo-simpo.json rename to data/models/Syed-Hasan-8503_Phi-3-mini-4K-instruct-cpo-simpo.json diff --git a/data/T145_KRONOS-8B-V1-P1.json b/data/models/T145_KRONOS-8B-V1-P1.json similarity index 100% rename from data/T145_KRONOS-8B-V1-P1.json rename to data/models/T145_KRONOS-8B-V1-P1.json diff --git a/data/T145_KRONOS-8B-V1-P2.json b/data/models/T145_KRONOS-8B-V1-P2.json similarity index 100% rename from data/T145_KRONOS-8B-V1-P2.json rename to data/models/T145_KRONOS-8B-V1-P2.json diff --git a/data/T145_KRONOS-8B-V1-P3.json b/data/models/T145_KRONOS-8B-V1-P3.json similarity index 100% rename from data/T145_KRONOS-8B-V1-P3.json rename to data/models/T145_KRONOS-8B-V1-P3.json diff --git a/data/T145_KRONOS-8B-V2.json b/data/models/T145_KRONOS-8B-V2.json similarity index 100% rename from data/T145_KRONOS-8B-V2.json rename to data/models/T145_KRONOS-8B-V2.json diff --git a/data/T145_KRONOS-8B-V3.json b/data/models/T145_KRONOS-8B-V3.json similarity index 100% rename from data/T145_KRONOS-8B-V3.json rename to data/models/T145_KRONOS-8B-V3.json diff --git a/data/T145_KRONOS-8B-V4.json b/data/models/T145_KRONOS-8B-V4.json similarity index 100% rename from data/T145_KRONOS-8B-V4.json rename to data/models/T145_KRONOS-8B-V4.json diff --git a/data/T145_KRONOS-8B-V5.json b/data/models/T145_KRONOS-8B-V5.json similarity index 100% rename from data/T145_KRONOS-8B-V5.json rename to data/models/T145_KRONOS-8B-V5.json diff --git a/data/T145_KRONOS-8B-V6.json b/data/models/T145_KRONOS-8B-V6.json similarity index 100% rename from data/T145_KRONOS-8B-V6.json rename to data/models/T145_KRONOS-8B-V6.json diff --git a/data/T145_KRONOS-8B-V7.json b/data/models/T145_KRONOS-8B-V7.json similarity index 100% rename from data/T145_KRONOS-8B-V7.json rename to data/models/T145_KRONOS-8B-V7.json diff --git a/data/T145_KRONOS-8B-V8.json b/data/models/T145_KRONOS-8B-V8.json similarity index 100% rename from data/T145_KRONOS-8B-V8.json rename to data/models/T145_KRONOS-8B-V8.json diff --git a/data/T145_KRONOS-8B-V9.json b/data/models/T145_KRONOS-8B-V9.json similarity index 100% rename from data/T145_KRONOS-8B-V9.json rename to data/models/T145_KRONOS-8B-V9.json diff --git a/data/T145_Llama-3.1-8B-Instruct-Zeus.json b/data/models/T145_Llama-3.1-8B-Instruct-Zeus.json similarity index 100% rename from data/T145_Llama-3.1-8B-Instruct-Zeus.json rename to data/models/T145_Llama-3.1-8B-Instruct-Zeus.json diff --git a/data/T145_Llama-3.1-8B-Zeus.json b/data/models/T145_Llama-3.1-8B-Zeus.json similarity index 100% rename from data/T145_Llama-3.1-8B-Zeus.json rename to data/models/T145_Llama-3.1-8B-Zeus.json diff --git a/data/T145_Meta-Llama-3.1-8B-Instruct-TIES.json b/data/models/T145_Meta-Llama-3.1-8B-Instruct-TIES.json similarity index 100% rename from data/T145_Meta-Llama-3.1-8B-Instruct-TIES.json rename to data/models/T145_Meta-Llama-3.1-8B-Instruct-TIES.json diff --git a/data/T145_ZEUS-8B-V10.json b/data/models/T145_ZEUS-8B-V10.json similarity index 100% rename from data/T145_ZEUS-8B-V10.json rename to data/models/T145_ZEUS-8B-V10.json diff --git a/data/T145_ZEUS-8B-V11.json b/data/models/T145_ZEUS-8B-V11.json similarity index 100% rename from data/T145_ZEUS-8B-V11.json rename to data/models/T145_ZEUS-8B-V11.json diff --git a/data/T145_ZEUS-8B-V12.json b/data/models/T145_ZEUS-8B-V12.json similarity index 100% rename from data/T145_ZEUS-8B-V12.json rename to data/models/T145_ZEUS-8B-V12.json diff --git a/data/T145_ZEUS-8B-V13-abliterated.json b/data/models/T145_ZEUS-8B-V13-abliterated.json similarity index 100% rename from data/T145_ZEUS-8B-V13-abliterated.json rename to data/models/T145_ZEUS-8B-V13-abliterated.json diff --git a/data/T145_ZEUS-8B-V13.json b/data/models/T145_ZEUS-8B-V13.json similarity index 100% rename from data/T145_ZEUS-8B-V13.json rename to data/models/T145_ZEUS-8B-V13.json diff --git a/data/T145_ZEUS-8B-V14.json b/data/models/T145_ZEUS-8B-V14.json similarity index 100% rename from data/T145_ZEUS-8B-V14.json rename to data/models/T145_ZEUS-8B-V14.json diff --git a/data/T145_ZEUS-8B-V15.json b/data/models/T145_ZEUS-8B-V15.json similarity index 100% rename from data/T145_ZEUS-8B-V15.json rename to data/models/T145_ZEUS-8B-V15.json diff --git a/data/T145_ZEUS-8B-V16.json b/data/models/T145_ZEUS-8B-V16.json similarity index 100% rename from data/T145_ZEUS-8B-V16.json rename to data/models/T145_ZEUS-8B-V16.json diff --git a/data/T145_ZEUS-8B-V17-abliterated-V2.json b/data/models/T145_ZEUS-8B-V17-abliterated-V2.json similarity index 100% rename from data/T145_ZEUS-8B-V17-abliterated-V2.json rename to data/models/T145_ZEUS-8B-V17-abliterated-V2.json diff --git a/data/T145_ZEUS-8B-V17-abliterated-V4.json b/data/models/T145_ZEUS-8B-V17-abliterated-V4.json similarity index 100% rename from data/T145_ZEUS-8B-V17-abliterated-V4.json rename to data/models/T145_ZEUS-8B-V17-abliterated-V4.json diff --git a/data/T145_ZEUS-8B-V17-abliterated.json b/data/models/T145_ZEUS-8B-V17-abliterated.json similarity index 100% rename from data/T145_ZEUS-8B-V17-abliterated.json rename to data/models/T145_ZEUS-8B-V17-abliterated.json diff --git a/data/T145_ZEUS-8B-V17.json b/data/models/T145_ZEUS-8B-V17.json similarity index 100% rename from data/T145_ZEUS-8B-V17.json rename to data/models/T145_ZEUS-8B-V17.json diff --git a/data/T145_ZEUS-8B-V18.json b/data/models/T145_ZEUS-8B-V18.json similarity index 100% rename from data/T145_ZEUS-8B-V18.json rename to data/models/T145_ZEUS-8B-V18.json diff --git a/data/T145_ZEUS-8B-V19.json b/data/models/T145_ZEUS-8B-V19.json similarity index 100% rename from data/T145_ZEUS-8B-V19.json rename to data/models/T145_ZEUS-8B-V19.json diff --git a/data/T145_ZEUS-8B-V2-ORPO.json b/data/models/T145_ZEUS-8B-V2-ORPO.json similarity index 100% rename from data/T145_ZEUS-8B-V2-ORPO.json rename to data/models/T145_ZEUS-8B-V2-ORPO.json diff --git a/data/T145_ZEUS-8B-V2-abliterated.json b/data/models/T145_ZEUS-8B-V2-abliterated.json similarity index 100% rename from data/T145_ZEUS-8B-V2-abliterated.json rename to data/models/T145_ZEUS-8B-V2-abliterated.json diff --git a/data/T145_ZEUS-8B-V2.json b/data/models/T145_ZEUS-8B-V2.json similarity index 100% rename from data/T145_ZEUS-8B-V2.json rename to data/models/T145_ZEUS-8B-V2.json diff --git a/data/T145_ZEUS-8B-V20.json b/data/models/T145_ZEUS-8B-V20.json similarity index 100% rename from data/T145_ZEUS-8B-V20.json rename to data/models/T145_ZEUS-8B-V20.json diff --git a/data/T145_ZEUS-8B-V21.json b/data/models/T145_ZEUS-8B-V21.json similarity index 100% rename from data/T145_ZEUS-8B-V21.json rename to data/models/T145_ZEUS-8B-V21.json diff --git a/data/T145_ZEUS-8B-V22.json b/data/models/T145_ZEUS-8B-V22.json similarity index 100% rename from data/T145_ZEUS-8B-V22.json rename to data/models/T145_ZEUS-8B-V22.json diff --git a/data/T145_ZEUS-8B-V23.json b/data/models/T145_ZEUS-8B-V23.json similarity index 100% rename from data/T145_ZEUS-8B-V23.json rename to data/models/T145_ZEUS-8B-V23.json diff --git a/data/T145_ZEUS-8B-V24.json b/data/models/T145_ZEUS-8B-V24.json similarity index 100% rename from data/T145_ZEUS-8B-V24.json rename to data/models/T145_ZEUS-8B-V24.json diff --git a/data/T145_ZEUS-8B-V25.json b/data/models/T145_ZEUS-8B-V25.json similarity index 100% rename from data/T145_ZEUS-8B-V25.json rename to data/models/T145_ZEUS-8B-V25.json diff --git a/data/T145_ZEUS-8B-V26.json b/data/models/T145_ZEUS-8B-V26.json similarity index 100% rename from data/T145_ZEUS-8B-V26.json rename to data/models/T145_ZEUS-8B-V26.json diff --git a/data/T145_ZEUS-8B-V27.json b/data/models/T145_ZEUS-8B-V27.json similarity index 100% rename from data/T145_ZEUS-8B-V27.json rename to data/models/T145_ZEUS-8B-V27.json diff --git a/data/T145_ZEUS-8B-V28.json b/data/models/T145_ZEUS-8B-V28.json similarity index 100% rename from data/T145_ZEUS-8B-V28.json rename to data/models/T145_ZEUS-8B-V28.json diff --git a/data/T145_ZEUS-8B-V29.json b/data/models/T145_ZEUS-8B-V29.json similarity index 100% rename from data/T145_ZEUS-8B-V29.json rename to data/models/T145_ZEUS-8B-V29.json diff --git a/data/T145_ZEUS-8B-V2L1.json b/data/models/T145_ZEUS-8B-V2L1.json similarity index 100% rename from data/T145_ZEUS-8B-V2L1.json rename to data/models/T145_ZEUS-8B-V2L1.json diff --git a/data/T145_ZEUS-8B-V2L2.json b/data/models/T145_ZEUS-8B-V2L2.json similarity index 100% rename from data/T145_ZEUS-8B-V2L2.json rename to data/models/T145_ZEUS-8B-V2L2.json diff --git a/data/T145_ZEUS-8B-V3.json b/data/models/T145_ZEUS-8B-V3.json similarity index 100% rename from data/T145_ZEUS-8B-V3.json rename to data/models/T145_ZEUS-8B-V3.json diff --git a/data/T145_ZEUS-8B-V30.json b/data/models/T145_ZEUS-8B-V30.json similarity index 100% rename from data/T145_ZEUS-8B-V30.json rename to data/models/T145_ZEUS-8B-V30.json diff --git a/data/T145_ZEUS-8B-V4.json b/data/models/T145_ZEUS-8B-V4.json similarity index 100% rename from data/T145_ZEUS-8B-V4.json rename to data/models/T145_ZEUS-8B-V4.json diff --git a/data/T145_ZEUS-8B-V6.json b/data/models/T145_ZEUS-8B-V6.json similarity index 100% rename from data/T145_ZEUS-8B-V6.json rename to data/models/T145_ZEUS-8B-V6.json diff --git a/data/T145_ZEUS-8B-V7.json b/data/models/T145_ZEUS-8B-V7.json similarity index 100% rename from data/T145_ZEUS-8B-V7.json rename to data/models/T145_ZEUS-8B-V7.json diff --git a/data/T145_ZEUS-8B-V8.json b/data/models/T145_ZEUS-8B-V8.json similarity index 100% rename from data/T145_ZEUS-8B-V8.json rename to data/models/T145_ZEUS-8B-V8.json diff --git a/data/T145_ZEUS-8B-V9.json b/data/models/T145_ZEUS-8B-V9.json similarity index 100% rename from data/T145_ZEUS-8B-V9.json rename to data/models/T145_ZEUS-8B-V9.json diff --git a/data/T145_qwen-2.5-3B-merge-test.json b/data/models/T145_qwen-2.5-3B-merge-test.json similarity index 100% rename from data/T145_qwen-2.5-3B-merge-test.json rename to data/models/T145_qwen-2.5-3B-merge-test.json diff --git a/data/THUDM_glm-4-9b-chat-1m-hf.json b/data/models/THUDM_glm-4-9b-chat-1m-hf.json similarity index 100% rename from data/THUDM_glm-4-9b-chat-1m-hf.json rename to data/models/THUDM_glm-4-9b-chat-1m-hf.json diff --git a/data/THUDM_glm-4-9b-chat-1m.json b/data/models/THUDM_glm-4-9b-chat-1m.json similarity index 100% rename from data/THUDM_glm-4-9b-chat-1m.json rename to data/models/THUDM_glm-4-9b-chat-1m.json diff --git a/data/THUDM_glm-4-9b-chat-hf.json b/data/models/THUDM_glm-4-9b-chat-hf.json similarity index 100% rename from data/THUDM_glm-4-9b-chat-hf.json rename to data/models/THUDM_glm-4-9b-chat-hf.json diff --git a/data/THUDM_glm-4-9b-chat.json b/data/models/THUDM_glm-4-9b-chat.json similarity index 100% rename from data/THUDM_glm-4-9b-chat.json rename to data/models/THUDM_glm-4-9b-chat.json diff --git a/data/THUDM_glm-4-9b.json b/data/models/THUDM_glm-4-9b.json similarity index 100% rename from data/THUDM_glm-4-9b.json rename to data/models/THUDM_glm-4-9b.json diff --git a/data/TIGER-Lab_AceCodeRM-7B.json b/data/models/TIGER-Lab_AceCodeRM-7B.json similarity index 100% rename from data/TIGER-Lab_AceCodeRM-7B.json rename to data/models/TIGER-Lab_AceCodeRM-7B.json diff --git a/data/TIGER-Lab_AceCoder-Qwen2.5-7B-Ins-Rule.json b/data/models/TIGER-Lab_AceCoder-Qwen2.5-7B-Ins-Rule.json similarity index 100% rename from data/TIGER-Lab_AceCoder-Qwen2.5-7B-Ins-Rule.json rename to data/models/TIGER-Lab_AceCoder-Qwen2.5-7B-Ins-Rule.json diff --git a/data/TIGER-Lab_AceCoder-Qwen2.5-Coder-7B-Base-Rule.json b/data/models/TIGER-Lab_AceCoder-Qwen2.5-Coder-7B-Base-Rule.json similarity index 100% rename from data/TIGER-Lab_AceCoder-Qwen2.5-Coder-7B-Base-Rule.json rename to data/models/TIGER-Lab_AceCoder-Qwen2.5-Coder-7B-Base-Rule.json diff --git a/data/TIGER-Lab_AceCoder-Qwen2.5-Coder-7B-Ins-Rule.json b/data/models/TIGER-Lab_AceCoder-Qwen2.5-Coder-7B-Ins-Rule.json similarity index 100% rename from data/TIGER-Lab_AceCoder-Qwen2.5-Coder-7B-Ins-Rule.json rename to data/models/TIGER-Lab_AceCoder-Qwen2.5-Coder-7B-Ins-Rule.json diff --git a/data/TIGER-Lab_MAmmoTH2-7B-Plus.json b/data/models/TIGER-Lab_MAmmoTH2-7B-Plus.json similarity index 100% rename from data/TIGER-Lab_MAmmoTH2-7B-Plus.json rename to data/models/TIGER-Lab_MAmmoTH2-7B-Plus.json diff --git a/data/TIGER-Lab_Qwen2.5-Math-7B-CFT.json b/data/models/TIGER-Lab_Qwen2.5-Math-7B-CFT.json similarity index 100% rename from data/TIGER-Lab_Qwen2.5-Math-7B-CFT.json rename to data/models/TIGER-Lab_Qwen2.5-Math-7B-CFT.json diff --git a/data/TTTXXX01_Mistral-7B-Base-SimPO2-5e-7.json b/data/models/TTTXXX01_Mistral-7B-Base-SimPO2-5e-7.json similarity index 100% rename from data/TTTXXX01_Mistral-7B-Base-SimPO2-5e-7.json rename to data/models/TTTXXX01_Mistral-7B-Base-SimPO2-5e-7.json diff --git a/data/Tarek07_Progenitor-V1.1-LLaMa-70B.json b/data/models/Tarek07_Progenitor-V1.1-LLaMa-70B.json similarity index 100% rename from data/Tarek07_Progenitor-V1.1-LLaMa-70B.json rename to data/models/Tarek07_Progenitor-V1.1-LLaMa-70B.json diff --git a/data/Tarek07_Thalassic-Alpha-LLaMa-70B.json b/data/models/Tarek07_Thalassic-Alpha-LLaMa-70B.json similarity index 100% rename from data/Tarek07_Thalassic-Alpha-LLaMa-70B.json rename to data/models/Tarek07_Thalassic-Alpha-LLaMa-70B.json diff --git a/data/TeeZee_DoubleBagel-57B-v1.0.json b/data/models/TeeZee_DoubleBagel-57B-v1.0.json similarity index 100% rename from data/TeeZee_DoubleBagel-57B-v1.0.json rename to data/models/TeeZee_DoubleBagel-57B-v1.0.json diff --git a/data/Telugu-LLM-Labs_Indic-gemma-2b-finetuned-sft-Navarasa-2.0.json b/data/models/Telugu-LLM-Labs_Indic-gemma-2b-finetuned-sft-Navarasa-2.0.json similarity index 100% rename from data/Telugu-LLM-Labs_Indic-gemma-2b-finetuned-sft-Navarasa-2.0.json rename to data/models/Telugu-LLM-Labs_Indic-gemma-2b-finetuned-sft-Navarasa-2.0.json diff --git a/data/Telugu-LLM-Labs_Indic-gemma-7b-finetuned-sft-Navarasa-2.0.json b/data/models/Telugu-LLM-Labs_Indic-gemma-7b-finetuned-sft-Navarasa-2.0.json similarity index 100% rename from data/Telugu-LLM-Labs_Indic-gemma-7b-finetuned-sft-Navarasa-2.0.json rename to data/models/Telugu-LLM-Labs_Indic-gemma-7b-finetuned-sft-Navarasa-2.0.json diff --git a/data/TencentARC_LLaMA-Pro-8B-Instruct.json b/data/models/TencentARC_LLaMA-Pro-8B-Instruct.json similarity index 100% rename from data/TencentARC_LLaMA-Pro-8B-Instruct.json rename to data/models/TencentARC_LLaMA-Pro-8B-Instruct.json diff --git a/data/TencentARC_LLaMA-Pro-8B.json b/data/models/TencentARC_LLaMA-Pro-8B.json similarity index 100% rename from data/TencentARC_LLaMA-Pro-8B.json rename to data/models/TencentARC_LLaMA-Pro-8B.json diff --git a/data/TencentARC_MetaMath-Mistral-Pro.json b/data/models/TencentARC_MetaMath-Mistral-Pro.json similarity index 100% rename from data/TencentARC_MetaMath-Mistral-Pro.json rename to data/models/TencentARC_MetaMath-Mistral-Pro.json diff --git a/data/TencentARC_Mistral_Pro_8B_v0.1.json b/data/models/TencentARC_Mistral_Pro_8B_v0.1.json similarity index 100% rename from data/TencentARC_Mistral_Pro_8B_v0.1.json rename to data/models/TencentARC_Mistral_Pro_8B_v0.1.json diff --git a/data/TheDrummer_Cydonia-22B-v1.2.json b/data/models/TheDrummer_Cydonia-22B-v1.2.json similarity index 100% rename from data/TheDrummer_Cydonia-22B-v1.2.json rename to data/models/TheDrummer_Cydonia-22B-v1.2.json diff --git a/data/TheDrummer_Gemmasutra-9B-v1.json b/data/models/TheDrummer_Gemmasutra-9B-v1.json similarity index 100% rename from data/TheDrummer_Gemmasutra-9B-v1.json rename to data/models/TheDrummer_Gemmasutra-9B-v1.json diff --git a/data/TheDrummer_Gemmasutra-Mini-2B-v1.json b/data/models/TheDrummer_Gemmasutra-Mini-2B-v1.json similarity index 100% rename from data/TheDrummer_Gemmasutra-Mini-2B-v1.json rename to data/models/TheDrummer_Gemmasutra-Mini-2B-v1.json diff --git a/data/TheDrummer_Llama-3SOME-8B-v2.json b/data/models/TheDrummer_Llama-3SOME-8B-v2.json similarity index 100% rename from data/TheDrummer_Llama-3SOME-8B-v2.json rename to data/models/TheDrummer_Llama-3SOME-8B-v2.json diff --git a/data/TheDrummer_Ministrations-8B-v1.json b/data/models/TheDrummer_Ministrations-8B-v1.json similarity index 100% rename from data/TheDrummer_Ministrations-8B-v1.json rename to data/models/TheDrummer_Ministrations-8B-v1.json diff --git a/data/TheDrummer_Rocinante-12B-v1.json b/data/models/TheDrummer_Rocinante-12B-v1.json similarity index 100% rename from data/TheDrummer_Rocinante-12B-v1.json rename to data/models/TheDrummer_Rocinante-12B-v1.json diff --git a/data/TheDrummer_Tiger-Gemma-9B-v1.json b/data/models/TheDrummer_Tiger-Gemma-9B-v1.json similarity index 100% rename from data/TheDrummer_Tiger-Gemma-9B-v1.json rename to data/models/TheDrummer_Tiger-Gemma-9B-v1.json diff --git a/data/TheDrummer_Tiger-Gemma-9B-v2.json b/data/models/TheDrummer_Tiger-Gemma-9B-v2.json similarity index 100% rename from data/TheDrummer_Tiger-Gemma-9B-v2.json rename to data/models/TheDrummer_Tiger-Gemma-9B-v2.json diff --git a/data/TheDrummer_Tiger-Gemma-9B-v3.json b/data/models/TheDrummer_Tiger-Gemma-9B-v3.json similarity index 100% rename from data/TheDrummer_Tiger-Gemma-9B-v3.json rename to data/models/TheDrummer_Tiger-Gemma-9B-v3.json diff --git a/data/TheDrunkenSnail_Daughter-of-Rhodia-12B.json b/data/models/TheDrunkenSnail_Daughter-of-Rhodia-12B.json similarity index 100% rename from data/TheDrunkenSnail_Daughter-of-Rhodia-12B.json rename to data/models/TheDrunkenSnail_Daughter-of-Rhodia-12B.json diff --git a/data/TheDrunkenSnail_Mother-of-Rhodia-12B.json b/data/models/TheDrunkenSnail_Mother-of-Rhodia-12B.json similarity index 100% rename from data/TheDrunkenSnail_Mother-of-Rhodia-12B.json rename to data/models/TheDrunkenSnail_Mother-of-Rhodia-12B.json diff --git a/data/TheDrunkenSnail_Son-of-Rhodia.json b/data/models/TheDrunkenSnail_Son-of-Rhodia.json similarity index 100% rename from data/TheDrunkenSnail_Son-of-Rhodia.json rename to data/models/TheDrunkenSnail_Son-of-Rhodia.json diff --git a/data/TheHierophant_Underground-Cognitive-V0.3-test.json b/data/models/TheHierophant_Underground-Cognitive-V0.3-test.json similarity index 100% rename from data/TheHierophant_Underground-Cognitive-V0.3-test.json rename to data/models/TheHierophant_Underground-Cognitive-V0.3-test.json diff --git a/data/TheTsar1209_nemo-carpmuscle-v0.1.json b/data/models/TheTsar1209_nemo-carpmuscle-v0.1.json similarity index 100% rename from data/TheTsar1209_nemo-carpmuscle-v0.1.json rename to data/models/TheTsar1209_nemo-carpmuscle-v0.1.json diff --git a/data/TheTsar1209_qwen-carpmuscle-r-v0.3.json b/data/models/TheTsar1209_qwen-carpmuscle-r-v0.3.json similarity index 100% rename from data/TheTsar1209_qwen-carpmuscle-r-v0.3.json rename to data/models/TheTsar1209_qwen-carpmuscle-r-v0.3.json diff --git a/data/TheTsar1209_qwen-carpmuscle-v0.1.json b/data/models/TheTsar1209_qwen-carpmuscle-v0.1.json similarity index 100% rename from data/TheTsar1209_qwen-carpmuscle-v0.1.json rename to data/models/TheTsar1209_qwen-carpmuscle-v0.1.json diff --git a/data/TheTsar1209_qwen-carpmuscle-v0.2.json b/data/models/TheTsar1209_qwen-carpmuscle-v0.2.json similarity index 100% rename from data/TheTsar1209_qwen-carpmuscle-v0.2.json rename to data/models/TheTsar1209_qwen-carpmuscle-v0.2.json diff --git a/data/TheTsar1209_qwen-carpmuscle-v0.3.json b/data/models/TheTsar1209_qwen-carpmuscle-v0.3.json similarity index 100% rename from data/TheTsar1209_qwen-carpmuscle-v0.3.json rename to data/models/TheTsar1209_qwen-carpmuscle-v0.3.json diff --git a/data/TheTsar1209_qwen-carpmuscle-v0.4.1.json b/data/models/TheTsar1209_qwen-carpmuscle-v0.4.1.json similarity index 100% rename from data/TheTsar1209_qwen-carpmuscle-v0.4.1.json rename to data/models/TheTsar1209_qwen-carpmuscle-v0.4.1.json diff --git a/data/TheTsar1209_qwen-carpmuscle-v0.4.json b/data/models/TheTsar1209_qwen-carpmuscle-v0.4.json similarity index 100% rename from data/TheTsar1209_qwen-carpmuscle-v0.4.json rename to data/models/TheTsar1209_qwen-carpmuscle-v0.4.json diff --git a/data/Tijmen2_cosmosage-v3.json b/data/models/Tijmen2_cosmosage-v3.json similarity index 100% rename from data/Tijmen2_cosmosage-v3.json rename to data/models/Tijmen2_cosmosage-v3.json diff --git a/data/TinyLlama_TinyLlama-1.1B-Chat-v0.1.json b/data/models/TinyLlama_TinyLlama-1.1B-Chat-v0.1.json similarity index 100% rename from data/TinyLlama_TinyLlama-1.1B-Chat-v0.1.json rename to data/models/TinyLlama_TinyLlama-1.1B-Chat-v0.1.json diff --git a/data/TinyLlama_TinyLlama-1.1B-Chat-v0.5.json b/data/models/TinyLlama_TinyLlama-1.1B-Chat-v0.5.json similarity index 100% rename from data/TinyLlama_TinyLlama-1.1B-Chat-v0.5.json rename to data/models/TinyLlama_TinyLlama-1.1B-Chat-v0.5.json diff --git a/data/TinyLlama_TinyLlama-1.1B-Chat-v0.6.json b/data/models/TinyLlama_TinyLlama-1.1B-Chat-v0.6.json similarity index 100% rename from data/TinyLlama_TinyLlama-1.1B-Chat-v0.6.json rename to data/models/TinyLlama_TinyLlama-1.1B-Chat-v0.6.json diff --git a/data/TinyLlama_TinyLlama-1.1B-Chat-v1.0.json b/data/models/TinyLlama_TinyLlama-1.1B-Chat-v1.0.json similarity index 100% rename from data/TinyLlama_TinyLlama-1.1B-Chat-v1.0.json rename to data/models/TinyLlama_TinyLlama-1.1B-Chat-v1.0.json diff --git a/data/TinyLlama_TinyLlama-1.1B-intermediate-step-1431k-3T.json b/data/models/TinyLlama_TinyLlama-1.1B-intermediate-step-1431k-3T.json similarity index 100% rename from data/TinyLlama_TinyLlama-1.1B-intermediate-step-1431k-3T.json rename to data/models/TinyLlama_TinyLlama-1.1B-intermediate-step-1431k-3T.json diff --git a/data/TinyLlama_TinyLlama_v1.1.json b/data/models/TinyLlama_TinyLlama_v1.1.json similarity index 100% rename from data/TinyLlama_TinyLlama_v1.1.json rename to data/models/TinyLlama_TinyLlama_v1.1.json diff --git a/data/ToastyPigeon_Sto-vo-kor-12B.json b/data/models/ToastyPigeon_Sto-vo-kor-12B.json similarity index 100% rename from data/ToastyPigeon_Sto-vo-kor-12B.json rename to data/models/ToastyPigeon_Sto-vo-kor-12B.json diff --git a/data/Trappu_Magnum-Picaro-0.7-v2-12b.json b/data/models/Trappu_Magnum-Picaro-0.7-v2-12b.json similarity index 100% rename from data/Trappu_Magnum-Picaro-0.7-v2-12b.json rename to data/models/Trappu_Magnum-Picaro-0.7-v2-12b.json diff --git a/data/Trappu_Nemo-Picaro-12B.json b/data/models/Trappu_Nemo-Picaro-12B.json similarity index 100% rename from data/Trappu_Nemo-Picaro-12B.json rename to data/models/Trappu_Nemo-Picaro-12B.json diff --git a/data/Tremontaine_L3-12B-Lunaris-v1.json b/data/models/Tremontaine_L3-12B-Lunaris-v1.json similarity index 100% rename from data/Tremontaine_L3-12B-Lunaris-v1.json rename to data/models/Tremontaine_L3-12B-Lunaris-v1.json diff --git a/data/Triangle104_Annunaki-12b.json b/data/models/Triangle104_Annunaki-12b.json similarity index 100% rename from data/Triangle104_Annunaki-12b.json rename to data/models/Triangle104_Annunaki-12b.json diff --git a/data/Triangle104_BigTalker-Lite-8B.json b/data/models/Triangle104_BigTalker-Lite-8B.json similarity index 100% rename from data/Triangle104_BigTalker-Lite-8B.json rename to data/models/Triangle104_BigTalker-Lite-8B.json diff --git a/data/Triangle104_Chatty-Harry_V2.0.json b/data/models/Triangle104_Chatty-Harry_V2.0.json similarity index 100% rename from data/Triangle104_Chatty-Harry_V2.0.json rename to data/models/Triangle104_Chatty-Harry_V2.0.json diff --git a/data/Triangle104_Chatty-Harry_V3.0.json b/data/models/Triangle104_Chatty-Harry_V3.0.json similarity index 100% rename from data/Triangle104_Chatty-Harry_V3.0.json rename to data/models/Triangle104_Chatty-Harry_V3.0.json diff --git a/data/Triangle104_Chronos-Prism_V1.0.json b/data/models/Triangle104_Chronos-Prism_V1.0.json similarity index 100% rename from data/Triangle104_Chronos-Prism_V1.0.json rename to data/models/Triangle104_Chronos-Prism_V1.0.json diff --git a/data/Triangle104_DS-Distilled-Hermes-Llama-3.1.json b/data/models/Triangle104_DS-Distilled-Hermes-Llama-3.1.json similarity index 100% rename from data/Triangle104_DS-Distilled-Hermes-Llama-3.1.json rename to data/models/Triangle104_DS-Distilled-Hermes-Llama-3.1.json diff --git a/data/Triangle104_DS-Distilled-Hermes-Llama-3.1_TIES.json b/data/models/Triangle104_DS-Distilled-Hermes-Llama-3.1_TIES.json similarity index 100% rename from data/Triangle104_DS-Distilled-Hermes-Llama-3.1_TIES.json rename to data/models/Triangle104_DS-Distilled-Hermes-Llama-3.1_TIES.json diff --git a/data/Triangle104_DS-R1-Distill-Q2.5-10B-Harmony.json b/data/models/Triangle104_DS-R1-Distill-Q2.5-10B-Harmony.json similarity index 100% rename from data/Triangle104_DS-R1-Distill-Q2.5-10B-Harmony.json rename to data/models/Triangle104_DS-R1-Distill-Q2.5-10B-Harmony.json diff --git a/data/Triangle104_DS-R1-Distill-Q2.5-14B-Harmony_V0.1.json b/data/models/Triangle104_DS-R1-Distill-Q2.5-14B-Harmony_V0.1.json similarity index 100% rename from data/Triangle104_DS-R1-Distill-Q2.5-14B-Harmony_V0.1.json rename to data/models/Triangle104_DS-R1-Distill-Q2.5-14B-Harmony_V0.1.json diff --git a/data/Triangle104_DS-R1-Distill-Q2.5-7B-RP.json b/data/models/Triangle104_DS-R1-Distill-Q2.5-7B-RP.json similarity index 100% rename from data/Triangle104_DS-R1-Distill-Q2.5-7B-RP.json rename to data/models/Triangle104_DS-R1-Distill-Q2.5-7B-RP.json diff --git a/data/Triangle104_DS-R1-Llama-8B-Harmony.json b/data/models/Triangle104_DS-R1-Llama-8B-Harmony.json similarity index 100% rename from data/Triangle104_DS-R1-Llama-8B-Harmony.json rename to data/models/Triangle104_DS-R1-Llama-8B-Harmony.json diff --git a/data/Triangle104_DSR1-Distill-Llama-Lit-8B.json b/data/models/Triangle104_DSR1-Distill-Llama-Lit-8B.json similarity index 100% rename from data/Triangle104_DSR1-Distill-Llama-Lit-8B.json rename to data/models/Triangle104_DSR1-Distill-Llama-Lit-8B.json diff --git a/data/Triangle104_DSR1-Distill-Qwen-7B-RP.json b/data/models/Triangle104_DSR1-Distill-Qwen-7B-RP.json similarity index 100% rename from data/Triangle104_DSR1-Distill-Qwen-7B-RP.json rename to data/models/Triangle104_DSR1-Distill-Qwen-7B-RP.json diff --git a/data/Triangle104_Dark-Chivalry_V1.0.json b/data/models/Triangle104_Dark-Chivalry_V1.0.json similarity index 100% rename from data/Triangle104_Dark-Chivalry_V1.0.json rename to data/models/Triangle104_Dark-Chivalry_V1.0.json diff --git a/data/Triangle104_Distilled-DarkPlanet-Allades-8B.json b/data/models/Triangle104_Distilled-DarkPlanet-Allades-8B.json similarity index 100% rename from data/Triangle104_Distilled-DarkPlanet-Allades-8B.json rename to data/models/Triangle104_Distilled-DarkPlanet-Allades-8B.json diff --git a/data/Triangle104_Distilled-DarkPlanet-Allades-8B_TIES.json b/data/models/Triangle104_Distilled-DarkPlanet-Allades-8B_TIES.json similarity index 100% rename from data/Triangle104_Distilled-DarkPlanet-Allades-8B_TIES.json rename to data/models/Triangle104_Distilled-DarkPlanet-Allades-8B_TIES.json diff --git a/data/Triangle104_Distilled-Whiskey-8b.json b/data/models/Triangle104_Distilled-Whiskey-8b.json similarity index 100% rename from data/Triangle104_Distilled-Whiskey-8b.json rename to data/models/Triangle104_Distilled-Whiskey-8b.json diff --git a/data/Triangle104_Dolphin3-Llama3.2-Smart.json b/data/models/Triangle104_Dolphin3-Llama3.2-Smart.json similarity index 100% rename from data/Triangle104_Dolphin3-Llama3.2-Smart.json rename to data/models/Triangle104_Dolphin3-Llama3.2-Smart.json diff --git a/data/Triangle104_Gemmadevi-Stock-10B.json b/data/models/Triangle104_Gemmadevi-Stock-10B.json similarity index 100% rename from data/Triangle104_Gemmadevi-Stock-10B.json rename to data/models/Triangle104_Gemmadevi-Stock-10B.json diff --git a/data/Triangle104_Hermes-Llama-3.2-CoT-Summary.json b/data/models/Triangle104_Hermes-Llama-3.2-CoT-Summary.json similarity index 100% rename from data/Triangle104_Hermes-Llama-3.2-CoT-Summary.json rename to data/models/Triangle104_Hermes-Llama-3.2-CoT-Summary.json diff --git a/data/Triangle104_Hermes-Llama-3.2-CoT.json b/data/models/Triangle104_Hermes-Llama-3.2-CoT.json similarity index 100% rename from data/Triangle104_Hermes-Llama-3.2-CoT.json rename to data/models/Triangle104_Hermes-Llama-3.2-CoT.json diff --git a/data/Triangle104_Hermes3-L3.1-DirtyHarry-8B.json b/data/models/Triangle104_Hermes3-L3.1-DirtyHarry-8B.json similarity index 100% rename from data/Triangle104_Hermes3-L3.1-DirtyHarry-8B.json rename to data/models/Triangle104_Hermes3-L3.1-DirtyHarry-8B.json diff --git a/data/Triangle104_Herodotos-14B.json b/data/models/Triangle104_Herodotos-14B.json similarity index 100% rename from data/Triangle104_Herodotos-14B.json rename to data/models/Triangle104_Herodotos-14B.json diff --git a/data/Triangle104_Herodotos-14B_V0.1.json b/data/models/Triangle104_Herodotos-14B_V0.1.json similarity index 100% rename from data/Triangle104_Herodotos-14B_V0.1.json rename to data/models/Triangle104_Herodotos-14B_V0.1.json diff --git a/data/Triangle104_L3.1-8B-Dusky-Ink.json b/data/models/Triangle104_L3.1-8B-Dusky-Ink.json similarity index 100% rename from data/Triangle104_L3.1-8B-Dusky-Ink.json rename to data/models/Triangle104_L3.1-8B-Dusky-Ink.json diff --git a/data/Triangle104_L3.1-8B-Dusky-Ink_v0.r1.json b/data/models/Triangle104_L3.1-8B-Dusky-Ink_v0.r1.json similarity index 100% rename from data/Triangle104_L3.1-8B-Dusky-Ink_v0.r1.json rename to data/models/Triangle104_L3.1-8B-Dusky-Ink_v0.r1.json diff --git a/data/Triangle104_LThreePointOne-8B-HermesBlackroot.json b/data/models/Triangle104_LThreePointOne-8B-HermesBlackroot.json similarity index 100% rename from data/Triangle104_LThreePointOne-8B-HermesBlackroot.json rename to data/models/Triangle104_LThreePointOne-8B-HermesBlackroot.json diff --git a/data/Triangle104_LThreePointOne-8B-HermesInk.json b/data/models/Triangle104_LThreePointOne-8B-HermesInk.json similarity index 100% rename from data/Triangle104_LThreePointOne-8B-HermesInk.json rename to data/models/Triangle104_LThreePointOne-8B-HermesInk.json diff --git a/data/Triangle104_Llama3.1-Allades-Lit-8b.json b/data/models/Triangle104_Llama3.1-Allades-Lit-8b.json similarity index 100% rename from data/Triangle104_Llama3.1-Allades-Lit-8b.json rename to data/models/Triangle104_Llama3.1-Allades-Lit-8b.json diff --git a/data/Triangle104_Llama3.1-cc-Lit-8b.json b/data/models/Triangle104_Llama3.1-cc-Lit-8b.json similarity index 100% rename from data/Triangle104_Llama3.1-cc-Lit-8b.json rename to data/models/Triangle104_Llama3.1-cc-Lit-8b.json diff --git a/data/Triangle104_Minerva-1.5b.json b/data/models/Triangle104_Minerva-1.5b.json similarity index 100% rename from data/Triangle104_Minerva-1.5b.json rename to data/models/Triangle104_Minerva-1.5b.json diff --git a/data/Triangle104_Minerva-1.5b_V0.2.json b/data/models/Triangle104_Minerva-1.5b_V0.2.json similarity index 100% rename from data/Triangle104_Minerva-1.5b_V0.2.json rename to data/models/Triangle104_Minerva-1.5b_V0.2.json diff --git a/data/Triangle104_Minerva-10b.json b/data/models/Triangle104_Minerva-10b.json similarity index 100% rename from data/Triangle104_Minerva-10b.json rename to data/models/Triangle104_Minerva-10b.json diff --git a/data/Triangle104_Minerva-14b-V0.1.json b/data/models/Triangle104_Minerva-14b-V0.1.json similarity index 100% rename from data/Triangle104_Minerva-14b-V0.1.json rename to data/models/Triangle104_Minerva-14b-V0.1.json diff --git a/data/Triangle104_Minerva-14b.json b/data/models/Triangle104_Minerva-14b.json similarity index 100% rename from data/Triangle104_Minerva-14b.json rename to data/models/Triangle104_Minerva-14b.json diff --git a/data/Triangle104_Minerva-7b.json b/data/models/Triangle104_Minerva-7b.json similarity index 100% rename from data/Triangle104_Minerva-7b.json rename to data/models/Triangle104_Minerva-7b.json diff --git a/data/Triangle104_Minerva-8b.json b/data/models/Triangle104_Minerva-8b.json similarity index 100% rename from data/Triangle104_Minerva-8b.json rename to data/models/Triangle104_Minerva-8b.json diff --git a/data/Triangle104_Mistral-Redemption-Arc.json b/data/models/Triangle104_Mistral-Redemption-Arc.json similarity index 100% rename from data/Triangle104_Mistral-Redemption-Arc.json rename to data/models/Triangle104_Mistral-Redemption-Arc.json diff --git a/data/Triangle104_Mistral-Small-24b-Harmony.json b/data/models/Triangle104_Mistral-Small-24b-Harmony.json similarity index 100% rename from data/Triangle104_Mistral-Small-24b-Harmony.json rename to data/models/Triangle104_Mistral-Small-24b-Harmony.json diff --git a/data/Triangle104_Pans_Gutenbergum_V0.1.json b/data/models/Triangle104_Pans_Gutenbergum_V0.1.json similarity index 100% rename from data/Triangle104_Pans_Gutenbergum_V0.1.json rename to data/models/Triangle104_Pans_Gutenbergum_V0.1.json diff --git a/data/Triangle104_Pans_Gutenbergum_V0.2.json b/data/models/Triangle104_Pans_Gutenbergum_V0.2.json similarity index 100% rename from data/Triangle104_Pans_Gutenbergum_V0.2.json rename to data/models/Triangle104_Pans_Gutenbergum_V0.2.json diff --git a/data/Triangle104_Pantheon_ChatWaifu_V0.2.json b/data/models/Triangle104_Pantheon_ChatWaifu_V0.2.json similarity index 100% rename from data/Triangle104_Pantheon_ChatWaifu_V0.2.json rename to data/models/Triangle104_Pantheon_ChatWaifu_V0.2.json diff --git a/data/Triangle104_Phi-4-AbliteratedRP.json b/data/models/Triangle104_Phi-4-AbliteratedRP.json similarity index 100% rename from data/Triangle104_Phi-4-AbliteratedRP.json rename to data/models/Triangle104_Phi-4-AbliteratedRP.json diff --git a/data/Triangle104_Phi4-RP-o1-Ablit.json b/data/models/Triangle104_Phi4-RP-o1-Ablit.json similarity index 100% rename from data/Triangle104_Phi4-RP-o1-Ablit.json rename to data/models/Triangle104_Phi4-RP-o1-Ablit.json diff --git a/data/Triangle104_Phi4-RP-o1.json b/data/models/Triangle104_Phi4-RP-o1.json similarity index 100% rename from data/Triangle104_Phi4-RP-o1.json rename to data/models/Triangle104_Phi4-RP-o1.json diff --git a/data/Triangle104_Porpoise-R1-Llama3.2-3b.json b/data/models/Triangle104_Porpoise-R1-Llama3.2-3b.json similarity index 100% rename from data/Triangle104_Porpoise-R1-Llama3.2-3b.json rename to data/models/Triangle104_Porpoise-R1-Llama3.2-3b.json diff --git a/data/Triangle104_Q2.5-14B-Instruct-1M-Harmony.json b/data/models/Triangle104_Q2.5-14B-Instruct-1M-Harmony.json similarity index 100% rename from data/Triangle104_Q2.5-14B-Instruct-1M-Harmony.json rename to data/models/Triangle104_Q2.5-14B-Instruct-1M-Harmony.json diff --git a/data/Triangle104_Q2.5-AthensCOT.json b/data/models/Triangle104_Q2.5-AthensCOT.json similarity index 100% rename from data/Triangle104_Q2.5-AthensCOT.json rename to data/models/Triangle104_Q2.5-AthensCOT.json diff --git a/data/Triangle104_Q2.5-CodeR1-3B.json b/data/models/Triangle104_Q2.5-CodeR1-3B.json similarity index 100% rename from data/Triangle104_Q2.5-CodeR1-3B.json rename to data/models/Triangle104_Q2.5-CodeR1-3B.json diff --git a/data/Triangle104_Q2.5-EVACOT-7b.json b/data/models/Triangle104_Q2.5-EVACOT-7b.json similarity index 100% rename from data/Triangle104_Q2.5-EVACOT-7b.json rename to data/models/Triangle104_Q2.5-EVACOT-7b.json diff --git a/data/Triangle104_Q2.5-EvaHumane-RP.json b/data/models/Triangle104_Q2.5-EvaHumane-RP.json similarity index 100% rename from data/Triangle104_Q2.5-EvaHumane-RP.json rename to data/models/Triangle104_Q2.5-EvaHumane-RP.json diff --git a/data/Triangle104_Q2.5-Humane-RP.json b/data/models/Triangle104_Q2.5-Humane-RP.json similarity index 100% rename from data/Triangle104_Q2.5-Humane-RP.json rename to data/models/Triangle104_Q2.5-Humane-RP.json diff --git a/data/Triangle104_Q2.5-Instruct-1M_Harmony.json b/data/models/Triangle104_Q2.5-Instruct-1M_Harmony.json similarity index 100% rename from data/Triangle104_Q2.5-Instruct-1M_Harmony.json rename to data/models/Triangle104_Q2.5-Instruct-1M_Harmony.json diff --git a/data/Triangle104_Q2.5-R1-3B.json b/data/models/Triangle104_Q2.5-R1-3B.json similarity index 100% rename from data/Triangle104_Q2.5-R1-3B.json rename to data/models/Triangle104_Q2.5-R1-3B.json diff --git a/data/Triangle104_Q2.5-R1-7B.json b/data/models/Triangle104_Q2.5-R1-7B.json similarity index 100% rename from data/Triangle104_Q2.5-R1-7B.json rename to data/models/Triangle104_Q2.5-R1-7B.json diff --git a/data/Triangle104_Robo-Gutenberg_V1.0.json b/data/models/Triangle104_Robo-Gutenberg_V1.0.json similarity index 100% rename from data/Triangle104_Robo-Gutenberg_V1.0.json rename to data/models/Triangle104_Robo-Gutenberg_V1.0.json diff --git a/data/Triangle104_Rocinante-Prism_V2.0.json b/data/models/Triangle104_Rocinante-Prism_V2.0.json similarity index 100% rename from data/Triangle104_Rocinante-Prism_V2.0.json rename to data/models/Triangle104_Rocinante-Prism_V2.0.json diff --git a/data/Triangle104_Rocinante-Prism_V2.1.json b/data/models/Triangle104_Rocinante-Prism_V2.1.json similarity index 100% rename from data/Triangle104_Rocinante-Prism_V2.1.json rename to data/models/Triangle104_Rocinante-Prism_V2.1.json diff --git a/data/Triangle104_RomboHermes3-R1-Llama3.2-3b.json b/data/models/Triangle104_RomboHermes3-R1-Llama3.2-3b.json similarity index 100% rename from data/Triangle104_RomboHermes3-R1-Llama3.2-3b.json rename to data/models/Triangle104_RomboHermes3-R1-Llama3.2-3b.json diff --git a/data/Triangle104_Rombos-Novasky-7B_V1c.json b/data/models/Triangle104_Rombos-Novasky-7B_V1c.json similarity index 100% rename from data/Triangle104_Rombos-Novasky-7B_V1c.json rename to data/models/Triangle104_Rombos-Novasky-7B_V1c.json diff --git a/data/Triangle104_Set-70b.json b/data/models/Triangle104_Set-70b.json similarity index 100% rename from data/Triangle104_Set-70b.json rename to data/models/Triangle104_Set-70b.json diff --git a/data/Tsunami-th_Tsunami-0.5-7B-Instruct.json b/data/models/Tsunami-th_Tsunami-0.5-7B-Instruct.json similarity index 100% rename from data/Tsunami-th_Tsunami-0.5-7B-Instruct.json rename to data/models/Tsunami-th_Tsunami-0.5-7B-Instruct.json diff --git a/data/Tsunami-th_Tsunami-0.5x-7B-Instruct.json b/data/models/Tsunami-th_Tsunami-0.5x-7B-Instruct.json similarity index 100% rename from data/Tsunami-th_Tsunami-0.5x-7B-Instruct.json rename to data/models/Tsunami-th_Tsunami-0.5x-7B-Instruct.json diff --git a/data/Tsunami-th_Tsunami-1.0-14B-Instruct.json b/data/models/Tsunami-th_Tsunami-1.0-14B-Instruct.json similarity index 100% rename from data/Tsunami-th_Tsunami-1.0-14B-Instruct.json rename to data/models/Tsunami-th_Tsunami-1.0-14B-Instruct.json diff --git a/data/Tsunami-th_Tsunami-1.0-7B-Instruct.json b/data/models/Tsunami-th_Tsunami-1.0-7B-Instruct.json similarity index 100% rename from data/Tsunami-th_Tsunami-1.0-7B-Instruct.json rename to data/models/Tsunami-th_Tsunami-1.0-7B-Instruct.json diff --git a/data/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter1.json b/data/models/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter1.json similarity index 100% rename from data/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter1.json rename to data/models/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter1.json diff --git a/data/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter2.json b/data/models/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter2.json similarity index 100% rename from data/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter2.json rename to data/models/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter2.json diff --git a/data/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter3.json b/data/models/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter3.json similarity index 100% rename from data/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter3.json rename to data/models/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter3.json diff --git a/data/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter1.json b/data/models/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter1.json similarity index 100% rename from data/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter1.json rename to data/models/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter1.json diff --git a/data/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter2.json b/data/models/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter2.json similarity index 100% rename from data/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter2.json rename to data/models/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter2.json diff --git a/data/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter3.json b/data/models/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter3.json similarity index 100% rename from data/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter3.json rename to data/models/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter3.json diff --git a/data/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter1.json b/data/models/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter1.json similarity index 100% rename from data/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter1.json rename to data/models/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter1.json diff --git a/data/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter2.json b/data/models/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter2.json similarity index 100% rename from data/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter2.json rename to data/models/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter2.json diff --git a/data/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter3.json b/data/models/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter3.json similarity index 100% rename from data/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter3.json rename to data/models/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter3.json diff --git a/data/UCLA-AGI_Mistral7B-PairRM-SPPO.json b/data/models/UCLA-AGI_Mistral7B-PairRM-SPPO.json similarity index 100% rename from data/UCLA-AGI_Mistral7B-PairRM-SPPO.json rename to data/models/UCLA-AGI_Mistral7B-PairRM-SPPO.json diff --git a/data/UKzExecution_LlamaExecutor-8B-3.0.5.json b/data/models/UKzExecution_LlamaExecutor-8B-3.0.5.json similarity index 100% rename from data/UKzExecution_LlamaExecutor-8B-3.0.5.json rename to data/models/UKzExecution_LlamaExecutor-8B-3.0.5.json diff --git a/data/Unbabel_TowerInstruct-Mistral-7B-v0.2.json b/data/models/Unbabel_TowerInstruct-Mistral-7B-v0.2.json similarity index 100% rename from data/Unbabel_TowerInstruct-Mistral-7B-v0.2.json rename to data/models/Unbabel_TowerInstruct-Mistral-7B-v0.2.json diff --git a/data/Undi95_MG-FinalMix-72B.json b/data/models/Undi95_MG-FinalMix-72B.json similarity index 100% rename from data/Undi95_MG-FinalMix-72B.json rename to data/models/Undi95_MG-FinalMix-72B.json diff --git a/data/Undi95_Phi4-abliterated.json b/data/models/Undi95_Phi4-abliterated.json similarity index 100% rename from data/Undi95_Phi4-abliterated.json rename to data/models/Undi95_Phi4-abliterated.json diff --git a/data/V3N0M_Jenna-Tiny-2.0.json b/data/models/V3N0M_Jenna-Tiny-2.0.json similarity index 100% rename from data/V3N0M_Jenna-Tiny-2.0.json rename to data/models/V3N0M_Jenna-Tiny-2.0.json diff --git a/data/VAGOsolutions_Llama-3-SauerkrautLM-70b-Instruct.json b/data/models/VAGOsolutions_Llama-3-SauerkrautLM-70b-Instruct.json similarity index 100% rename from data/VAGOsolutions_Llama-3-SauerkrautLM-70b-Instruct.json rename to data/models/VAGOsolutions_Llama-3-SauerkrautLM-70b-Instruct.json diff --git a/data/VAGOsolutions_Llama-3-SauerkrautLM-8b-Instruct.json b/data/models/VAGOsolutions_Llama-3-SauerkrautLM-8b-Instruct.json similarity index 100% rename from data/VAGOsolutions_Llama-3-SauerkrautLM-8b-Instruct.json rename to data/models/VAGOsolutions_Llama-3-SauerkrautLM-8b-Instruct.json diff --git a/data/VAGOsolutions_Llama-3.1-SauerkrautLM-70b-Instruct.json b/data/models/VAGOsolutions_Llama-3.1-SauerkrautLM-70b-Instruct.json similarity index 100% rename from data/VAGOsolutions_Llama-3.1-SauerkrautLM-70b-Instruct.json rename to data/models/VAGOsolutions_Llama-3.1-SauerkrautLM-70b-Instruct.json diff --git a/data/VAGOsolutions_Llama-3.1-SauerkrautLM-8b-Instruct.json b/data/models/VAGOsolutions_Llama-3.1-SauerkrautLM-8b-Instruct.json similarity index 100% rename from data/VAGOsolutions_Llama-3.1-SauerkrautLM-8b-Instruct.json rename to data/models/VAGOsolutions_Llama-3.1-SauerkrautLM-8b-Instruct.json diff --git a/data/VAGOsolutions_SauerkrautLM-1.5b.json b/data/models/VAGOsolutions_SauerkrautLM-1.5b.json similarity index 100% rename from data/VAGOsolutions_SauerkrautLM-1.5b.json rename to data/models/VAGOsolutions_SauerkrautLM-1.5b.json diff --git a/data/VAGOsolutions_SauerkrautLM-7b-HerO.json b/data/models/VAGOsolutions_SauerkrautLM-7b-HerO.json similarity index 100% rename from data/VAGOsolutions_SauerkrautLM-7b-HerO.json rename to data/models/VAGOsolutions_SauerkrautLM-7b-HerO.json diff --git a/data/VAGOsolutions_SauerkrautLM-7b-LaserChat.json b/data/models/VAGOsolutions_SauerkrautLM-7b-LaserChat.json similarity index 100% rename from data/VAGOsolutions_SauerkrautLM-7b-LaserChat.json rename to data/models/VAGOsolutions_SauerkrautLM-7b-LaserChat.json diff --git a/data/VAGOsolutions_SauerkrautLM-Gemma-2b.json b/data/models/VAGOsolutions_SauerkrautLM-Gemma-2b.json similarity index 100% rename from data/VAGOsolutions_SauerkrautLM-Gemma-2b.json rename to data/models/VAGOsolutions_SauerkrautLM-Gemma-2b.json diff --git a/data/VAGOsolutions_SauerkrautLM-Gemma-7b.json b/data/models/VAGOsolutions_SauerkrautLM-Gemma-7b.json similarity index 100% rename from data/VAGOsolutions_SauerkrautLM-Gemma-7b.json rename to data/models/VAGOsolutions_SauerkrautLM-Gemma-7b.json diff --git a/data/VAGOsolutions_SauerkrautLM-Mixtral-8x7B-Instruct.json b/data/models/VAGOsolutions_SauerkrautLM-Mixtral-8x7B-Instruct.json similarity index 100% rename from data/VAGOsolutions_SauerkrautLM-Mixtral-8x7B-Instruct.json rename to data/models/VAGOsolutions_SauerkrautLM-Mixtral-8x7B-Instruct.json diff --git a/data/VAGOsolutions_SauerkrautLM-Nemo-12b-Instruct.json b/data/models/VAGOsolutions_SauerkrautLM-Nemo-12b-Instruct.json similarity index 100% rename from data/VAGOsolutions_SauerkrautLM-Nemo-12b-Instruct.json rename to data/models/VAGOsolutions_SauerkrautLM-Nemo-12b-Instruct.json diff --git a/data/VAGOsolutions_SauerkrautLM-Phi-3-medium.json b/data/models/VAGOsolutions_SauerkrautLM-Phi-3-medium.json similarity index 100% rename from data/VAGOsolutions_SauerkrautLM-Phi-3-medium.json rename to data/models/VAGOsolutions_SauerkrautLM-Phi-3-medium.json diff --git a/data/VAGOsolutions_SauerkrautLM-SOLAR-Instruct.json b/data/models/VAGOsolutions_SauerkrautLM-SOLAR-Instruct.json similarity index 100% rename from data/VAGOsolutions_SauerkrautLM-SOLAR-Instruct.json rename to data/models/VAGOsolutions_SauerkrautLM-SOLAR-Instruct.json diff --git a/data/VAGOsolutions_SauerkrautLM-gemma-2-2b-it.json b/data/models/VAGOsolutions_SauerkrautLM-gemma-2-2b-it.json similarity index 100% rename from data/VAGOsolutions_SauerkrautLM-gemma-2-2b-it.json rename to data/models/VAGOsolutions_SauerkrautLM-gemma-2-2b-it.json diff --git a/data/VAGOsolutions_SauerkrautLM-gemma-2-9b-it.json b/data/models/VAGOsolutions_SauerkrautLM-gemma-2-9b-it.json similarity index 100% rename from data/VAGOsolutions_SauerkrautLM-gemma-2-9b-it.json rename to data/models/VAGOsolutions_SauerkrautLM-gemma-2-9b-it.json diff --git a/data/VAGOsolutions_SauerkrautLM-v2-14b-DPO.json b/data/models/VAGOsolutions_SauerkrautLM-v2-14b-DPO.json similarity index 100% rename from data/VAGOsolutions_SauerkrautLM-v2-14b-DPO.json rename to data/models/VAGOsolutions_SauerkrautLM-v2-14b-DPO.json diff --git a/data/VAGOsolutions_SauerkrautLM-v2-14b-SFT.json b/data/models/VAGOsolutions_SauerkrautLM-v2-14b-SFT.json similarity index 100% rename from data/VAGOsolutions_SauerkrautLM-v2-14b-SFT.json rename to data/models/VAGOsolutions_SauerkrautLM-v2-14b-SFT.json diff --git a/data/VIRNECT_llama-3-Korean-8B-r-v-0.1.json b/data/models/VIRNECT_llama-3-Korean-8B-r-v-0.1.json similarity index 100% rename from data/VIRNECT_llama-3-Korean-8B-r-v-0.1.json rename to data/models/VIRNECT_llama-3-Korean-8B-r-v-0.1.json diff --git a/data/VIRNECT_llama-3-Korean-8B.json b/data/models/VIRNECT_llama-3-Korean-8B.json similarity index 99% rename from data/VIRNECT_llama-3-Korean-8B.json rename to data/models/VIRNECT_llama-3-Korean-8B.json index c6e488243be22383257d7ce7830645e7786a2e58..b477e70103838992147fd839878365ba09417099 100644 --- a/data/VIRNECT_llama-3-Korean-8B.json +++ b/data/models/VIRNECT_llama-3-Korean-8B.json @@ -5,7 +5,7 @@ "developer": "VIRNECT", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "LlamaForCausalLM", "params_billions": "8.03" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5058 + "score": 0.5021 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4908 + "score": 0.4918 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0929 + "score": 0.108 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3662 + "score": 0.3648 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3539 + "score": 0.3536 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5021 + "score": 0.5058 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4918 + "score": 0.4908 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.108 + "score": 0.0929 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3648 + "score": 0.3662 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3536 + "score": 0.3539 } } ], diff --git a/data/ValiantLabs_Llama3-70B-Fireplace.json b/data/models/ValiantLabs_Llama3-70B-Fireplace.json similarity index 100% rename from data/ValiantLabs_Llama3-70B-Fireplace.json rename to data/models/ValiantLabs_Llama3-70B-Fireplace.json diff --git a/data/ValiantLabs_Llama3-70B-ShiningValiant2.json b/data/models/ValiantLabs_Llama3-70B-ShiningValiant2.json similarity index 100% rename from data/ValiantLabs_Llama3-70B-ShiningValiant2.json rename to data/models/ValiantLabs_Llama3-70B-ShiningValiant2.json diff --git a/data/ValiantLabs_Llama3.1-70B-ShiningValiant2.json b/data/models/ValiantLabs_Llama3.1-70B-ShiningValiant2.json similarity index 100% rename from data/ValiantLabs_Llama3.1-70B-ShiningValiant2.json rename to data/models/ValiantLabs_Llama3.1-70B-ShiningValiant2.json diff --git a/data/ValiantLabs_Llama3.1-8B-Cobalt.json b/data/models/ValiantLabs_Llama3.1-8B-Cobalt.json similarity index 100% rename from data/ValiantLabs_Llama3.1-8B-Cobalt.json rename to data/models/ValiantLabs_Llama3.1-8B-Cobalt.json diff --git a/data/ValiantLabs_Llama3.1-8B-Enigma.json b/data/models/ValiantLabs_Llama3.1-8B-Enigma.json similarity index 100% rename from data/ValiantLabs_Llama3.1-8B-Enigma.json rename to data/models/ValiantLabs_Llama3.1-8B-Enigma.json diff --git a/data/ValiantLabs_Llama3.1-8B-Esper2.json b/data/models/ValiantLabs_Llama3.1-8B-Esper2.json similarity index 100% rename from data/ValiantLabs_Llama3.1-8B-Esper2.json rename to data/models/ValiantLabs_Llama3.1-8B-Esper2.json diff --git a/data/ValiantLabs_Llama3.1-8B-Fireplace2.json b/data/models/ValiantLabs_Llama3.1-8B-Fireplace2.json similarity index 99% rename from data/ValiantLabs_Llama3.1-8B-Fireplace2.json rename to data/models/ValiantLabs_Llama3.1-8B-Fireplace2.json index 83bfd16d6019777398f6524fe27b29626bcb5cc6..e5e5b50db78f27aee8bda2959572ee45b7414bbe 100644 --- a/data/ValiantLabs_Llama3.1-8B-Fireplace2.json +++ b/data/models/ValiantLabs_Llama3.1-8B-Fireplace2.json @@ -5,7 +5,7 @@ "developer": "ValiantLabs", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": "8.03" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5328 + "score": 0.5483 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4613 + "score": 0.461 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0876 + "score": 0.0582 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2894 + "score": 0.2886 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3367 + "score": 0.3433 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2424 + "score": 0.2407 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5483 + "score": 0.5328 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.461 + "score": 0.4613 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0582 + "score": 0.0876 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2886 + "score": 0.2894 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3433 + "score": 0.3367 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2407 + "score": 0.2424 } } ], diff --git a/data/ValiantLabs_Llama3.1-8B-ShiningValiant2.json b/data/models/ValiantLabs_Llama3.1-8B-ShiningValiant2.json similarity index 100% rename from data/ValiantLabs_Llama3.1-8B-ShiningValiant2.json rename to data/models/ValiantLabs_Llama3.1-8B-ShiningValiant2.json diff --git a/data/ValiantLabs_Llama3.2-3B-Enigma.json b/data/models/ValiantLabs_Llama3.2-3B-Enigma.json similarity index 100% rename from data/ValiantLabs_Llama3.2-3B-Enigma.json rename to data/models/ValiantLabs_Llama3.2-3B-Enigma.json diff --git a/data/ValiantLabs_Llama3.2-3B-Esper2.json b/data/models/ValiantLabs_Llama3.2-3B-Esper2.json similarity index 100% rename from data/ValiantLabs_Llama3.2-3B-Esper2.json rename to data/models/ValiantLabs_Llama3.2-3B-Esper2.json diff --git a/data/ValiantLabs_Llama3.2-3B-ShiningValiant2.json b/data/models/ValiantLabs_Llama3.2-3B-ShiningValiant2.json similarity index 100% rename from data/ValiantLabs_Llama3.2-3B-ShiningValiant2.json rename to data/models/ValiantLabs_Llama3.2-3B-ShiningValiant2.json diff --git a/data/Vikhrmodels_Vikhr-Llama3.1-8B-Instruct-R-21-09-24.json b/data/models/Vikhrmodels_Vikhr-Llama3.1-8B-Instruct-R-21-09-24.json similarity index 100% rename from data/Vikhrmodels_Vikhr-Llama3.1-8B-Instruct-R-21-09-24.json rename to data/models/Vikhrmodels_Vikhr-Llama3.1-8B-Instruct-R-21-09-24.json diff --git a/data/Vikhrmodels_Vikhr-Nemo-12B-Instruct-R-21-09-24.json b/data/models/Vikhrmodels_Vikhr-Nemo-12B-Instruct-R-21-09-24.json similarity index 100% rename from data/Vikhrmodels_Vikhr-Nemo-12B-Instruct-R-21-09-24.json rename to data/models/Vikhrmodels_Vikhr-Nemo-12B-Instruct-R-21-09-24.json diff --git a/data/Weyaxi_Bagel-Hermes-2x34B.json b/data/models/Weyaxi_Bagel-Hermes-2x34B.json similarity index 100% rename from data/Weyaxi_Bagel-Hermes-2x34B.json rename to data/models/Weyaxi_Bagel-Hermes-2x34B.json diff --git a/data/Weyaxi_Bagel-Hermes-34B-Slerp.json b/data/models/Weyaxi_Bagel-Hermes-34B-Slerp.json similarity index 100% rename from data/Weyaxi_Bagel-Hermes-34B-Slerp.json rename to data/models/Weyaxi_Bagel-Hermes-34B-Slerp.json diff --git a/data/Weyaxi_Einstein-v4-7B.json b/data/models/Weyaxi_Einstein-v4-7B.json similarity index 100% rename from data/Weyaxi_Einstein-v4-7B.json rename to data/models/Weyaxi_Einstein-v4-7B.json diff --git a/data/Weyaxi_Einstein-v6.1-Llama3-8B.json b/data/models/Weyaxi_Einstein-v6.1-Llama3-8B.json similarity index 100% rename from data/Weyaxi_Einstein-v6.1-Llama3-8B.json rename to data/models/Weyaxi_Einstein-v6.1-Llama3-8B.json diff --git a/data/Weyaxi_Einstein-v6.1-developed-by-Weyaxi-Llama3-8B.json b/data/models/Weyaxi_Einstein-v6.1-developed-by-Weyaxi-Llama3-8B.json similarity index 100% rename from data/Weyaxi_Einstein-v6.1-developed-by-Weyaxi-Llama3-8B.json rename to data/models/Weyaxi_Einstein-v6.1-developed-by-Weyaxi-Llama3-8B.json diff --git a/data/Weyaxi_Einstein-v7-Qwen2-7B.json b/data/models/Weyaxi_Einstein-v7-Qwen2-7B.json similarity index 100% rename from data/Weyaxi_Einstein-v7-Qwen2-7B.json rename to data/models/Weyaxi_Einstein-v7-Qwen2-7B.json diff --git a/data/Weyaxi_Einstein-v8-Llama3.2-1B.json b/data/models/Weyaxi_Einstein-v8-Llama3.2-1B.json similarity index 100% rename from data/Weyaxi_Einstein-v8-Llama3.2-1B.json rename to data/models/Weyaxi_Einstein-v8-Llama3.2-1B.json diff --git a/data/Weyaxi_SauerkrautLM-UNA-SOLAR-Instruct.json b/data/models/Weyaxi_SauerkrautLM-UNA-SOLAR-Instruct.json similarity index 100% rename from data/Weyaxi_SauerkrautLM-UNA-SOLAR-Instruct.json rename to data/models/Weyaxi_SauerkrautLM-UNA-SOLAR-Instruct.json diff --git a/data/WizardLMTeam_WizardLM-13B-V1.0.json b/data/models/WizardLMTeam_WizardLM-13B-V1.0.json similarity index 100% rename from data/WizardLMTeam_WizardLM-13B-V1.0.json rename to data/models/WizardLMTeam_WizardLM-13B-V1.0.json diff --git a/data/WizardLMTeam_WizardLM-13B-V1.2.json b/data/models/WizardLMTeam_WizardLM-13B-V1.2.json similarity index 100% rename from data/WizardLMTeam_WizardLM-13B-V1.2.json rename to data/models/WizardLMTeam_WizardLM-13B-V1.2.json diff --git a/data/WizardLMTeam_WizardLM-70B-V1.0.json b/data/models/WizardLMTeam_WizardLM-70B-V1.0.json similarity index 100% rename from data/WizardLMTeam_WizardLM-70B-V1.0.json rename to data/models/WizardLMTeam_WizardLM-70B-V1.0.json diff --git a/data/Wladastic_Mini-Think-Base-1B.json b/data/models/Wladastic_Mini-Think-Base-1B.json similarity index 100% rename from data/Wladastic_Mini-Think-Base-1B.json rename to data/models/Wladastic_Mini-Think-Base-1B.json diff --git a/data/Xclbr7_Arcanum-12b.json b/data/models/Xclbr7_Arcanum-12b.json similarity index 100% rename from data/Xclbr7_Arcanum-12b.json rename to data/models/Xclbr7_Arcanum-12b.json diff --git a/data/Xclbr7_Hyena-12b.json b/data/models/Xclbr7_Hyena-12b.json similarity index 100% rename from data/Xclbr7_Hyena-12b.json rename to data/models/Xclbr7_Hyena-12b.json diff --git a/data/Xclbr7_caliburn-12b.json b/data/models/Xclbr7_caliburn-12b.json similarity index 100% rename from data/Xclbr7_caliburn-12b.json rename to data/models/Xclbr7_caliburn-12b.json diff --git a/data/Xclbr7_caliburn-v2-12b.json b/data/models/Xclbr7_caliburn-v2-12b.json similarity index 100% rename from data/Xclbr7_caliburn-v2-12b.json rename to data/models/Xclbr7_caliburn-v2-12b.json diff --git a/data/Xiaojian9992024_Llama3.2-1B-THREADRIPPER-v0.2.json b/data/models/Xiaojian9992024_Llama3.2-1B-THREADRIPPER-v0.2.json similarity index 100% rename from data/Xiaojian9992024_Llama3.2-1B-THREADRIPPER-v0.2.json rename to data/models/Xiaojian9992024_Llama3.2-1B-THREADRIPPER-v0.2.json diff --git a/data/Xiaojian9992024_Llama3.2-1B-THREADRIPPER.json b/data/models/Xiaojian9992024_Llama3.2-1B-THREADRIPPER.json similarity index 100% rename from data/Xiaojian9992024_Llama3.2-1B-THREADRIPPER.json rename to data/models/Xiaojian9992024_Llama3.2-1B-THREADRIPPER.json diff --git a/data/Xiaojian9992024_Phi-4-Megatron-Empathetic.json b/data/models/Xiaojian9992024_Phi-4-Megatron-Empathetic.json similarity index 100% rename from data/Xiaojian9992024_Phi-4-Megatron-Empathetic.json rename to data/models/Xiaojian9992024_Phi-4-Megatron-Empathetic.json diff --git a/data/Xiaojian9992024_Phi-4-mini-UNOFFICAL.json b/data/models/Xiaojian9992024_Phi-4-mini-UNOFFICAL.json similarity index 100% rename from data/Xiaojian9992024_Phi-4-mini-UNOFFICAL.json rename to data/models/Xiaojian9992024_Phi-4-mini-UNOFFICAL.json diff --git a/data/Xiaojian9992024_Qwen2.5-7B-MS-Destroyer.json b/data/models/Xiaojian9992024_Qwen2.5-7B-MS-Destroyer.json similarity index 100% rename from data/Xiaojian9992024_Qwen2.5-7B-MS-Destroyer.json rename to data/models/Xiaojian9992024_Qwen2.5-7B-MS-Destroyer.json diff --git a/data/Xiaojian9992024_Qwen2.5-Dyanka-7B-Preview-v0.2.json b/data/models/Xiaojian9992024_Qwen2.5-Dyanka-7B-Preview-v0.2.json similarity index 100% rename from data/Xiaojian9992024_Qwen2.5-Dyanka-7B-Preview-v0.2.json rename to data/models/Xiaojian9992024_Qwen2.5-Dyanka-7B-Preview-v0.2.json diff --git a/data/Xiaojian9992024_Qwen2.5-Dyanka-7B-Preview.json b/data/models/Xiaojian9992024_Qwen2.5-Dyanka-7B-Preview.json similarity index 100% rename from data/Xiaojian9992024_Qwen2.5-Dyanka-7B-Preview.json rename to data/models/Xiaojian9992024_Qwen2.5-Dyanka-7B-Preview.json diff --git a/data/Xiaojian9992024_Qwen2.5-THREADRIPPER-Medium-Censored.json b/data/models/Xiaojian9992024_Qwen2.5-THREADRIPPER-Medium-Censored.json similarity index 100% rename from data/Xiaojian9992024_Qwen2.5-THREADRIPPER-Medium-Censored.json rename to data/models/Xiaojian9992024_Qwen2.5-THREADRIPPER-Medium-Censored.json diff --git a/data/Xiaojian9992024_Qwen2.5-THREADRIPPER-Small-AnniversaryEdition.json b/data/models/Xiaojian9992024_Qwen2.5-THREADRIPPER-Small-AnniversaryEdition.json similarity index 100% rename from data/Xiaojian9992024_Qwen2.5-THREADRIPPER-Small-AnniversaryEdition.json rename to data/models/Xiaojian9992024_Qwen2.5-THREADRIPPER-Small-AnniversaryEdition.json diff --git a/data/Xiaojian9992024_Qwen2.5-THREADRIPPER-Small.json b/data/models/Xiaojian9992024_Qwen2.5-THREADRIPPER-Small.json similarity index 100% rename from data/Xiaojian9992024_Qwen2.5-THREADRIPPER-Small.json rename to data/models/Xiaojian9992024_Qwen2.5-THREADRIPPER-Small.json diff --git a/data/Xiaojian9992024_Qwen2.5-Ultra-1.5B-25.02-Exp.json b/data/models/Xiaojian9992024_Qwen2.5-Ultra-1.5B-25.02-Exp.json similarity index 100% rename from data/Xiaojian9992024_Qwen2.5-Ultra-1.5B-25.02-Exp.json rename to data/models/Xiaojian9992024_Qwen2.5-Ultra-1.5B-25.02-Exp.json diff --git a/data/Xiaojian9992024_Reflection-L3.2-JametMiniMix-3B.json b/data/models/Xiaojian9992024_Reflection-L3.2-JametMiniMix-3B.json similarity index 100% rename from data/Xiaojian9992024_Reflection-L3.2-JametMiniMix-3B.json rename to data/models/Xiaojian9992024_Reflection-L3.2-JametMiniMix-3B.json diff --git a/data/Xkev_Llama-3.2V-11B-cot.json b/data/models/Xkev_Llama-3.2V-11B-cot.json similarity index 100% rename from data/Xkev_Llama-3.2V-11B-cot.json rename to data/models/Xkev_Llama-3.2V-11B-cot.json diff --git a/data/YOYO-AI_Qwen2.5-14B-1M-YOYO-V3.json b/data/models/YOYO-AI_Qwen2.5-14B-1M-YOYO-V3.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-1M-YOYO-V3.json rename to data/models/YOYO-AI_Qwen2.5-14B-1M-YOYO-V3.json diff --git a/data/YOYO-AI_Qwen2.5-14B-YOYO-0505.json b/data/models/YOYO-AI_Qwen2.5-14B-YOYO-0505.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-YOYO-0505.json rename to data/models/YOYO-AI_Qwen2.5-14B-YOYO-0505.json diff --git a/data/YOYO-AI_Qwen2.5-14B-YOYO-0510-v2.json b/data/models/YOYO-AI_Qwen2.5-14B-YOYO-0510-v2.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-YOYO-0510-v2.json rename to data/models/YOYO-AI_Qwen2.5-14B-YOYO-0510-v2.json diff --git a/data/YOYO-AI_Qwen2.5-14B-YOYO-0805.json b/data/models/YOYO-AI_Qwen2.5-14B-YOYO-0805.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-YOYO-0805.json rename to data/models/YOYO-AI_Qwen2.5-14B-YOYO-0805.json diff --git a/data/YOYO-AI_Qwen2.5-14B-YOYO-1005-v2.json b/data/models/YOYO-AI_Qwen2.5-14B-YOYO-1005-v2.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-YOYO-1005-v2.json rename to data/models/YOYO-AI_Qwen2.5-14B-YOYO-1005-v2.json diff --git a/data/YOYO-AI_Qwen2.5-14B-YOYO-1005.json b/data/models/YOYO-AI_Qwen2.5-14B-YOYO-1005.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-YOYO-1005.json rename to data/models/YOYO-AI_Qwen2.5-14B-YOYO-1005.json diff --git a/data/YOYO-AI_Qwen2.5-14B-YOYO-1010-v2.json b/data/models/YOYO-AI_Qwen2.5-14B-YOYO-1010-v2.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-YOYO-1010-v2.json rename to data/models/YOYO-AI_Qwen2.5-14B-YOYO-1010-v2.json diff --git a/data/YOYO-AI_Qwen2.5-14B-YOYO-1010.json b/data/models/YOYO-AI_Qwen2.5-14B-YOYO-1010.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-YOYO-1010.json rename to data/models/YOYO-AI_Qwen2.5-14B-YOYO-1010.json diff --git a/data/YOYO-AI_Qwen2.5-14B-YOYO-SCE.json b/data/models/YOYO-AI_Qwen2.5-14B-YOYO-SCE.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-YOYO-SCE.json rename to data/models/YOYO-AI_Qwen2.5-14B-YOYO-SCE.json diff --git a/data/YOYO-AI_Qwen2.5-14B-YOYO-V4-p1.json b/data/models/YOYO-AI_Qwen2.5-14B-YOYO-V4-p1.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-YOYO-V4-p1.json rename to data/models/YOYO-AI_Qwen2.5-14B-YOYO-V4-p1.json diff --git a/data/YOYO-AI_Qwen2.5-14B-YOYO-V4-p2.json b/data/models/YOYO-AI_Qwen2.5-14B-YOYO-V4-p2.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-YOYO-V4-p2.json rename to data/models/YOYO-AI_Qwen2.5-14B-YOYO-V4-p2.json diff --git a/data/YOYO-AI_Qwen2.5-14B-YOYO-V4.json b/data/models/YOYO-AI_Qwen2.5-14B-YOYO-V4.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-YOYO-V4.json rename to data/models/YOYO-AI_Qwen2.5-14B-YOYO-V4.json diff --git a/data/YOYO-AI_Qwen2.5-14B-YOYO-latest-V2.json b/data/models/YOYO-AI_Qwen2.5-14B-YOYO-latest-V2.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-YOYO-latest-V2.json rename to data/models/YOYO-AI_Qwen2.5-14B-YOYO-latest-V2.json diff --git a/data/YOYO-AI_Qwen2.5-14B-YOYO-latest.json b/data/models/YOYO-AI_Qwen2.5-14B-YOYO-latest.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-YOYO-latest.json rename to data/models/YOYO-AI_Qwen2.5-14B-YOYO-latest.json diff --git a/data/YOYO-AI_Qwen2.5-14B-it-restore.json b/data/models/YOYO-AI_Qwen2.5-14B-it-restore.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-14B-it-restore.json rename to data/models/YOYO-AI_Qwen2.5-14B-it-restore.json diff --git a/data/YOYO-AI_Qwen2.5-7B-it-restore.json b/data/models/YOYO-AI_Qwen2.5-7B-it-restore.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-7B-it-restore.json rename to data/models/YOYO-AI_Qwen2.5-7B-it-restore.json diff --git a/data/YOYO-AI_Qwen2.5-Coder-14B-YOYO-1010.json b/data/models/YOYO-AI_Qwen2.5-Coder-14B-YOYO-1010.json similarity index 100% rename from data/YOYO-AI_Qwen2.5-Coder-14B-YOYO-1010.json rename to data/models/YOYO-AI_Qwen2.5-Coder-14B-YOYO-1010.json diff --git a/data/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V2.json b/data/models/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V2.json similarity index 100% rename from data/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V2.json rename to data/models/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V2.json diff --git a/data/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V3.json b/data/models/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V3.json similarity index 100% rename from data/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V3.json rename to data/models/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V3.json diff --git a/data/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V4.json b/data/models/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V4.json similarity index 100% rename from data/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V4.json rename to data/models/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V4.json diff --git a/data/YOYO-AI_ZYH-LLM-Qwen2.5-14B.json b/data/models/YOYO-AI_ZYH-LLM-Qwen2.5-14B.json similarity index 100% rename from data/YOYO-AI_ZYH-LLM-Qwen2.5-14B.json rename to data/models/YOYO-AI_ZYH-LLM-Qwen2.5-14B.json diff --git a/data/Yash21_TinyYi-7B-Test.json b/data/models/Yash21_TinyYi-7B-Test.json similarity index 100% rename from data/Yash21_TinyYi-7B-Test.json rename to data/models/Yash21_TinyYi-7B-Test.json diff --git a/data/Youlln_1PARAMMYL-8B-ModelStock.json b/data/models/Youlln_1PARAMMYL-8B-ModelStock.json similarity index 100% rename from data/Youlln_1PARAMMYL-8B-ModelStock.json rename to data/models/Youlln_1PARAMMYL-8B-ModelStock.json diff --git a/data/Youlln_2PRYMMAL-Yi1.5-6B-SLERP.json b/data/models/Youlln_2PRYMMAL-Yi1.5-6B-SLERP.json similarity index 100% rename from data/Youlln_2PRYMMAL-Yi1.5-6B-SLERP.json rename to data/models/Youlln_2PRYMMAL-Yi1.5-6B-SLERP.json diff --git a/data/Youlln_3PRYMMAL-PHI3-3B-SLERP.json b/data/models/Youlln_3PRYMMAL-PHI3-3B-SLERP.json similarity index 100% rename from data/Youlln_3PRYMMAL-PHI3-3B-SLERP.json rename to data/models/Youlln_3PRYMMAL-PHI3-3B-SLERP.json diff --git a/data/Youlln_4PRYMMAL-GEMMA2-9B-SLERP.json b/data/models/Youlln_4PRYMMAL-GEMMA2-9B-SLERP.json similarity index 100% rename from data/Youlln_4PRYMMAL-GEMMA2-9B-SLERP.json rename to data/models/Youlln_4PRYMMAL-GEMMA2-9B-SLERP.json diff --git a/data/Youlln_ECE-MIRAGE-1-12B.json b/data/models/Youlln_ECE-MIRAGE-1-12B.json similarity index 100% rename from data/Youlln_ECE-MIRAGE-1-12B.json rename to data/models/Youlln_ECE-MIRAGE-1-12B.json diff --git a/data/Youlln_ECE-MIRAGE-1-15B.json b/data/models/Youlln_ECE-MIRAGE-1-15B.json similarity index 100% rename from data/Youlln_ECE-MIRAGE-1-15B.json rename to data/models/Youlln_ECE-MIRAGE-1-15B.json diff --git a/data/Youlln_ECE-PRYMMAL-0.5B-FT-V3-MUSR.json b/data/models/Youlln_ECE-PRYMMAL-0.5B-FT-V3-MUSR.json similarity index 100% rename from data/Youlln_ECE-PRYMMAL-0.5B-FT-V3-MUSR.json rename to data/models/Youlln_ECE-PRYMMAL-0.5B-FT-V3-MUSR.json diff --git a/data/Youlln_ECE-PRYMMAL-0.5B-FT-V3.json b/data/models/Youlln_ECE-PRYMMAL-0.5B-FT-V3.json similarity index 100% rename from data/Youlln_ECE-PRYMMAL-0.5B-FT-V3.json rename to data/models/Youlln_ECE-PRYMMAL-0.5B-FT-V3.json diff --git a/data/Youlln_ECE-PRYMMAL-0.5B-FT-V4-MUSR.json b/data/models/Youlln_ECE-PRYMMAL-0.5B-FT-V4-MUSR.json similarity index 100% rename from data/Youlln_ECE-PRYMMAL-0.5B-FT-V4-MUSR.json rename to data/models/Youlln_ECE-PRYMMAL-0.5B-FT-V4-MUSR.json diff --git a/data/Youlln_ECE-PRYMMAL-0.5B-SLERP-V2.json b/data/models/Youlln_ECE-PRYMMAL-0.5B-SLERP-V2.json similarity index 100% rename from data/Youlln_ECE-PRYMMAL-0.5B-SLERP-V2.json rename to data/models/Youlln_ECE-PRYMMAL-0.5B-SLERP-V2.json diff --git a/data/Youlln_ECE-PRYMMAL-0.5B-SLERP-V3.json b/data/models/Youlln_ECE-PRYMMAL-0.5B-SLERP-V3.json similarity index 100% rename from data/Youlln_ECE-PRYMMAL-0.5B-SLERP-V3.json rename to data/models/Youlln_ECE-PRYMMAL-0.5B-SLERP-V3.json diff --git a/data/Youlln_ECE-PRYMMAL-YL-1B-SLERP-V1.json b/data/models/Youlln_ECE-PRYMMAL-YL-1B-SLERP-V1.json similarity index 100% rename from data/Youlln_ECE-PRYMMAL-YL-1B-SLERP-V1.json rename to data/models/Youlln_ECE-PRYMMAL-YL-1B-SLERP-V1.json diff --git a/data/Youlln_ECE-PRYMMAL-YL-1B-SLERP-V2.json b/data/models/Youlln_ECE-PRYMMAL-YL-1B-SLERP-V2.json similarity index 100% rename from data/Youlln_ECE-PRYMMAL-YL-1B-SLERP-V2.json rename to data/models/Youlln_ECE-PRYMMAL-YL-1B-SLERP-V2.json diff --git a/data/Youlln_ECE-PRYMMAL-YL-7B-SLERP-V4.json b/data/models/Youlln_ECE-PRYMMAL-YL-7B-SLERP-V4.json similarity index 100% rename from data/Youlln_ECE-PRYMMAL-YL-7B-SLERP-V4.json rename to data/models/Youlln_ECE-PRYMMAL-YL-7B-SLERP-V4.json diff --git a/data/Youlln_ECE-PRYMMAL0.5-FT.json b/data/models/Youlln_ECE-PRYMMAL0.5-FT.json similarity index 100% rename from data/Youlln_ECE-PRYMMAL0.5-FT.json rename to data/models/Youlln_ECE-PRYMMAL0.5-FT.json diff --git a/data/Youlln_ECE-PRYMMAL0.5B-Youri.json b/data/models/Youlln_ECE-PRYMMAL0.5B-Youri.json similarity index 100% rename from data/Youlln_ECE-PRYMMAL0.5B-Youri.json rename to data/models/Youlln_ECE-PRYMMAL0.5B-Youri.json diff --git a/data/Youlln_ECE-PRYMMAL1B-FT-V1.json b/data/models/Youlln_ECE-PRYMMAL1B-FT-V1.json similarity index 100% rename from data/Youlln_ECE-PRYMMAL1B-FT-V1.json rename to data/models/Youlln_ECE-PRYMMAL1B-FT-V1.json diff --git a/data/Youlln_ECE-Qwen0.5B-FT-V2.json b/data/models/Youlln_ECE-Qwen0.5B-FT-V2.json similarity index 100% rename from data/Youlln_ECE-Qwen0.5B-FT-V2.json rename to data/models/Youlln_ECE-Qwen0.5B-FT-V2.json diff --git a/data/Youlln_ECE.EIFFEIL.ia-0.5B-SLERP.json b/data/models/Youlln_ECE.EIFFEIL.ia-0.5B-SLERP.json similarity index 100% rename from data/Youlln_ECE.EIFFEIL.ia-0.5B-SLERP.json rename to data/models/Youlln_ECE.EIFFEIL.ia-0.5B-SLERP.json diff --git a/data/YoungPanda_qwenqwen.json b/data/models/YoungPanda_qwenqwen.json similarity index 100% rename from data/YoungPanda_qwenqwen.json rename to data/models/YoungPanda_qwenqwen.json diff --git a/data/Yuma42_KangalKhan-RawRuby-7B.json b/data/models/Yuma42_KangalKhan-RawRuby-7B.json similarity index 100% rename from data/Yuma42_KangalKhan-RawRuby-7B.json rename to data/models/Yuma42_KangalKhan-RawRuby-7B.json diff --git a/data/Yuma42_Llama3.1-IgneousIguana-8B.json b/data/models/Yuma42_Llama3.1-IgneousIguana-8B.json similarity index 100% rename from data/Yuma42_Llama3.1-IgneousIguana-8B.json rename to data/models/Yuma42_Llama3.1-IgneousIguana-8B.json diff --git a/data/Yuma42_Llama3.1-SuperHawk-8B.json b/data/models/Yuma42_Llama3.1-SuperHawk-8B.json similarity index 100% rename from data/Yuma42_Llama3.1-SuperHawk-8B.json rename to data/models/Yuma42_Llama3.1-SuperHawk-8B.json diff --git a/data/Z1-Coder_Z1-Coder-7B.json b/data/models/Z1-Coder_Z1-Coder-7B.json similarity index 100% rename from data/Z1-Coder_Z1-Coder-7B.json rename to data/models/Z1-Coder_Z1-Coder-7B.json diff --git a/data/ZHLiu627_zephyr-7b-gemma-dpo-avg.json b/data/models/ZHLiu627_zephyr-7b-gemma-dpo-avg.json similarity index 100% rename from data/ZHLiu627_zephyr-7b-gemma-dpo-avg.json rename to data/models/ZHLiu627_zephyr-7b-gemma-dpo-avg.json diff --git a/data/ZHLiu627_zephyr-7b-gemma-rpo-avg.json b/data/models/ZHLiu627_zephyr-7b-gemma-rpo-avg.json similarity index 100% rename from data/ZHLiu627_zephyr-7b-gemma-rpo-avg.json rename to data/models/ZHLiu627_zephyr-7b-gemma-rpo-avg.json diff --git a/data/ZeroXClem_L3-Aspire-Heart-Matrix-8B.json b/data/models/ZeroXClem_L3-Aspire-Heart-Matrix-8B.json similarity index 100% rename from data/ZeroXClem_L3-Aspire-Heart-Matrix-8B.json rename to data/models/ZeroXClem_L3-Aspire-Heart-Matrix-8B.json diff --git a/data/ZeroXClem_Llama-3.1-8B-AthenaSky-MegaMix.json b/data/models/ZeroXClem_Llama-3.1-8B-AthenaSky-MegaMix.json similarity index 100% rename from data/ZeroXClem_Llama-3.1-8B-AthenaSky-MegaMix.json rename to data/models/ZeroXClem_Llama-3.1-8B-AthenaSky-MegaMix.json diff --git a/data/ZeroXClem_Llama-3.1-8B-RainbowLight-EtherealMix.json b/data/models/ZeroXClem_Llama-3.1-8B-RainbowLight-EtherealMix.json similarity index 100% rename from data/ZeroXClem_Llama-3.1-8B-RainbowLight-EtherealMix.json rename to data/models/ZeroXClem_Llama-3.1-8B-RainbowLight-EtherealMix.json diff --git a/data/ZeroXClem_Llama-3.1-8B-SpecialTitanFusion.json b/data/models/ZeroXClem_Llama-3.1-8B-SpecialTitanFusion.json similarity index 100% rename from data/ZeroXClem_Llama-3.1-8B-SpecialTitanFusion.json rename to data/models/ZeroXClem_Llama-3.1-8B-SpecialTitanFusion.json diff --git a/data/ZeroXClem_Llama-3.1-8B-SuperNova-EtherealHermes.json b/data/models/ZeroXClem_Llama-3.1-8B-SuperNova-EtherealHermes.json similarity index 100% rename from data/ZeroXClem_Llama-3.1-8B-SuperNova-EtherealHermes.json rename to data/models/ZeroXClem_Llama-3.1-8B-SuperNova-EtherealHermes.json diff --git a/data/ZeroXClem_Llama-3.1-8B-SuperTulu-LexiNova.json b/data/models/ZeroXClem_Llama-3.1-8B-SuperTulu-LexiNova.json similarity index 100% rename from data/ZeroXClem_Llama-3.1-8B-SuperTulu-LexiNova.json rename to data/models/ZeroXClem_Llama-3.1-8B-SuperTulu-LexiNova.json diff --git a/data/ZeroXClem_Qwen-2.5-Aether-SlerpFusion-7B.json b/data/models/ZeroXClem_Qwen-2.5-Aether-SlerpFusion-7B.json similarity index 100% rename from data/ZeroXClem_Qwen-2.5-Aether-SlerpFusion-7B.json rename to data/models/ZeroXClem_Qwen-2.5-Aether-SlerpFusion-7B.json diff --git a/data/ZeroXClem_Qwen2.5-7B-CelestialHarmony-1M.json b/data/models/ZeroXClem_Qwen2.5-7B-CelestialHarmony-1M.json similarity index 100% rename from data/ZeroXClem_Qwen2.5-7B-CelestialHarmony-1M.json rename to data/models/ZeroXClem_Qwen2.5-7B-CelestialHarmony-1M.json diff --git a/data/ZeroXClem_Qwen2.5-7B-HomerAnvita-NerdMix.json b/data/models/ZeroXClem_Qwen2.5-7B-HomerAnvita-NerdMix.json similarity index 100% rename from data/ZeroXClem_Qwen2.5-7B-HomerAnvita-NerdMix.json rename to data/models/ZeroXClem_Qwen2.5-7B-HomerAnvita-NerdMix.json diff --git a/data/ZeroXClem_Qwen2.5-7B-HomerCreative-Mix.json b/data/models/ZeroXClem_Qwen2.5-7B-HomerCreative-Mix.json similarity index 100% rename from data/ZeroXClem_Qwen2.5-7B-HomerCreative-Mix.json rename to data/models/ZeroXClem_Qwen2.5-7B-HomerCreative-Mix.json diff --git a/data/ZeroXClem_Qwen2.5-7B-Qandora-CySec.json b/data/models/ZeroXClem_Qwen2.5-7B-Qandora-CySec.json similarity index 100% rename from data/ZeroXClem_Qwen2.5-7B-Qandora-CySec.json rename to data/models/ZeroXClem_Qwen2.5-7B-Qandora-CySec.json diff --git a/data/ZeusLabs_L3-Aethora-15B-V2.json b/data/models/ZeusLabs_L3-Aethora-15B-V2.json similarity index 100% rename from data/ZeusLabs_L3-Aethora-15B-V2.json rename to data/models/ZeusLabs_L3-Aethora-15B-V2.json diff --git a/data/ZhangShenao_SELM-Llama-3-8B-Instruct-iter-3.json b/data/models/ZhangShenao_SELM-Llama-3-8B-Instruct-iter-3.json similarity index 100% rename from data/ZhangShenao_SELM-Llama-3-8B-Instruct-iter-3.json rename to data/models/ZhangShenao_SELM-Llama-3-8B-Instruct-iter-3.json diff --git a/data/ZiyiYe_Con-J-Qwen2-7B.json b/data/models/ZiyiYe_Con-J-Qwen2-7B.json similarity index 100% rename from data/ZiyiYe_Con-J-Qwen2-7B.json rename to data/models/ZiyiYe_Con-J-Qwen2-7B.json diff --git a/data/aaditya_Llama3-OpenBioLLM-70B.json b/data/models/aaditya_Llama3-OpenBioLLM-70B.json similarity index 100% rename from data/aaditya_Llama3-OpenBioLLM-70B.json rename to data/models/aaditya_Llama3-OpenBioLLM-70B.json diff --git a/data/abacusai_Dracarys-72B-Instruct.json b/data/models/abacusai_Dracarys-72B-Instruct.json similarity index 100% rename from data/abacusai_Dracarys-72B-Instruct.json rename to data/models/abacusai_Dracarys-72B-Instruct.json diff --git a/data/abacusai_Liberated-Qwen1.5-14B.json b/data/models/abacusai_Liberated-Qwen1.5-14B.json similarity index 100% rename from data/abacusai_Liberated-Qwen1.5-14B.json rename to data/models/abacusai_Liberated-Qwen1.5-14B.json diff --git a/data/abacusai_Llama-3-Smaug-8B.json b/data/models/abacusai_Llama-3-Smaug-8B.json similarity index 100% rename from data/abacusai_Llama-3-Smaug-8B.json rename to data/models/abacusai_Llama-3-Smaug-8B.json diff --git a/data/abacusai_Smaug-34B-v0.1.json b/data/models/abacusai_Smaug-34B-v0.1.json similarity index 100% rename from data/abacusai_Smaug-34B-v0.1.json rename to data/models/abacusai_Smaug-34B-v0.1.json diff --git a/data/abacusai_Smaug-72B-v0.1.json b/data/models/abacusai_Smaug-72B-v0.1.json similarity index 100% rename from data/abacusai_Smaug-72B-v0.1.json rename to data/models/abacusai_Smaug-72B-v0.1.json diff --git a/data/abacusai_Smaug-Llama-3-70B-Instruct-32K.json b/data/models/abacusai_Smaug-Llama-3-70B-Instruct-32K.json similarity index 100% rename from data/abacusai_Smaug-Llama-3-70B-Instruct-32K.json rename to data/models/abacusai_Smaug-Llama-3-70B-Instruct-32K.json diff --git a/data/abacusai_Smaug-Mixtral-v0.1.json b/data/models/abacusai_Smaug-Mixtral-v0.1.json similarity index 100% rename from data/abacusai_Smaug-Mixtral-v0.1.json rename to data/models/abacusai_Smaug-Mixtral-v0.1.json diff --git a/data/abacusai_Smaug-Qwen2-72B-Instruct.json b/data/models/abacusai_Smaug-Qwen2-72B-Instruct.json similarity index 100% rename from data/abacusai_Smaug-Qwen2-72B-Instruct.json rename to data/models/abacusai_Smaug-Qwen2-72B-Instruct.json diff --git a/data/abacusai_bigstral-12b-32k.json b/data/models/abacusai_bigstral-12b-32k.json similarity index 100% rename from data/abacusai_bigstral-12b-32k.json rename to data/models/abacusai_bigstral-12b-32k.json diff --git a/data/abacusai_bigyi-15b.json b/data/models/abacusai_bigyi-15b.json similarity index 100% rename from data/abacusai_bigyi-15b.json rename to data/models/abacusai_bigyi-15b.json diff --git a/data/abhishek_autotrain-0tmgq-5tpbg.json b/data/models/abhishek_autotrain-0tmgq-5tpbg.json similarity index 100% rename from data/abhishek_autotrain-0tmgq-5tpbg.json rename to data/models/abhishek_autotrain-0tmgq-5tpbg.json diff --git a/data/abhishek_autotrain-llama3-70b-orpo-v1.json b/data/models/abhishek_autotrain-llama3-70b-orpo-v1.json similarity index 100% rename from data/abhishek_autotrain-llama3-70b-orpo-v1.json rename to data/models/abhishek_autotrain-llama3-70b-orpo-v1.json diff --git a/data/abhishek_autotrain-llama3-70b-orpo-v2.json b/data/models/abhishek_autotrain-llama3-70b-orpo-v2.json similarity index 100% rename from data/abhishek_autotrain-llama3-70b-orpo-v2.json rename to data/models/abhishek_autotrain-llama3-70b-orpo-v2.json diff --git a/data/abhishek_autotrain-llama3-orpo-v2.json b/data/models/abhishek_autotrain-llama3-orpo-v2.json similarity index 100% rename from data/abhishek_autotrain-llama3-orpo-v2.json rename to data/models/abhishek_autotrain-llama3-orpo-v2.json diff --git a/data/abhishek_autotrain-vr4a1-e5mms.json b/data/models/abhishek_autotrain-vr4a1-e5mms.json similarity index 100% rename from data/abhishek_autotrain-vr4a1-e5mms.json rename to data/models/abhishek_autotrain-vr4a1-e5mms.json diff --git a/data/abideen_MedPhi-4-14B-v1.json b/data/models/abideen_MedPhi-4-14B-v1.json similarity index 100% rename from data/abideen_MedPhi-4-14B-v1.json rename to data/models/abideen_MedPhi-4-14B-v1.json diff --git a/data/adamo1139_Yi-34B-200K-AEZAKMI-v2.json b/data/models/adamo1139_Yi-34B-200K-AEZAKMI-v2.json similarity index 100% rename from data/adamo1139_Yi-34B-200K-AEZAKMI-v2.json rename to data/models/adamo1139_Yi-34B-200K-AEZAKMI-v2.json diff --git a/data/adriszmar_QAIMath-Qwen2.5-7B-TIES.json b/data/models/adriszmar_QAIMath-Qwen2.5-7B-TIES.json similarity index 99% rename from data/adriszmar_QAIMath-Qwen2.5-7B-TIES.json rename to data/models/adriszmar_QAIMath-Qwen2.5-7B-TIES.json index 580c6ab8916f244442dcb361908ba8e362770e37..c33319fca8a7a2c1761f8b03ebff7ebfa630dd25 100644 --- a/data/adriszmar_QAIMath-Qwen2.5-7B-TIES.json +++ b/data/models/adriszmar_QAIMath-Qwen2.5-7B-TIES.json @@ -5,7 +5,7 @@ "developer": "adriszmar", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "Qwen2ForCausalLM", "params_billions": "7.616" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1746 + "score": 0.1685 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3126 + "score": 0.3124 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.0015 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.245 + "score": 0.2492 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4096 + "score": 0.3963 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1087 + "score": 0.1066 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1685 + "score": 0.1746 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3124 + "score": 0.3126 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0015 + "score": 0.0 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2492 + "score": 0.245 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3963 + "score": 0.4096 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1066 + "score": 0.1087 } } ], diff --git a/data/aevalone_distill_qw_test.json b/data/models/aevalone_distill_qw_test.json similarity index 100% rename from data/aevalone_distill_qw_test.json rename to data/models/aevalone_distill_qw_test.json diff --git a/data/agentlans_Gemma2-9B-AdvancedFuse.json b/data/models/agentlans_Gemma2-9B-AdvancedFuse.json similarity index 100% rename from data/agentlans_Gemma2-9B-AdvancedFuse.json rename to data/models/agentlans_Gemma2-9B-AdvancedFuse.json diff --git a/data/agentlans_Llama-3.2-1B-Instruct-CrashCourse12K.json b/data/models/agentlans_Llama-3.2-1B-Instruct-CrashCourse12K.json similarity index 100% rename from data/agentlans_Llama-3.2-1B-Instruct-CrashCourse12K.json rename to data/models/agentlans_Llama-3.2-1B-Instruct-CrashCourse12K.json diff --git a/data/agentlans_Llama3.1-8B-drill.json b/data/models/agentlans_Llama3.1-8B-drill.json similarity index 100% rename from data/agentlans_Llama3.1-8B-drill.json rename to data/models/agentlans_Llama3.1-8B-drill.json diff --git a/data/agentlans_Llama3.1-Daredevilish-Instruct.json b/data/models/agentlans_Llama3.1-Daredevilish-Instruct.json similarity index 100% rename from data/agentlans_Llama3.1-Daredevilish-Instruct.json rename to data/models/agentlans_Llama3.1-Daredevilish-Instruct.json diff --git a/data/agentlans_Llama3.1-Daredevilish.json b/data/models/agentlans_Llama3.1-Daredevilish.json similarity index 100% rename from data/agentlans_Llama3.1-Daredevilish.json rename to data/models/agentlans_Llama3.1-Daredevilish.json diff --git a/data/agentlans_Llama3.1-LexiHermes-SuperStorm.json b/data/models/agentlans_Llama3.1-LexiHermes-SuperStorm.json similarity index 100% rename from data/agentlans_Llama3.1-LexiHermes-SuperStorm.json rename to data/models/agentlans_Llama3.1-LexiHermes-SuperStorm.json diff --git a/data/agentlans_Llama3.1-SuperDeepFuse-CrashCourse12K.json b/data/models/agentlans_Llama3.1-SuperDeepFuse-CrashCourse12K.json similarity index 100% rename from data/agentlans_Llama3.1-SuperDeepFuse-CrashCourse12K.json rename to data/models/agentlans_Llama3.1-SuperDeepFuse-CrashCourse12K.json diff --git a/data/agentlans_Llama3.1-SuperDeepFuse.json b/data/models/agentlans_Llama3.1-SuperDeepFuse.json similarity index 100% rename from data/agentlans_Llama3.1-SuperDeepFuse.json rename to data/models/agentlans_Llama3.1-SuperDeepFuse.json diff --git a/data/agentlans_Qwen2.5-0.5B-Instruct-CrashCourse-dropout.json b/data/models/agentlans_Qwen2.5-0.5B-Instruct-CrashCourse-dropout.json similarity index 100% rename from data/agentlans_Qwen2.5-0.5B-Instruct-CrashCourse-dropout.json rename to data/models/agentlans_Qwen2.5-0.5B-Instruct-CrashCourse-dropout.json diff --git a/data/ahmeda335_13_outOf_32_pruned_layers_llama3.1-8b.json b/data/models/ahmeda335_13_outOf_32_pruned_layers_llama3.1-8b.json similarity index 100% rename from data/ahmeda335_13_outOf_32_pruned_layers_llama3.1-8b.json rename to data/models/ahmeda335_13_outOf_32_pruned_layers_llama3.1-8b.json diff --git a/data/ai21_J1-Grande-v1-17B.json b/data/models/ai21_J1-Grande-v1-17B.json similarity index 100% rename from data/ai21_J1-Grande-v1-17B.json rename to data/models/ai21_J1-Grande-v1-17B.json diff --git a/data/ai21_J1-Grande-v2-beta-17B.json b/data/models/ai21_J1-Grande-v2-beta-17B.json similarity index 100% rename from data/ai21_J1-Grande-v2-beta-17B.json rename to data/models/ai21_J1-Grande-v2-beta-17B.json diff --git a/data/ai21_J1-Jumbo-v1-178B.json b/data/models/ai21_J1-Jumbo-v1-178B.json similarity index 100% rename from data/ai21_J1-Jumbo-v1-178B.json rename to data/models/ai21_J1-Jumbo-v1-178B.json diff --git a/data/ai21_J1-Large-v1-7.5B.json b/data/models/ai21_J1-Large-v1-7.5B.json similarity index 100% rename from data/ai21_J1-Large-v1-7.5B.json rename to data/models/ai21_J1-Large-v1-7.5B.json diff --git a/data/ai21_Jurassic-2-Grande-17B.json b/data/models/ai21_Jurassic-2-Grande-17B.json similarity index 100% rename from data/ai21_Jurassic-2-Grande-17B.json rename to data/models/ai21_Jurassic-2-Grande-17B.json diff --git a/data/ai21_Jurassic-2-Jumbo-178B.json b/data/models/ai21_Jurassic-2-Jumbo-178B.json similarity index 100% rename from data/ai21_Jurassic-2-Jumbo-178B.json rename to data/models/ai21_Jurassic-2-Jumbo-178B.json diff --git a/data/ai21_Jurassic-2-Large-7.5B.json b/data/models/ai21_Jurassic-2-Large-7.5B.json similarity index 100% rename from data/ai21_Jurassic-2-Large-7.5B.json rename to data/models/ai21_Jurassic-2-Large-7.5B.json diff --git a/data/ai21_j2-grande.json b/data/models/ai21_j2-grande.json similarity index 100% rename from data/ai21_j2-grande.json rename to data/models/ai21_j2-grande.json diff --git a/data/ai21_j2-jumbo.json b/data/models/ai21_j2-jumbo.json similarity index 100% rename from data/ai21_j2-jumbo.json rename to data/models/ai21_j2-jumbo.json diff --git a/data/ai21_jamba-1.5-large.json b/data/models/ai21_jamba-1.5-large.json similarity index 100% rename from data/ai21_jamba-1.5-large.json rename to data/models/ai21_jamba-1.5-large.json diff --git a/data/ai21_jamba-1.5-mini.json b/data/models/ai21_jamba-1.5-mini.json similarity index 100% rename from data/ai21_jamba-1.5-mini.json rename to data/models/ai21_jamba-1.5-mini.json diff --git a/data/ai21_jamba-instruct.json b/data/models/ai21_jamba-instruct.json similarity index 100% rename from data/ai21_jamba-instruct.json rename to data/models/ai21_jamba-instruct.json diff --git a/data/ai21labs_Jamba-v0.1.json b/data/models/ai21labs_Jamba-v0.1.json similarity index 100% rename from data/ai21labs_Jamba-v0.1.json rename to data/models/ai21labs_Jamba-v0.1.json diff --git a/data/ai2_llama-2-chat-7b-nectar-3.8m.json.json b/data/models/ai2_llama-2-chat-7b-nectar-3.8m.json.json similarity index 100% rename from data/ai2_llama-2-chat-7b-nectar-3.8m.json.json rename to data/models/ai2_llama-2-chat-7b-nectar-3.8m.json.json diff --git a/data/ai2_llama-2-chat-nectar-180k.json.json b/data/models/ai2_llama-2-chat-nectar-180k.json.json similarity index 100% rename from data/ai2_llama-2-chat-nectar-180k.json.json rename to data/models/ai2_llama-2-chat-nectar-180k.json.json diff --git a/data/ai2_llama-2-chat-ultrafeedback-60k.jsonl.json b/data/models/ai2_llama-2-chat-ultrafeedback-60k.jsonl.json similarity index 100% rename from data/ai2_llama-2-chat-ultrafeedback-60k.jsonl.json rename to data/models/ai2_llama-2-chat-ultrafeedback-60k.jsonl.json diff --git a/data/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check....json b/data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check....json similarity index 100% rename from data/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check....json rename to data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check....json index 27dd1984f5ffba8bf83974186dd69d87a7e7f3ca..a96a6a2c58b30640ee0de43b8043a05f02c8d033 100644 --- a/data/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check....json +++ b/data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check....json @@ -38,7 +38,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7008 + "score": 0.7058 }, "source_data": { "dataset_name": "RewardBench", @@ -56,7 +56,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.9385 + "score": 0.9525 }, "source_data": { "dataset_name": "RewardBench", @@ -74,7 +74,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3882 + "score": 0.3947 }, "source_data": { "dataset_name": "RewardBench", @@ -92,7 +92,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7757 + "score": 0.7703 }, "source_data": { "dataset_name": "RewardBench", @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6895 + "score": 0.7004 }, "source_data": { "dataset_name": "RewardBench", @@ -152,7 +152,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.9385 + "score": 0.9413 }, "source_data": { "dataset_name": "RewardBench", @@ -170,7 +170,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3706 + "score": 0.3882 }, "source_data": { "dataset_name": "RewardBench", @@ -188,7 +188,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7595 + "score": 0.7716 }, "source_data": { "dataset_name": "RewardBench", @@ -422,7 +422,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6808 + "score": 0.7008 }, "source_data": { "dataset_name": "RewardBench", @@ -440,7 +440,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.9302 + "score": 0.9385 }, "source_data": { "dataset_name": "RewardBench", @@ -458,7 +458,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3596 + "score": 0.3882 }, "source_data": { "dataset_name": "RewardBench", @@ -476,7 +476,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7527 + "score": 0.7757 }, "source_data": { "dataset_name": "RewardBench", @@ -614,7 +614,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7058 + "score": 0.6808 }, "source_data": { "dataset_name": "RewardBench", @@ -632,7 +632,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.9525 + "score": 0.9302 }, "source_data": { "dataset_name": "RewardBench", @@ -650,7 +650,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3947 + "score": 0.3596 }, "source_data": { "dataset_name": "RewardBench", @@ -668,7 +668,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7703 + "score": 0.7527 }, "source_data": { "dataset_name": "RewardBench", @@ -806,7 +806,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7004 + "score": 0.6895 }, "source_data": { "dataset_name": "RewardBench", @@ -824,7 +824,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.9413 + "score": 0.9385 }, "source_data": { "dataset_name": "RewardBench", @@ -842,7 +842,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3882 + "score": 0.3706 }, "source_data": { "dataset_name": "RewardBench", @@ -860,7 +860,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7716 + "score": 0.7595 }, "source_data": { "dataset_name": "RewardBench", diff --git a/data/ai2_tulu-2-7b-rm-v0-nectar-binarized-700k.json.json b/data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized-700k.json.json similarity index 100% rename from data/ai2_tulu-2-7b-rm-v0-nectar-binarized-700k.json.json rename to data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized-700k.json.json diff --git a/data/ai2_tulu-2-7b-rm-v0-nectar-binarized.json.json b/data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized.json.json similarity index 100% rename from data/ai2_tulu-2-7b-rm-v0-nectar-binarized.json.json rename to data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized.json.json diff --git a/data/ai2_tulu-2-7b-rm-v0.json.json b/data/models/ai2_tulu-2-7b-rm-v0.json.json similarity index 100% rename from data/ai2_tulu-2-7b-rm-v0.json.json rename to data/models/ai2_tulu-2-7b-rm-v0.json.json diff --git a/data/ai4bharat_Airavata.json b/data/models/ai4bharat_Airavata.json similarity index 100% rename from data/ai4bharat_Airavata.json rename to data/models/ai4bharat_Airavata.json diff --git a/data/aixonlab_Aether-12b.json b/data/models/aixonlab_Aether-12b.json similarity index 100% rename from data/aixonlab_Aether-12b.json rename to data/models/aixonlab_Aether-12b.json diff --git a/data/aixonlab_Grey-12b.json b/data/models/aixonlab_Grey-12b.json similarity index 100% rename from data/aixonlab_Grey-12b.json rename to data/models/aixonlab_Grey-12b.json diff --git a/data/aixonlab_Zara-14b-v1.2.json b/data/models/aixonlab_Zara-14b-v1.2.json similarity index 100% rename from data/aixonlab_Zara-14b-v1.2.json rename to data/models/aixonlab_Zara-14b-v1.2.json diff --git a/data/akhadangi_Llama3.2.1B.0.01-First.json b/data/models/akhadangi_Llama3.2.1B.0.01-First.json similarity index 100% rename from data/akhadangi_Llama3.2.1B.0.01-First.json rename to data/models/akhadangi_Llama3.2.1B.0.01-First.json diff --git a/data/akhadangi_Llama3.2.1B.0.01-Last.json b/data/models/akhadangi_Llama3.2.1B.0.01-Last.json similarity index 100% rename from data/akhadangi_Llama3.2.1B.0.01-Last.json rename to data/models/akhadangi_Llama3.2.1B.0.01-Last.json diff --git a/data/akhadangi_Llama3.2.1B.0.1-First.json b/data/models/akhadangi_Llama3.2.1B.0.1-First.json similarity index 100% rename from data/akhadangi_Llama3.2.1B.0.1-First.json rename to data/models/akhadangi_Llama3.2.1B.0.1-First.json diff --git a/data/akhadangi_Llama3.2.1B.0.1-Last.json b/data/models/akhadangi_Llama3.2.1B.0.1-Last.json similarity index 100% rename from data/akhadangi_Llama3.2.1B.0.1-Last.json rename to data/models/akhadangi_Llama3.2.1B.0.1-Last.json diff --git a/data/akhadangi_Llama3.2.1B.BaseFiT.json b/data/models/akhadangi_Llama3.2.1B.BaseFiT.json similarity index 100% rename from data/akhadangi_Llama3.2.1B.BaseFiT.json rename to data/models/akhadangi_Llama3.2.1B.BaseFiT.json diff --git a/data/akjindal53244_Llama-3.1-Storm-8B.json b/data/models/akjindal53244_Llama-3.1-Storm-8B.json similarity index 99% rename from data/akjindal53244_Llama-3.1-Storm-8B.json rename to data/models/akjindal53244_Llama-3.1-Storm-8B.json index e41c0a8078b50d10a0b68211949112ef13277f22..40e662764779e61971d6e759ef0343947ee329fc 100644 --- a/data/akjindal53244_Llama-3.1-Storm-8B.json +++ b/data/models/akjindal53244_Llama-3.1-Storm-8B.json @@ -5,7 +5,7 @@ "developer": "akjindal53244", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "LlamaForCausalLM", "params_billions": "8.03" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.8051 + "score": 0.8033 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5189 + "score": 0.5196 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1722 + "score": 0.1624 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3263 + "score": 0.3096 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3803 + "score": 0.3812 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.8033 + "score": 0.8051 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5196 + "score": 0.5189 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1624 + "score": 0.1722 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3096 + "score": 0.3263 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3812 + "score": 0.3803 } } ], diff --git a/data/alcholjung_llama3_medical_tuned.json b/data/models/alcholjung_llama3_medical_tuned.json similarity index 100% rename from data/alcholjung_llama3_medical_tuned.json rename to data/models/alcholjung_llama3_medical_tuned.json diff --git a/data/aleph-alpha_Luminous-Base-13B.json b/data/models/aleph-alpha_Luminous-Base-13B.json similarity index 100% rename from data/aleph-alpha_Luminous-Base-13B.json rename to data/models/aleph-alpha_Luminous-Base-13B.json diff --git a/data/aleph-alpha_Luminous-Extended-30B.json b/data/models/aleph-alpha_Luminous-Extended-30B.json similarity index 100% rename from data/aleph-alpha_Luminous-Extended-30B.json rename to data/models/aleph-alpha_Luminous-Extended-30B.json diff --git a/data/aleph-alpha_Luminous-Supreme-70B.json b/data/models/aleph-alpha_Luminous-Supreme-70B.json similarity index 100% rename from data/aleph-alpha_Luminous-Supreme-70B.json rename to data/models/aleph-alpha_Luminous-Supreme-70B.json diff --git a/data/alibaba_qwen-3-coder-480b.json b/data/models/alibaba_qwen-3-coder-480b.json similarity index 98% rename from data/alibaba_qwen-3-coder-480b.json rename to data/models/alibaba_qwen-3-coder-480b.json index 2f8005671f8081ba0cce98ac5f838a5df7b09295..f950751e93e00355bc7f7e7d39efd2dbbec7d40a 100644 --- a/data/alibaba_qwen-3-coder-480b.json +++ b/data/models/alibaba_qwen-3-coder-480b.json @@ -4,13 +4,13 @@ "id": "alibaba/qwen-3-coder-480b", "developer": "Alibaba", "additional_details": { - "agent_name": "OpenHands", - "agent_organization": "OpenHands" + "agent_name": "Terminus 2", + "agent_organization": "Terminal Bench" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/dakou-agent__qwen-3-coder-480b/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__qwen-3-coder-480b/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-28", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,7 +43,7 @@ "max_score": 100.0 }, "score_details": { - "score": 27.2, + "score": 25.4, "uncertainty": { "standard_error": { "value": 2.6 @@ -53,7 +53,7 @@ }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Dakou Agent\" -m \"Qwen 3 Coder 480B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Qwen 3 Coder 480B\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Dakou Agent\" -m \"Qwen 3 Coder 480B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Qwen 3 Coder 480B\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__qwen-3-coder-480b/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/dakou-agent__qwen-3-coder-480b/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-01", + "evaluation_timestamp": "2025-12-28", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 23.9, + "score": 27.2, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.6 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Qwen 3 Coder 480B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Dakou Agent\" -m \"Qwen 3 Coder 480B\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Qwen 3 Coder 480B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Dakou Agent\" -m \"Qwen 3 Coder 480B\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/openhands__qwen-3-coder-480b/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__qwen-3-coder-480b/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-11-01", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 25.4, + "score": 23.9, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Qwen 3 Coder 480B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Qwen 3 Coder 480B\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Qwen 3 Coder 480B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Qwen 3 Coder 480B\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/alibaba_qwen3-235b-a22b-instruct-2507.json b/data/models/alibaba_qwen3-235b-a22b-instruct-2507.json similarity index 100% rename from data/alibaba_qwen3-235b-a22b-instruct-2507.json rename to data/models/alibaba_qwen3-235b-a22b-instruct-2507.json diff --git a/data/alibaba_qwen3-235b-a22b-thinking-2507.json b/data/models/alibaba_qwen3-235b-a22b-thinking-2507.json similarity index 100% rename from data/alibaba_qwen3-235b-a22b-thinking-2507.json rename to data/models/alibaba_qwen3-235b-a22b-thinking-2507.json diff --git a/data/alibaba_qwen3-30b-a3b.json b/data/models/alibaba_qwen3-30b-a3b.json similarity index 100% rename from data/alibaba_qwen3-30b-a3b.json rename to data/models/alibaba_qwen3-30b-a3b.json diff --git a/data/alibaba_qwen3-max.json b/data/models/alibaba_qwen3-max.json similarity index 100% rename from data/alibaba_qwen3-max.json rename to data/models/alibaba_qwen3-max.json diff --git a/data/alibaba_qwen3-next-80b-a3b-thinking.json b/data/models/alibaba_qwen3-next-80b-a3b-thinking.json similarity index 100% rename from data/alibaba_qwen3-next-80b-a3b-thinking.json rename to data/models/alibaba_qwen3-next-80b-a3b-thinking.json diff --git a/data/aliyun_qwen3-next-80b-a3b-thinking.json b/data/models/aliyun_qwen3-next-80b-a3b-thinking.json similarity index 100% rename from data/aliyun_qwen3-next-80b-a3b-thinking.json rename to data/models/aliyun_qwen3-next-80b-a3b-thinking.json diff --git a/data/allenai_Llama-3.1-70B-Instruct-RM-RB2.json b/data/models/allenai_Llama-3.1-70B-Instruct-RM-RB2.json similarity index 100% rename from data/allenai_Llama-3.1-70B-Instruct-RM-RB2.json rename to data/models/allenai_Llama-3.1-70B-Instruct-RM-RB2.json index 3edb4e861c9c885f1d8b857f90568f09968d9fbc..6b82bed55df1bdb62067f41e1f2f588c7fd5e772 100644 --- a/data/allenai_Llama-3.1-70B-Instruct-RM-RB2.json +++ b/data/models/allenai_Llama-3.1-70B-Instruct-RM-RB2.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench-2/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9021 + "score": 0.7606 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9665 + "score": 0.8126 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8355 + "score": 0.4188 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6995 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9095 + "score": 0.8844 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8969 + "score": 0.8646 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.8835 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7606 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8126 + "score": 0.9021 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4188 + "score": 0.9665 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6995 + "score": 0.8355 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8844 + "score": 0.9095 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8646 + "score": 0.8969 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8835 + "score": 0.0 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/allenai_Llama-3.1-8B-Base-RM-RB2.json b/data/models/allenai_Llama-3.1-8B-Base-RM-RB2.json similarity index 100% rename from data/allenai_Llama-3.1-8B-Base-RM-RB2.json rename to data/models/allenai_Llama-3.1-8B-Base-RM-RB2.json index 8e81065be5c6f388ddbf031b25c6ea031d346e21..b77566d88bfa5b7f18f1a64562aee1f65b27a08c 100644 --- a/data/allenai_Llama-3.1-8B-Base-RM-RB2.json +++ b/data/models/allenai_Llama-3.1-8B-Base-RM-RB2.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-8B-Base-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench/allenai_Llama-3.1-8B-Base-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,127 +31,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.72 + "score": 0.8463 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3625 + "score": 0.933 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.612 + "score": 0.7785 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8267 + "score": 0.8851 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8323 + "score": 0.7886 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5406 + "score": 0.0 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], @@ -159,10 +141,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench/allenai_Llama-3.1-8B-Base-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench-2/allenai_Llama-3.1-8B-Base-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -181,109 +163,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8463 + "score": 0.649 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.933 + "score": 0.72 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7785 + "score": 0.3625 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8851 + "score": 0.8267 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7886 + "score": 0.8323 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.5406 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/allenai_Llama-3.1-8B-Instruct-RM-RB2.json b/data/models/allenai_Llama-3.1-8B-Instruct-RM-RB2.json similarity index 100% rename from data/allenai_Llama-3.1-8B-Instruct-RM-RB2.json rename to data/models/allenai_Llama-3.1-8B-Instruct-RM-RB2.json diff --git a/data/allenai_Llama-3.1-Tulu-3-70B-DPO.json b/data/models/allenai_Llama-3.1-Tulu-3-70B-DPO.json similarity index 100% rename from data/allenai_Llama-3.1-Tulu-3-70B-DPO.json rename to data/models/allenai_Llama-3.1-Tulu-3-70B-DPO.json diff --git a/data/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2.json b/data/models/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2.json similarity index 100% rename from data/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2.json rename to data/models/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2.json diff --git a/data/allenai_Llama-3.1-Tulu-3-70B-SFT.json b/data/models/allenai_Llama-3.1-Tulu-3-70B-SFT.json similarity index 100% rename from data/allenai_Llama-3.1-Tulu-3-70B-SFT.json rename to data/models/allenai_Llama-3.1-Tulu-3-70B-SFT.json diff --git a/data/allenai_Llama-3.1-Tulu-3-70B.json b/data/models/allenai_Llama-3.1-Tulu-3-70B.json similarity index 100% rename from data/allenai_Llama-3.1-Tulu-3-70B.json rename to data/models/allenai_Llama-3.1-Tulu-3-70B.json diff --git a/data/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2.json b/data/models/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2.json similarity index 100% rename from data/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2.json rename to data/models/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2.json index 52eed39924ec31b815aba5be343333e3d61411ae..408cdcea425aaae910c976c1f57587d29fcf9c63 100644 --- a/data/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2.json +++ b/data/models/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,127 +31,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7516 + "score": 0.8431 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3875 + "score": 0.9553 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6284 + "score": 0.761 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.86 + "score": 0.8662 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8545 + "score": 0.7898 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6397 + "score": 0.0 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], @@ -159,10 +141,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -181,109 +163,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8431 + "score": 0.687 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9553 + "score": 0.7516 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.761 + "score": 0.3875 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8662 + "score": 0.86 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7898 + "score": 0.8545 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.6397 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/allenai_Llama-3.1-Tulu-3-8B-DPO.json b/data/models/allenai_Llama-3.1-Tulu-3-8B-DPO.json similarity index 100% rename from data/allenai_Llama-3.1-Tulu-3-8B-DPO.json rename to data/models/allenai_Llama-3.1-Tulu-3-8B-DPO.json diff --git a/data/allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2.json b/data/models/allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2.json similarity index 100% rename from data/allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2.json rename to data/models/allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2.json diff --git a/data/allenai_Llama-3.1-Tulu-3-8B-RM.json b/data/models/allenai_Llama-3.1-Tulu-3-8B-RM.json similarity index 100% rename from data/allenai_Llama-3.1-Tulu-3-8B-RM.json rename to data/models/allenai_Llama-3.1-Tulu-3-8B-RM.json diff --git a/data/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2.json b/data/models/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2.json similarity index 100% rename from data/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2.json rename to data/models/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2.json index e5b15dac5cb2f4285962219bbf6cd6008c5fe41a..84c4146c17907f25ce74864c179dc0e7dcd19fbd 100644 --- a/data/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2.json +++ b/data/models/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8551 + "score": 0.6821 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9497 + "score": 0.7326 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7917 + "score": 0.3875 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5792 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8784 + "score": 0.8978 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8005 + "score": 0.8889 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.6063 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7326 + "score": 0.8551 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3875 + "score": 0.9497 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5792 + "score": 0.7917 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8978 + "score": 0.8784 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8889 + "score": 0.8005 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6063 + "score": 0.0 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/allenai_Llama-3.1-Tulu-3-8B-SFT.json b/data/models/allenai_Llama-3.1-Tulu-3-8B-SFT.json similarity index 100% rename from data/allenai_Llama-3.1-Tulu-3-8B-SFT.json rename to data/models/allenai_Llama-3.1-Tulu-3-8B-SFT.json diff --git a/data/allenai_Llama-3.1-Tulu-3-8B.json b/data/models/allenai_Llama-3.1-Tulu-3-8B.json similarity index 99% rename from data/allenai_Llama-3.1-Tulu-3-8B.json rename to data/models/allenai_Llama-3.1-Tulu-3-8B.json index 3ad8a9ac23bd0436da7fc124729f9555d65628da..cdc4277bf3338da5223bb18b774ff35b0fb5f41e 100644 --- a/data/allenai_Llama-3.1-Tulu-3-8B.json +++ b/data/models/allenai_Llama-3.1-Tulu-3-8B.json @@ -5,7 +5,7 @@ "developer": "allenai", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "LlamaForCausalLM", "params_billions": "8.03" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.8255 + "score": 0.8267 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4061 + "score": 0.405 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2115 + "score": 0.1964 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.297 + "score": 0.2987 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2821 + "score": 0.2827 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.8267 + "score": 0.8255 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.405 + "score": 0.4061 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1964 + "score": 0.2115 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2987 + "score": 0.297 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2827 + "score": 0.2821 } } ], diff --git a/data/allenai_OLMo-1.7-7B-hf.json b/data/models/allenai_OLMo-1.7-7B-hf.json similarity index 100% rename from data/allenai_OLMo-1.7-7B-hf.json rename to data/models/allenai_OLMo-1.7-7B-hf.json diff --git a/data/allenai_OLMo-1B-hf.json b/data/models/allenai_OLMo-1B-hf.json similarity index 100% rename from data/allenai_OLMo-1B-hf.json rename to data/models/allenai_OLMo-1B-hf.json diff --git a/data/models/allenai_OLMo-2-1124-7B-Instruct.json b/data/models/allenai_OLMo-2-1124-7B-Instruct.json new file mode 100644 index 0000000000000000000000000000000000000000..2bd2e72dfe74f50da5304f1fba1834e9c2b078ee --- /dev/null +++ b/data/models/allenai_OLMo-2-1124-7B-Instruct.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "OLMo-2-1124-7B-Instruct", + "id": "allenai/OLMo-2-1124-7B-Instruct", + "developer": "allenai", + "inference_platform": "unknown", + "additional_details": { + "precision": "float16", + "architecture": "Olmo2ForCausalLM", + "params_billions": "7.299" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/allenai_OLMo-2-1124-7B-Instruct/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7244 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4022 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.1488 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2785 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3508 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2672 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/allenai_OLMo-7B-Instruct-hf.json b/data/models/allenai_OLMo-7B-Instruct-hf.json similarity index 100% rename from data/allenai_OLMo-7B-Instruct-hf.json rename to data/models/allenai_OLMo-7B-Instruct-hf.json diff --git a/data/allenai_OLMo-7B-Instruct.json b/data/models/allenai_OLMo-7B-Instruct.json similarity index 100% rename from data/allenai_OLMo-7B-Instruct.json rename to data/models/allenai_OLMo-7B-Instruct.json diff --git a/data/allenai_OLMo-7B-hf.json b/data/models/allenai_OLMo-7B-hf.json similarity index 100% rename from data/allenai_OLMo-7B-hf.json rename to data/models/allenai_OLMo-7B-hf.json diff --git a/data/models/allenai_OLMoE-1B-7B-0125-Instruct.json b/data/models/allenai_OLMoE-1B-7B-0125-Instruct.json new file mode 100644 index 0000000000000000000000000000000000000000..27d6873218911a3538ecfc84cd9db537793ac89c --- /dev/null +++ b/data/models/allenai_OLMoE-1B-7B-0125-Instruct.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "OLMoE-1B-7B-0125-Instruct", + "id": "allenai/OLMoE-1B-7B-0125-Instruct", + "developer": "allenai", + "inference_platform": "unknown", + "additional_details": { + "precision": "float16", + "architecture": "OlmoeForCausalLM", + "params_billions": "6.919" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/allenai_OLMoE-1B-7B-0125-Instruct/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6757 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3825 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.0899 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2601 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3636 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.1915 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/allenai_OLMoE-1B-7B-0924-Instruct.json b/data/models/allenai_OLMoE-1B-7B-0924-Instruct.json similarity index 100% rename from data/allenai_OLMoE-1B-7B-0924-Instruct.json rename to data/models/allenai_OLMoE-1B-7B-0924-Instruct.json diff --git a/data/allenai_OLMoE-1B-7B-0924.json b/data/models/allenai_OLMoE-1B-7B-0924.json similarity index 100% rename from data/allenai_OLMoE-1B-7B-0924.json rename to data/models/allenai_OLMoE-1B-7B-0924.json diff --git a/data/allenai_llama-3-tulu-2-70b-uf-mean-rm.json b/data/models/allenai_llama-3-tulu-2-70b-uf-mean-rm.json similarity index 100% rename from data/allenai_llama-3-tulu-2-70b-uf-mean-rm.json rename to data/models/allenai_llama-3-tulu-2-70b-uf-mean-rm.json diff --git a/data/allenai_llama-3-tulu-2-8b-uf-mean-rm.json b/data/models/allenai_llama-3-tulu-2-8b-uf-mean-rm.json similarity index 100% rename from data/allenai_llama-3-tulu-2-8b-uf-mean-rm.json rename to data/models/allenai_llama-3-tulu-2-8b-uf-mean-rm.json diff --git a/data/allenai_llama-3-tulu-2-dpo-70b.json b/data/models/allenai_llama-3-tulu-2-dpo-70b.json similarity index 100% rename from data/allenai_llama-3-tulu-2-dpo-70b.json rename to data/models/allenai_llama-3-tulu-2-dpo-70b.json diff --git a/data/allenai_llama-3-tulu-2-dpo-8b.json b/data/models/allenai_llama-3-tulu-2-dpo-8b.json similarity index 100% rename from data/allenai_llama-3-tulu-2-dpo-8b.json rename to data/models/allenai_llama-3-tulu-2-dpo-8b.json diff --git a/data/allenai_olmo-1.7-7b.json b/data/models/allenai_olmo-1.7-7b.json similarity index 100% rename from data/allenai_olmo-1.7-7b.json rename to data/models/allenai_olmo-1.7-7b.json diff --git a/data/allenai_olmo-2-0325-32b-instruct.json b/data/models/allenai_olmo-2-0325-32b-instruct.json similarity index 100% rename from data/allenai_olmo-2-0325-32b-instruct.json rename to data/models/allenai_olmo-2-0325-32b-instruct.json diff --git a/data/allenai_olmo-2-1124-13b-instruct.json b/data/models/allenai_olmo-2-1124-13b-instruct.json similarity index 100% rename from data/allenai_olmo-2-1124-13b-instruct.json rename to data/models/allenai_olmo-2-1124-13b-instruct.json diff --git a/data/allenai_olmo-2-1124-7b-instruct.json b/data/models/allenai_olmo-2-1124-7b-instruct.json similarity index 100% rename from data/allenai_olmo-2-1124-7b-instruct.json rename to data/models/allenai_olmo-2-1124-7b-instruct.json diff --git a/data/allenai_olmo-7b.json b/data/models/allenai_olmo-7b.json similarity index 100% rename from data/allenai_olmo-7b.json rename to data/models/allenai_olmo-7b.json diff --git a/data/allenai_olmoe-1b-7b-0125-instruct.json b/data/models/allenai_olmoe-1b-7b-0125-instruct.json similarity index 100% rename from data/allenai_olmoe-1b-7b-0125-instruct.json rename to data/models/allenai_olmoe-1b-7b-0125-instruct.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1739590997.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1739590997.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1739590997.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1739590997.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1739871066.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1739871066.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1739871066.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1739871066.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1739925892.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1739925892.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1739925892.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1739925892.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1739943850.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1739943850.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1739943850.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1739943850.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1739943881.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1739943881.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1739943881.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1739943881.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1739943972.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1739943972.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1739943972.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1739943972.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1739957701.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1739957701.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1739957701.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1739957701.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1739971507.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1739971507.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1739971507.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1739971507.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1739971529.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1739971529.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1739971529.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1739971529.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1739998765.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1739998765.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1739998765.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1739998765.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1740005072.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1740005072.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1740005072.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1740005072.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1740129284.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1740129284.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1740129284.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1740129284.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1741286813.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1741286813.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1741286813.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1741286813.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1741287363.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1741287363.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1741287363.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1741287363.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1741292911.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1741292911.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1741292911.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1741292911.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1742338142.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1742338142.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1742338142.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1742338142.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1742519610.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1742519610.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1742519610.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1742519610.json diff --git a/data/allenai_open_instruct_dev-reward_modeling__1__1742519628.json b/data/models/allenai_open_instruct_dev-reward_modeling__1__1742519628.json similarity index 100% rename from data/allenai_open_instruct_dev-reward_modeling__1__1742519628.json rename to data/models/allenai_open_instruct_dev-reward_modeling__1__1742519628.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455.json b/data/models/allenai_open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511.json b/data/models/allenai_open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406.json b/data/models/allenai_open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136.json b/data/models/allenai_open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398.json b/data/models/allenai_open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535.json b/data/models/allenai_open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_1_dpo__1__1743550054.json b/data/models/allenai_open_instruct_dev-rm_1e-6_1_dpo__1__1743550054.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_1_dpo__1__1743550054.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_1_dpo__1__1743550054.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271.json b/data/models/allenai_open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181.json b/data/models/allenai_open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_1_rl__1__1743551221.json b/data/models/allenai_open_instruct_dev-rm_1e-6_1_rl__1__1743551221.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_1_rl__1__1743551221.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_1_rl__1__1743551221.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262.json b/data/models/allenai_open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523.json b/data/models/allenai_open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750.json b/data/models/allenai_open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427.json b/data/models/allenai_open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446.json b/data/models/allenai_open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094.json b/data/models/allenai_open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636.json b/data/models/allenai_open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_2_dpo__1__1743549325.json b/data/models/allenai_open_instruct_dev-rm_1e-6_2_dpo__1__1743549325.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_2_dpo__1__1743549325.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_2_dpo__1__1743549325.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_2_rl__1__1743551238.json b/data/models/allenai_open_instruct_dev-rm_1e-6_2_rl__1__1743551238.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_2_rl__1__1743551238.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_2_rl__1__1743551238.json diff --git a/data/allenai_open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906.json b/data/models/allenai_open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906.json rename to data/models/allenai_open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906.json diff --git a/data/allenai_open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529.json b/data/models/allenai_open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529.json rename to data/models/allenai_open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529.json diff --git a/data/allenai_open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305.json b/data/models/allenai_open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305.json rename to data/models/allenai_open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305.json diff --git a/data/allenai_open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778.json b/data/models/allenai_open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778.json rename to data/models/allenai_open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778.json diff --git a/data/allenai_open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459.json b/data/models/allenai_open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459.json rename to data/models/allenai_open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459.json diff --git a/data/allenai_open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747.json b/data/models/allenai_open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747.json rename to data/models/allenai_open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747.json diff --git a/data/allenai_open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935.json b/data/models/allenai_open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935.json rename to data/models/allenai_open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935.json diff --git a/data/allenai_open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360.json b/data/models/allenai_open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360.json rename to data/models/allenai_open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360.json diff --git a/data/allenai_open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366.json b/data/models/allenai_open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366.json rename to data/models/allenai_open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366.json diff --git a/data/allenai_open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352.json b/data/models/allenai_open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352.json rename to data/models/allenai_open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352.json diff --git a/data/allenai_open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634.json b/data/models/allenai_open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634.json rename to data/models/allenai_open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634.json diff --git a/data/allenai_open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988.json b/data/models/allenai_open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988.json rename to data/models/allenai_open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1__2__1743897475.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1__2__1743897475.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1__2__1743897475.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1__2__1743897475.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1__3__1744311421.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1__3__1744311421.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1__3__1744311421.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1__3__1744311421.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_dpo__1__1743549903.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_dpo__1__1743549903.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_dpo__1__1743549903.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_dpo__1__1743549903.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_no_if__2__1744316012.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_no_if__2__1744316012.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_no_if__2__1744316012.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_no_if__2__1744316012.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_no_if__3__1744315765.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_no_if__3__1744315765.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_no_if__3__1744315765.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_no_if__3__1744315765.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_rl__1__1743551527.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_rl__1__1743551527.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_rl__1__1743551527.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_rl__1__1743551527.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472.json b/data/models/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267.json b/data/models/allenai_open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759.json b/data/models/allenai_open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905.json b/data/models/allenai_open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363.json b/data/models/allenai_open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505.json b/data/models/allenai_open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_2_dpo__1__1743550180.json b/data/models/allenai_open_instruct_dev-rm_3e-6_2_dpo__1__1743550180.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_2_dpo__1__1743550180.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_2_dpo__1__1743550180.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187.json b/data/models/allenai_open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_2_rl__1__1743551509.json b/data/models/allenai_open_instruct_dev-rm_3e-6_2_rl__1__1743551509.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_2_rl__1__1743551509.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_2_rl__1__1743551509.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498.json b/data/models/allenai_open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926.json b/data/models/allenai_open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926.json diff --git a/data/allenai_open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661.json b/data/models/allenai_open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661.json rename to data/models/allenai_open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661.json diff --git a/data/allenai_open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598.json b/data/models/allenai_open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598.json rename to data/models/allenai_open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598.json diff --git a/data/allenai_open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923.json b/data/models/allenai_open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923.json rename to data/models/allenai_open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923.json diff --git a/data/allenai_open_instruct_dev-rm_llama_1e-6_1__1__1743896628.json b/data/models/allenai_open_instruct_dev-rm_llama_1e-6_1__1__1743896628.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_1e-6_1__1__1743896628.json rename to data/models/allenai_open_instruct_dev-rm_llama_1e-6_1__1__1743896628.json diff --git a/data/allenai_open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999.json b/data/models/allenai_open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999.json rename to data/models/allenai_open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999.json diff --git a/data/allenai_open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777.json b/data/models/allenai_open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777.json rename to data/models/allenai_open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777.json diff --git a/data/allenai_open_instruct_dev-rm_llama_1e-6_2__1__1743896638.json b/data/models/allenai_open_instruct_dev-rm_llama_1e-6_2__1__1743896638.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_1e-6_2__1__1743896638.json rename to data/models/allenai_open_instruct_dev-rm_llama_1e-6_2__1__1743896638.json diff --git a/data/allenai_open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938.json b/data/models/allenai_open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938.json rename to data/models/allenai_open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938.json diff --git a/data/allenai_open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885.json b/data/models/allenai_open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885.json rename to data/models/allenai_open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885.json diff --git a/data/allenai_open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773.json b/data/models/allenai_open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773.json rename to data/models/allenai_open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773.json diff --git a/data/allenai_open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867.json b/data/models/allenai_open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867.json rename to data/models/allenai_open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867.json diff --git a/data/allenai_open_instruct_dev-rm_llama_3e-6_1__1__1743929424.json b/data/models/allenai_open_instruct_dev-rm_llama_3e-6_1__1__1743929424.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_3e-6_1__1__1743929424.json rename to data/models/allenai_open_instruct_dev-rm_llama_3e-6_1__1__1743929424.json diff --git a/data/allenai_open_instruct_dev-rm_llama_3e-6_1__2__1744311395.json b/data/models/allenai_open_instruct_dev-rm_llama_3e-6_1__2__1744311395.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_3e-6_1__2__1744311395.json rename to data/models/allenai_open_instruct_dev-rm_llama_3e-6_1__2__1744311395.json diff --git a/data/allenai_open_instruct_dev-rm_llama_3e-6_1__3__1744311491.json b/data/models/allenai_open_instruct_dev-rm_llama_3e-6_1__3__1744311491.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_3e-6_1__3__1744311491.json rename to data/models/allenai_open_instruct_dev-rm_llama_3e-6_1__3__1744311491.json diff --git a/data/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787.json b/data/models/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787.json rename to data/models/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787.json diff --git a/data/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461.json b/data/models/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461.json rename to data/models/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461.json diff --git a/data/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780.json b/data/models/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780.json rename to data/models/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780.json diff --git a/data/allenai_open_instruct_dev-rm_llama_3e-6_2__1__1743896489.json b/data/models/allenai_open_instruct_dev-rm_llama_3e-6_2__1__1743896489.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_3e-6_2__1__1743896489.json rename to data/models/allenai_open_instruct_dev-rm_llama_3e-6_2__1__1743896489.json diff --git a/data/allenai_open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713.json b/data/models/allenai_open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713.json rename to data/models/allenai_open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713.json diff --git a/data/allenai_open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911.json b/data/models/allenai_open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911.json rename to data/models/allenai_open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911.json diff --git a/data/allenai_open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412.json b/data/models/allenai_open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412.json rename to data/models/allenai_open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412.json diff --git a/data/allenai_open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922.json b/data/models/allenai_open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922.json rename to data/models/allenai_open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922.json diff --git a/data/allenai_open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495.json b/data/models/allenai_open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495.json rename to data/models/allenai_open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495.json diff --git a/data/allenai_open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507.json b/data/models/allenai_open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507.json rename to data/models/allenai_open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507.json diff --git a/data/allenai_open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507.json b/data/models/allenai_open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507.json rename to data/models/allenai_open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507.json diff --git a/data/allenai_open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917.json b/data/models/allenai_open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917.json rename to data/models/allenai_open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917.json diff --git a/data/allenai_open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961.json b/data/models/allenai_open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961.json rename to data/models/allenai_open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961.json diff --git a/data/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830.json b/data/models/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830.json rename to data/models/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830.json diff --git a/data/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024.json b/data/models/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024.json rename to data/models/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024.json diff --git a/data/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914.json b/data/models/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914.json rename to data/models/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914.json diff --git a/data/allenai_open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091.json b/data/models/allenai_open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091.json rename to data/models/allenai_open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091.json diff --git a/data/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829.json b/data/models/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829.json rename to data/models/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829.json diff --git a/data/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050.json b/data/models/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050.json rename to data/models/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050.json diff --git a/data/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916.json b/data/models/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916.json rename to data/models/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916.json diff --git a/data/allenai_open_instruct_dev-rm_qwen_3e-6_2__1__1743023576.json b/data/models/allenai_open_instruct_dev-rm_qwen_3e-6_2__1__1743023576.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwen_3e-6_2__1__1743023576.json rename to data/models/allenai_open_instruct_dev-rm_qwen_3e-6_2__1__1743023576.json diff --git a/data/allenai_open_instruct_dev-rm_qwen_3e-6_3__1__1743023619.json b/data/models/allenai_open_instruct_dev-rm_qwen_3e-6_3__1__1743023619.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwen_3e-6_3__1__1743023619.json rename to data/models/allenai_open_instruct_dev-rm_qwen_3e-6_3__1__1743023619.json diff --git a/data/allenai_open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583.json b/data/models/allenai_open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583.json rename to data/models/allenai_open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583.json diff --git a/data/allenai_open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604.json b/data/models/allenai_open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604.json rename to data/models/allenai_open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604.json diff --git a/data/allenai_open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738.json b/data/models/allenai_open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738.json rename to data/models/allenai_open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738.json diff --git a/data/allenai_open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191.json b/data/models/allenai_open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191.json rename to data/models/allenai_open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191.json diff --git a/data/allenai_open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737.json b/data/models/allenai_open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737.json rename to data/models/allenai_open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737.json diff --git a/data/allenai_open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138.json b/data/models/allenai_open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138.json rename to data/models/allenai_open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138.json diff --git a/data/allenai_open_instruct_dev-rm_tulu3_70b_1__8__1742924455.json b/data/models/allenai_open_instruct_dev-rm_tulu3_70b_1__8__1742924455.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_tulu3_70b_1__8__1742924455.json rename to data/models/allenai_open_instruct_dev-rm_tulu3_70b_1__8__1742924455.json diff --git a/data/allenai_open_instruct_dev-rm_tulu3_70b_2__8__1742982964.json b/data/models/allenai_open_instruct_dev-rm_tulu3_70b_2__8__1742982964.json similarity index 100% rename from data/allenai_open_instruct_dev-rm_tulu3_70b_2__8__1742982964.json rename to data/models/allenai_open_instruct_dev-rm_tulu3_70b_2__8__1742982964.json diff --git a/data/allenai_tulu-2-dpo-13b.json b/data/models/allenai_tulu-2-dpo-13b.json similarity index 100% rename from data/allenai_tulu-2-dpo-13b.json rename to data/models/allenai_tulu-2-dpo-13b.json diff --git a/data/allenai_tulu-2-dpo-70b.json b/data/models/allenai_tulu-2-dpo-70b.json similarity index 100% rename from data/allenai_tulu-2-dpo-70b.json rename to data/models/allenai_tulu-2-dpo-70b.json diff --git a/data/allenai_tulu-2-dpo-7b.json b/data/models/allenai_tulu-2-dpo-7b.json similarity index 100% rename from data/allenai_tulu-2-dpo-7b.json rename to data/models/allenai_tulu-2-dpo-7b.json diff --git a/data/allenai_tulu-v2.5-13b-preference-mix-rm.json b/data/models/allenai_tulu-v2.5-13b-preference-mix-rm.json similarity index 100% rename from data/allenai_tulu-v2.5-13b-preference-mix-rm.json rename to data/models/allenai_tulu-v2.5-13b-preference-mix-rm.json diff --git a/data/allenai_tulu-v2.5-13b-uf-rm.json b/data/models/allenai_tulu-v2.5-13b-uf-rm.json similarity index 100% rename from data/allenai_tulu-v2.5-13b-uf-rm.json rename to data/models/allenai_tulu-v2.5-13b-uf-rm.json diff --git a/data/allenai_tulu-v2.5-70b-preference-mix-rm.json b/data/models/allenai_tulu-v2.5-70b-preference-mix-rm.json similarity index 100% rename from data/allenai_tulu-v2.5-70b-preference-mix-rm.json rename to data/models/allenai_tulu-v2.5-70b-preference-mix-rm.json diff --git a/data/allenai_tulu-v2.5-70b-uf-rm.json b/data/models/allenai_tulu-v2.5-70b-uf-rm.json similarity index 100% rename from data/allenai_tulu-v2.5-70b-uf-rm.json rename to data/models/allenai_tulu-v2.5-70b-uf-rm.json diff --git a/data/allknowingroger_Chocolatine-24B.json b/data/models/allknowingroger_Chocolatine-24B.json similarity index 100% rename from data/allknowingroger_Chocolatine-24B.json rename to data/models/allknowingroger_Chocolatine-24B.json diff --git a/data/allknowingroger_Gemma2Slerp1-2.6B.json b/data/models/allknowingroger_Gemma2Slerp1-2.6B.json similarity index 100% rename from data/allknowingroger_Gemma2Slerp1-2.6B.json rename to data/models/allknowingroger_Gemma2Slerp1-2.6B.json diff --git a/data/allknowingroger_Gemma2Slerp1-27B.json b/data/models/allknowingroger_Gemma2Slerp1-27B.json similarity index 100% rename from data/allknowingroger_Gemma2Slerp1-27B.json rename to data/models/allknowingroger_Gemma2Slerp1-27B.json diff --git a/data/allknowingroger_Gemma2Slerp2-2.6B.json b/data/models/allknowingroger_Gemma2Slerp2-2.6B.json similarity index 100% rename from data/allknowingroger_Gemma2Slerp2-2.6B.json rename to data/models/allknowingroger_Gemma2Slerp2-2.6B.json diff --git a/data/allknowingroger_Gemma2Slerp2-27B.json b/data/models/allknowingroger_Gemma2Slerp2-27B.json similarity index 100% rename from data/allknowingroger_Gemma2Slerp2-27B.json rename to data/models/allknowingroger_Gemma2Slerp2-27B.json diff --git a/data/allknowingroger_Gemma2Slerp3-27B.json b/data/models/allknowingroger_Gemma2Slerp3-27B.json similarity index 100% rename from data/allknowingroger_Gemma2Slerp3-27B.json rename to data/models/allknowingroger_Gemma2Slerp3-27B.json diff --git a/data/allknowingroger_Gemma2Slerp4-27B.json b/data/models/allknowingroger_Gemma2Slerp4-27B.json similarity index 100% rename from data/allknowingroger_Gemma2Slerp4-27B.json rename to data/models/allknowingroger_Gemma2Slerp4-27B.json diff --git a/data/allknowingroger_GemmaSlerp-9B.json b/data/models/allknowingroger_GemmaSlerp-9B.json similarity index 100% rename from data/allknowingroger_GemmaSlerp-9B.json rename to data/models/allknowingroger_GemmaSlerp-9B.json diff --git a/data/allknowingroger_GemmaSlerp2-9B.json b/data/models/allknowingroger_GemmaSlerp2-9B.json similarity index 100% rename from data/allknowingroger_GemmaSlerp2-9B.json rename to data/models/allknowingroger_GemmaSlerp2-9B.json diff --git a/data/allknowingroger_GemmaSlerp4-10B.json b/data/models/allknowingroger_GemmaSlerp4-10B.json similarity index 100% rename from data/allknowingroger_GemmaSlerp4-10B.json rename to data/models/allknowingroger_GemmaSlerp4-10B.json diff --git a/data/allknowingroger_GemmaSlerp5-10B.json b/data/models/allknowingroger_GemmaSlerp5-10B.json similarity index 100% rename from data/allknowingroger_GemmaSlerp5-10B.json rename to data/models/allknowingroger_GemmaSlerp5-10B.json diff --git a/data/allknowingroger_GemmaStock1-27B.json b/data/models/allknowingroger_GemmaStock1-27B.json similarity index 100% rename from data/allknowingroger_GemmaStock1-27B.json rename to data/models/allknowingroger_GemmaStock1-27B.json diff --git a/data/allknowingroger_HomerSlerp1-7B.json b/data/models/allknowingroger_HomerSlerp1-7B.json similarity index 100% rename from data/allknowingroger_HomerSlerp1-7B.json rename to data/models/allknowingroger_HomerSlerp1-7B.json diff --git a/data/allknowingroger_HomerSlerp2-7B.json b/data/models/allknowingroger_HomerSlerp2-7B.json similarity index 100% rename from data/allknowingroger_HomerSlerp2-7B.json rename to data/models/allknowingroger_HomerSlerp2-7B.json diff --git a/data/allknowingroger_HomerSlerp3-7B.json b/data/models/allknowingroger_HomerSlerp3-7B.json similarity index 100% rename from data/allknowingroger_HomerSlerp3-7B.json rename to data/models/allknowingroger_HomerSlerp3-7B.json diff --git a/data/allknowingroger_HomerSlerp4-7B.json b/data/models/allknowingroger_HomerSlerp4-7B.json similarity index 100% rename from data/allknowingroger_HomerSlerp4-7B.json rename to data/models/allknowingroger_HomerSlerp4-7B.json diff --git a/data/allknowingroger_LimyQstar-7B-slerp.json b/data/models/allknowingroger_LimyQstar-7B-slerp.json similarity index 100% rename from data/allknowingroger_LimyQstar-7B-slerp.json rename to data/models/allknowingroger_LimyQstar-7B-slerp.json diff --git a/data/allknowingroger_Llama3.1-60B.json b/data/models/allknowingroger_Llama3.1-60B.json similarity index 100% rename from data/allknowingroger_Llama3.1-60B.json rename to data/models/allknowingroger_Llama3.1-60B.json diff --git a/data/allknowingroger_Marco-01-slerp1-7B.json b/data/models/allknowingroger_Marco-01-slerp1-7B.json similarity index 100% rename from data/allknowingroger_Marco-01-slerp1-7B.json rename to data/models/allknowingroger_Marco-01-slerp1-7B.json diff --git a/data/allknowingroger_Meme-7B-slerp.json b/data/models/allknowingroger_Meme-7B-slerp.json similarity index 100% rename from data/allknowingroger_Meme-7B-slerp.json rename to data/models/allknowingroger_Meme-7B-slerp.json diff --git a/data/allknowingroger_Ministral-8B-slerp.json b/data/models/allknowingroger_Ministral-8B-slerp.json similarity index 100% rename from data/allknowingroger_Ministral-8B-slerp.json rename to data/models/allknowingroger_Ministral-8B-slerp.json diff --git a/data/allknowingroger_MistralPhi3-11B.json b/data/models/allknowingroger_MistralPhi3-11B.json similarity index 100% rename from data/allknowingroger_MistralPhi3-11B.json rename to data/models/allknowingroger_MistralPhi3-11B.json diff --git a/data/allknowingroger_Mistralmash1-7B-s.json b/data/models/allknowingroger_Mistralmash1-7B-s.json similarity index 100% rename from data/allknowingroger_Mistralmash1-7B-s.json rename to data/models/allknowingroger_Mistralmash1-7B-s.json diff --git a/data/allknowingroger_Mistralmash2-7B-s.json b/data/models/allknowingroger_Mistralmash2-7B-s.json similarity index 100% rename from data/allknowingroger_Mistralmash2-7B-s.json rename to data/models/allknowingroger_Mistralmash2-7B-s.json diff --git a/data/allknowingroger_MixTAO-19B-pass.json b/data/models/allknowingroger_MixTAO-19B-pass.json similarity index 100% rename from data/allknowingroger_MixTAO-19B-pass.json rename to data/models/allknowingroger_MixTAO-19B-pass.json diff --git a/data/allknowingroger_MixTaoTruthful-13B-slerp.json b/data/models/allknowingroger_MixTaoTruthful-13B-slerp.json similarity index 100% rename from data/allknowingroger_MixTaoTruthful-13B-slerp.json rename to data/models/allknowingroger_MixTaoTruthful-13B-slerp.json diff --git a/data/allknowingroger_MultiCalm-7B-slerp.json b/data/models/allknowingroger_MultiCalm-7B-slerp.json similarity index 100% rename from data/allknowingroger_MultiCalm-7B-slerp.json rename to data/models/allknowingroger_MultiCalm-7B-slerp.json diff --git a/data/allknowingroger_MultiMash-12B-slerp.json b/data/models/allknowingroger_MultiMash-12B-slerp.json similarity index 100% rename from data/allknowingroger_MultiMash-12B-slerp.json rename to data/models/allknowingroger_MultiMash-12B-slerp.json diff --git a/data/allknowingroger_MultiMash10-13B-slerp.json b/data/models/allknowingroger_MultiMash10-13B-slerp.json similarity index 100% rename from data/allknowingroger_MultiMash10-13B-slerp.json rename to data/models/allknowingroger_MultiMash10-13B-slerp.json diff --git a/data/allknowingroger_MultiMash11-13B-slerp.json b/data/models/allknowingroger_MultiMash11-13B-slerp.json similarity index 100% rename from data/allknowingroger_MultiMash11-13B-slerp.json rename to data/models/allknowingroger_MultiMash11-13B-slerp.json diff --git a/data/allknowingroger_MultiMash2-12B-slerp.json b/data/models/allknowingroger_MultiMash2-12B-slerp.json similarity index 100% rename from data/allknowingroger_MultiMash2-12B-slerp.json rename to data/models/allknowingroger_MultiMash2-12B-slerp.json diff --git a/data/allknowingroger_MultiMash5-12B-slerp.json b/data/models/allknowingroger_MultiMash5-12B-slerp.json similarity index 100% rename from data/allknowingroger_MultiMash5-12B-slerp.json rename to data/models/allknowingroger_MultiMash5-12B-slerp.json diff --git a/data/allknowingroger_MultiMash6-12B-slerp.json b/data/models/allknowingroger_MultiMash6-12B-slerp.json similarity index 100% rename from data/allknowingroger_MultiMash6-12B-slerp.json rename to data/models/allknowingroger_MultiMash6-12B-slerp.json diff --git a/data/allknowingroger_MultiMash7-12B-slerp.json b/data/models/allknowingroger_MultiMash7-12B-slerp.json similarity index 100% rename from data/allknowingroger_MultiMash7-12B-slerp.json rename to data/models/allknowingroger_MultiMash7-12B-slerp.json diff --git a/data/allknowingroger_MultiMash8-13B-slerp.json b/data/models/allknowingroger_MultiMash8-13B-slerp.json similarity index 100% rename from data/allknowingroger_MultiMash8-13B-slerp.json rename to data/models/allknowingroger_MultiMash8-13B-slerp.json diff --git a/data/allknowingroger_MultiMash9-13B-slerp.json b/data/models/allknowingroger_MultiMash9-13B-slerp.json similarity index 100% rename from data/allknowingroger_MultiMash9-13B-slerp.json rename to data/models/allknowingroger_MultiMash9-13B-slerp.json diff --git a/data/allknowingroger_MultiMerge-7B-slerp.json b/data/models/allknowingroger_MultiMerge-7B-slerp.json similarity index 100% rename from data/allknowingroger_MultiMerge-7B-slerp.json rename to data/models/allknowingroger_MultiMerge-7B-slerp.json diff --git a/data/allknowingroger_Multimash3-12B-slerp.json b/data/models/allknowingroger_Multimash3-12B-slerp.json similarity index 100% rename from data/allknowingroger_Multimash3-12B-slerp.json rename to data/models/allknowingroger_Multimash3-12B-slerp.json diff --git a/data/allknowingroger_Multimerge-19B-pass.json b/data/models/allknowingroger_Multimerge-19B-pass.json similarity index 100% rename from data/allknowingroger_Multimerge-19B-pass.json rename to data/models/allknowingroger_Multimerge-19B-pass.json diff --git a/data/allknowingroger_MultiverseEx26-7B-slerp.json b/data/models/allknowingroger_MultiverseEx26-7B-slerp.json similarity index 100% rename from data/allknowingroger_MultiverseEx26-7B-slerp.json rename to data/models/allknowingroger_MultiverseEx26-7B-slerp.json diff --git a/data/allknowingroger_NeuralWestSeverus-7B-slerp.json b/data/models/allknowingroger_NeuralWestSeverus-7B-slerp.json similarity index 100% rename from data/allknowingroger_NeuralWestSeverus-7B-slerp.json rename to data/models/allknowingroger_NeuralWestSeverus-7B-slerp.json diff --git a/data/allknowingroger_Neuralcoven-7B-slerp.json b/data/models/allknowingroger_Neuralcoven-7B-slerp.json similarity index 100% rename from data/allknowingroger_Neuralcoven-7B-slerp.json rename to data/models/allknowingroger_Neuralcoven-7B-slerp.json diff --git a/data/allknowingroger_Neuralmultiverse-7B-slerp.json b/data/models/allknowingroger_Neuralmultiverse-7B-slerp.json similarity index 100% rename from data/allknowingroger_Neuralmultiverse-7B-slerp.json rename to data/models/allknowingroger_Neuralmultiverse-7B-slerp.json diff --git a/data/allknowingroger_Ph3della5-14B.json b/data/models/allknowingroger_Ph3della5-14B.json similarity index 100% rename from data/allknowingroger_Ph3della5-14B.json rename to data/models/allknowingroger_Ph3della5-14B.json diff --git a/data/allknowingroger_Ph3merge-14B.json b/data/models/allknowingroger_Ph3merge-14B.json similarity index 100% rename from data/allknowingroger_Ph3merge-14B.json rename to data/models/allknowingroger_Ph3merge-14B.json diff --git a/data/allknowingroger_Ph3merge2-14B.json b/data/models/allknowingroger_Ph3merge2-14B.json similarity index 100% rename from data/allknowingroger_Ph3merge2-14B.json rename to data/models/allknowingroger_Ph3merge2-14B.json diff --git a/data/allknowingroger_Ph3merge3-14B.json b/data/models/allknowingroger_Ph3merge3-14B.json similarity index 100% rename from data/allknowingroger_Ph3merge3-14B.json rename to data/models/allknowingroger_Ph3merge3-14B.json diff --git a/data/allknowingroger_Ph3task1-14B.json b/data/models/allknowingroger_Ph3task1-14B.json similarity index 100% rename from data/allknowingroger_Ph3task1-14B.json rename to data/models/allknowingroger_Ph3task1-14B.json diff --git a/data/allknowingroger_Ph3task2-14B.json b/data/models/allknowingroger_Ph3task2-14B.json similarity index 100% rename from data/allknowingroger_Ph3task2-14B.json rename to data/models/allknowingroger_Ph3task2-14B.json diff --git a/data/allknowingroger_Ph3task3-14B.json b/data/models/allknowingroger_Ph3task3-14B.json similarity index 100% rename from data/allknowingroger_Ph3task3-14B.json rename to data/models/allknowingroger_Ph3task3-14B.json diff --git a/data/allknowingroger_Ph3unsloth-3B-slerp.json b/data/models/allknowingroger_Ph3unsloth-3B-slerp.json similarity index 100% rename from data/allknowingroger_Ph3unsloth-3B-slerp.json rename to data/models/allknowingroger_Ph3unsloth-3B-slerp.json diff --git a/data/allknowingroger_Phi3mash1-17B-pass.json b/data/models/allknowingroger_Phi3mash1-17B-pass.json similarity index 100% rename from data/allknowingroger_Phi3mash1-17B-pass.json rename to data/models/allknowingroger_Phi3mash1-17B-pass.json diff --git a/data/allknowingroger_Quen2-65B.json b/data/models/allknowingroger_Quen2-65B.json similarity index 100% rename from data/allknowingroger_Quen2-65B.json rename to data/models/allknowingroger_Quen2-65B.json diff --git a/data/allknowingroger_Qwen2.5-42B-AGI.json b/data/models/allknowingroger_Qwen2.5-42B-AGI.json similarity index 100% rename from data/allknowingroger_Qwen2.5-42B-AGI.json rename to data/models/allknowingroger_Qwen2.5-42B-AGI.json diff --git a/data/allknowingroger_Qwen2.5-7B-task2.json b/data/models/allknowingroger_Qwen2.5-7B-task2.json similarity index 100% rename from data/allknowingroger_Qwen2.5-7B-task2.json rename to data/models/allknowingroger_Qwen2.5-7B-task2.json diff --git a/data/allknowingroger_Qwen2.5-7B-task3.json b/data/models/allknowingroger_Qwen2.5-7B-task3.json similarity index 100% rename from data/allknowingroger_Qwen2.5-7B-task3.json rename to data/models/allknowingroger_Qwen2.5-7B-task3.json diff --git a/data/allknowingroger_Qwen2.5-7B-task4.json b/data/models/allknowingroger_Qwen2.5-7B-task4.json similarity index 100% rename from data/allknowingroger_Qwen2.5-7B-task4.json rename to data/models/allknowingroger_Qwen2.5-7B-task4.json diff --git a/data/allknowingroger_Qwen2.5-7B-task7.json b/data/models/allknowingroger_Qwen2.5-7B-task7.json similarity index 100% rename from data/allknowingroger_Qwen2.5-7B-task7.json rename to data/models/allknowingroger_Qwen2.5-7B-task7.json diff --git a/data/allknowingroger_Qwen2.5-7B-task8.json b/data/models/allknowingroger_Qwen2.5-7B-task8.json similarity index 100% rename from data/allknowingroger_Qwen2.5-7B-task8.json rename to data/models/allknowingroger_Qwen2.5-7B-task8.json diff --git a/data/allknowingroger_Qwen2.5-slerp-14B.json b/data/models/allknowingroger_Qwen2.5-slerp-14B.json similarity index 100% rename from data/allknowingroger_Qwen2.5-slerp-14B.json rename to data/models/allknowingroger_Qwen2.5-slerp-14B.json diff --git a/data/allknowingroger_QwenSlerp12-7B.json b/data/models/allknowingroger_QwenSlerp12-7B.json similarity index 100% rename from data/allknowingroger_QwenSlerp12-7B.json rename to data/models/allknowingroger_QwenSlerp12-7B.json diff --git a/data/allknowingroger_QwenSlerp4-14B.json b/data/models/allknowingroger_QwenSlerp4-14B.json similarity index 100% rename from data/allknowingroger_QwenSlerp4-14B.json rename to data/models/allknowingroger_QwenSlerp4-14B.json diff --git a/data/allknowingroger_QwenSlerp5-14B.json b/data/models/allknowingroger_QwenSlerp5-14B.json similarity index 100% rename from data/allknowingroger_QwenSlerp5-14B.json rename to data/models/allknowingroger_QwenSlerp5-14B.json diff --git a/data/allknowingroger_QwenSlerp6-14B.json b/data/models/allknowingroger_QwenSlerp6-14B.json similarity index 100% rename from data/allknowingroger_QwenSlerp6-14B.json rename to data/models/allknowingroger_QwenSlerp6-14B.json diff --git a/data/allknowingroger_QwenStock1-14B.json b/data/models/allknowingroger_QwenStock1-14B.json similarity index 100% rename from data/allknowingroger_QwenStock1-14B.json rename to data/models/allknowingroger_QwenStock1-14B.json diff --git a/data/allknowingroger_QwenStock2-14B.json b/data/models/allknowingroger_QwenStock2-14B.json similarity index 100% rename from data/allknowingroger_QwenStock2-14B.json rename to data/models/allknowingroger_QwenStock2-14B.json diff --git a/data/allknowingroger_QwenStock3-14B.json b/data/models/allknowingroger_QwenStock3-14B.json similarity index 100% rename from data/allknowingroger_QwenStock3-14B.json rename to data/models/allknowingroger_QwenStock3-14B.json diff --git a/data/allknowingroger_Qwenslerp2-14B.json b/data/models/allknowingroger_Qwenslerp2-14B.json similarity index 100% rename from data/allknowingroger_Qwenslerp2-14B.json rename to data/models/allknowingroger_Qwenslerp2-14B.json diff --git a/data/allknowingroger_Qwenslerp2-7B.json b/data/models/allknowingroger_Qwenslerp2-7B.json similarity index 100% rename from data/allknowingroger_Qwenslerp2-7B.json rename to data/models/allknowingroger_Qwenslerp2-7B.json diff --git a/data/allknowingroger_Qwenslerp3-14B.json b/data/models/allknowingroger_Qwenslerp3-14B.json similarity index 100% rename from data/allknowingroger_Qwenslerp3-14B.json rename to data/models/allknowingroger_Qwenslerp3-14B.json diff --git a/data/allknowingroger_Qwenslerp3-7B.json b/data/models/allknowingroger_Qwenslerp3-7B.json similarity index 100% rename from data/allknowingroger_Qwenslerp3-7B.json rename to data/models/allknowingroger_Qwenslerp3-7B.json diff --git a/data/allknowingroger_ROGERphi-7B-slerp.json b/data/models/allknowingroger_ROGERphi-7B-slerp.json similarity index 100% rename from data/allknowingroger_ROGERphi-7B-slerp.json rename to data/models/allknowingroger_ROGERphi-7B-slerp.json diff --git a/data/allknowingroger_RogerMerge-7B-slerp.json b/data/models/allknowingroger_RogerMerge-7B-slerp.json similarity index 100% rename from data/allknowingroger_RogerMerge-7B-slerp.json rename to data/models/allknowingroger_RogerMerge-7B-slerp.json diff --git a/data/allknowingroger_Rombos-LLM-V2.5-Qwen-42b.json b/data/models/allknowingroger_Rombos-LLM-V2.5-Qwen-42b.json similarity index 100% rename from data/allknowingroger_Rombos-LLM-V2.5-Qwen-42b.json rename to data/models/allknowingroger_Rombos-LLM-V2.5-Qwen-42b.json diff --git a/data/allknowingroger_Strangecoven-7B-slerp.json b/data/models/allknowingroger_Strangecoven-7B-slerp.json similarity index 100% rename from data/allknowingroger_Strangecoven-7B-slerp.json rename to data/models/allknowingroger_Strangecoven-7B-slerp.json diff --git a/data/allknowingroger_Weirdslerp2-25B.json b/data/models/allknowingroger_Weirdslerp2-25B.json similarity index 100% rename from data/allknowingroger_Weirdslerp2-25B.json rename to data/models/allknowingroger_Weirdslerp2-25B.json diff --git a/data/allknowingroger_WestlakeMaziyar-7B-slerp.json b/data/models/allknowingroger_WestlakeMaziyar-7B-slerp.json similarity index 100% rename from data/allknowingroger_WestlakeMaziyar-7B-slerp.json rename to data/models/allknowingroger_WestlakeMaziyar-7B-slerp.json diff --git a/data/allknowingroger_YamMaths-7B-slerp.json b/data/models/allknowingroger_YamMaths-7B-slerp.json similarity index 100% rename from data/allknowingroger_YamMaths-7B-slerp.json rename to data/models/allknowingroger_YamMaths-7B-slerp.json diff --git a/data/allknowingroger_Yi-1.5-34B.json b/data/models/allknowingroger_Yi-1.5-34B.json similarity index 100% rename from data/allknowingroger_Yi-1.5-34B.json rename to data/models/allknowingroger_Yi-1.5-34B.json diff --git a/data/allknowingroger_Yi-blossom-40B.json b/data/models/allknowingroger_Yi-blossom-40B.json similarity index 100% rename from data/allknowingroger_Yi-blossom-40B.json rename to data/models/allknowingroger_Yi-blossom-40B.json diff --git a/data/allknowingroger_Yibuddy-35B.json b/data/models/allknowingroger_Yibuddy-35B.json similarity index 100% rename from data/allknowingroger_Yibuddy-35B.json rename to data/models/allknowingroger_Yibuddy-35B.json diff --git a/data/allknowingroger_Yillama-40B.json b/data/models/allknowingroger_Yillama-40B.json similarity index 100% rename from data/allknowingroger_Yillama-40B.json rename to data/models/allknowingroger_Yillama-40B.json diff --git a/data/allknowingroger_Yislerp-34B.json b/data/models/allknowingroger_Yislerp-34B.json similarity index 100% rename from data/allknowingroger_Yislerp-34B.json rename to data/models/allknowingroger_Yislerp-34B.json diff --git a/data/allknowingroger_Yislerp2-34B.json b/data/models/allknowingroger_Yislerp2-34B.json similarity index 100% rename from data/allknowingroger_Yislerp2-34B.json rename to data/models/allknowingroger_Yislerp2-34B.json diff --git a/data/allknowingroger_Yunconglong-13B-slerp.json b/data/models/allknowingroger_Yunconglong-13B-slerp.json similarity index 100% rename from data/allknowingroger_Yunconglong-13B-slerp.json rename to data/models/allknowingroger_Yunconglong-13B-slerp.json diff --git a/data/allknowingroger_limyClown-7B-slerp.json b/data/models/allknowingroger_limyClown-7B-slerp.json similarity index 100% rename from data/allknowingroger_limyClown-7B-slerp.json rename to data/models/allknowingroger_limyClown-7B-slerp.json diff --git a/data/allknowingroger_llama3-Jallabi-40B-s.json b/data/models/allknowingroger_llama3-Jallabi-40B-s.json similarity index 100% rename from data/allknowingroger_llama3-Jallabi-40B-s.json rename to data/models/allknowingroger_llama3-Jallabi-40B-s.json diff --git a/data/allknowingroger_llama3AnFeng-40B.json b/data/models/allknowingroger_llama3AnFeng-40B.json similarity index 100% rename from data/allknowingroger_llama3AnFeng-40B.json rename to data/models/allknowingroger_llama3AnFeng-40B.json diff --git a/data/allura-org_L3.1-8b-RP-Ink.json b/data/models/allura-org_L3.1-8b-RP-Ink.json similarity index 100% rename from data/allura-org_L3.1-8b-RP-Ink.json rename to data/models/allura-org_L3.1-8b-RP-Ink.json diff --git a/data/allura-org_MN-12b-RP-Ink.json b/data/models/allura-org_MN-12b-RP-Ink.json similarity index 100% rename from data/allura-org_MN-12b-RP-Ink.json rename to data/models/allura-org_MN-12b-RP-Ink.json diff --git a/data/allura-org_MS-Meadowlark-22B.json b/data/models/allura-org_MS-Meadowlark-22B.json similarity index 100% rename from data/allura-org_MS-Meadowlark-22B.json rename to data/models/allura-org_MS-Meadowlark-22B.json diff --git a/data/allura-org_Mistral-Small-24b-Sertraline-0304.json b/data/models/allura-org_Mistral-Small-24b-Sertraline-0304.json similarity index 100% rename from data/allura-org_Mistral-Small-24b-Sertraline-0304.json rename to data/models/allura-org_Mistral-Small-24b-Sertraline-0304.json diff --git a/data/allura-org_Mistral-Small-Sisyphus-24b-2503.json b/data/models/allura-org_Mistral-Small-Sisyphus-24b-2503.json similarity index 100% rename from data/allura-org_Mistral-Small-Sisyphus-24b-2503.json rename to data/models/allura-org_Mistral-Small-Sisyphus-24b-2503.json diff --git a/data/allura-org_MoE-Girl-1BA-7BT.json b/data/models/allura-org_MoE-Girl-1BA-7BT.json similarity index 100% rename from data/allura-org_MoE-Girl-1BA-7BT.json rename to data/models/allura-org_MoE-Girl-1BA-7BT.json diff --git a/data/allura-org_TQ2.5-14B-Aletheia-v1.json b/data/models/allura-org_TQ2.5-14B-Aletheia-v1.json similarity index 100% rename from data/allura-org_TQ2.5-14B-Aletheia-v1.json rename to data/models/allura-org_TQ2.5-14B-Aletheia-v1.json diff --git a/data/allura-org_TQ2.5-14B-Neon-v1.json b/data/models/allura-org_TQ2.5-14B-Neon-v1.json similarity index 100% rename from data/allura-org_TQ2.5-14B-Neon-v1.json rename to data/models/allura-org_TQ2.5-14B-Neon-v1.json diff --git a/data/allura-org_Teleut-7b.json b/data/models/allura-org_Teleut-7b.json similarity index 100% rename from data/allura-org_Teleut-7b.json rename to data/models/allura-org_Teleut-7b.json diff --git a/data/aloobun_Meta-Llama-3-7B-28Layers.json b/data/models/aloobun_Meta-Llama-3-7B-28Layers.json similarity index 100% rename from data/aloobun_Meta-Llama-3-7B-28Layers.json rename to data/models/aloobun_Meta-Llama-3-7B-28Layers.json diff --git a/data/aloobun_d-SmolLM2-360M.json b/data/models/aloobun_d-SmolLM2-360M.json similarity index 100% rename from data/aloobun_d-SmolLM2-360M.json rename to data/models/aloobun_d-SmolLM2-360M.json diff --git a/data/alpindale_WizardLM-2-8x22B.json b/data/models/alpindale_WizardLM-2-8x22B.json similarity index 100% rename from data/alpindale_WizardLM-2-8x22B.json rename to data/models/alpindale_WizardLM-2-8x22B.json diff --git a/data/alpindale_magnum-72b-v1.json b/data/models/alpindale_magnum-72b-v1.json similarity index 100% rename from data/alpindale_magnum-72b-v1.json rename to data/models/alpindale_magnum-72b-v1.json diff --git a/data/altomek_YiSM-34B-0rn.json b/data/models/altomek_YiSM-34B-0rn.json similarity index 100% rename from data/altomek_YiSM-34B-0rn.json rename to data/models/altomek_YiSM-34B-0rn.json diff --git a/data/amazon_MegaBeam-Mistral-7B-300k.json b/data/models/amazon_MegaBeam-Mistral-7B-300k.json similarity index 100% rename from data/amazon_MegaBeam-Mistral-7B-300k.json rename to data/models/amazon_MegaBeam-Mistral-7B-300k.json diff --git a/data/amazon_nova-lite-v1_0.json b/data/models/amazon_nova-lite-v1_0.json similarity index 100% rename from data/amazon_nova-lite-v1_0.json rename to data/models/amazon_nova-lite-v1_0.json diff --git a/data/amazon_nova-micro-v1_0.json b/data/models/amazon_nova-micro-v1_0.json similarity index 100% rename from data/amazon_nova-micro-v1_0.json rename to data/models/amazon_nova-micro-v1_0.json diff --git a/data/amazon_nova-premier-v1_0.json b/data/models/amazon_nova-premier-v1_0.json similarity index 100% rename from data/amazon_nova-premier-v1_0.json rename to data/models/amazon_nova-premier-v1_0.json diff --git a/data/amazon_nova-pro-v1_0.json b/data/models/amazon_nova-pro-v1_0.json similarity index 100% rename from data/amazon_nova-pro-v1_0.json rename to data/models/amazon_nova-pro-v1_0.json diff --git a/data/amd_AMD-Llama-135m.json b/data/models/amd_AMD-Llama-135m.json similarity index 100% rename from data/amd_AMD-Llama-135m.json rename to data/models/amd_AMD-Llama-135m.json diff --git a/data/anakin87_gemma-2b-orpo.json b/data/models/anakin87_gemma-2b-orpo.json similarity index 100% rename from data/anakin87_gemma-2b-orpo.json rename to data/models/anakin87_gemma-2b-orpo.json diff --git a/data/anthracite-org_magnum-v1-72b.json b/data/models/anthracite-org_magnum-v1-72b.json similarity index 100% rename from data/anthracite-org_magnum-v1-72b.json rename to data/models/anthracite-org_magnum-v1-72b.json diff --git a/data/anthracite-org_magnum-v2-12b.json b/data/models/anthracite-org_magnum-v2-12b.json similarity index 100% rename from data/anthracite-org_magnum-v2-12b.json rename to data/models/anthracite-org_magnum-v2-12b.json diff --git a/data/anthracite-org_magnum-v2-72b.json b/data/models/anthracite-org_magnum-v2-72b.json similarity index 100% rename from data/anthracite-org_magnum-v2-72b.json rename to data/models/anthracite-org_magnum-v2-72b.json diff --git a/data/anthracite-org_magnum-v2.5-12b-kto.json b/data/models/anthracite-org_magnum-v2.5-12b-kto.json similarity index 100% rename from data/anthracite-org_magnum-v2.5-12b-kto.json rename to data/models/anthracite-org_magnum-v2.5-12b-kto.json diff --git a/data/anthracite-org_magnum-v3-27b-kto.json b/data/models/anthracite-org_magnum-v3-27b-kto.json similarity index 100% rename from data/anthracite-org_magnum-v3-27b-kto.json rename to data/models/anthracite-org_magnum-v3-27b-kto.json diff --git a/data/anthracite-org_magnum-v3-34b.json b/data/models/anthracite-org_magnum-v3-34b.json similarity index 100% rename from data/anthracite-org_magnum-v3-34b.json rename to data/models/anthracite-org_magnum-v3-34b.json diff --git a/data/anthracite-org_magnum-v3-9b-chatml.json b/data/models/anthracite-org_magnum-v3-9b-chatml.json similarity index 100% rename from data/anthracite-org_magnum-v3-9b-chatml.json rename to data/models/anthracite-org_magnum-v3-9b-chatml.json diff --git a/data/anthracite-org_magnum-v3-9b-customgemma2.json b/data/models/anthracite-org_magnum-v3-9b-customgemma2.json similarity index 100% rename from data/anthracite-org_magnum-v3-9b-customgemma2.json rename to data/models/anthracite-org_magnum-v3-9b-customgemma2.json diff --git a/data/anthracite-org_magnum-v4-12b.json b/data/models/anthracite-org_magnum-v4-12b.json similarity index 100% rename from data/anthracite-org_magnum-v4-12b.json rename to data/models/anthracite-org_magnum-v4-12b.json diff --git a/data/anthracite-org_magnum-v4-22b.json b/data/models/anthracite-org_magnum-v4-22b.json similarity index 100% rename from data/anthracite-org_magnum-v4-22b.json rename to data/models/anthracite-org_magnum-v4-22b.json diff --git a/data/anthracite-org_magnum-v4-27b.json b/data/models/anthracite-org_magnum-v4-27b.json similarity index 100% rename from data/anthracite-org_magnum-v4-27b.json rename to data/models/anthracite-org_magnum-v4-27b.json diff --git a/data/anthracite-org_magnum-v4-9b.json b/data/models/anthracite-org_magnum-v4-9b.json similarity index 100% rename from data/anthracite-org_magnum-v4-9b.json rename to data/models/anthracite-org_magnum-v4-9b.json diff --git a/data/anthropic_Opus_4.1.json b/data/models/anthropic_Opus_4.1.json similarity index 100% rename from data/anthropic_Opus_4.1.json rename to data/models/anthropic_Opus_4.1.json diff --git a/data/anthropic_Opus_4.5.json b/data/models/anthropic_Opus_4.5.json similarity index 100% rename from data/anthropic_Opus_4.5.json rename to data/models/anthropic_Opus_4.5.json diff --git a/data/anthropic_Opus_4.6.json b/data/models/anthropic_Opus_4.6.json similarity index 100% rename from data/anthropic_Opus_4.6.json rename to data/models/anthropic_Opus_4.6.json diff --git a/data/anthropic_Sonnet_4.5.json b/data/models/anthropic_Sonnet_4.5.json similarity index 100% rename from data/anthropic_Sonnet_4.5.json rename to data/models/anthropic_Sonnet_4.5.json diff --git a/data/anthropic_claude-2.0.json b/data/models/anthropic_claude-2.0.json similarity index 100% rename from data/anthropic_claude-2.0.json rename to data/models/anthropic_claude-2.0.json diff --git a/data/anthropic_claude-2.1.json b/data/models/anthropic_claude-2.1.json similarity index 100% rename from data/anthropic_claude-2.1.json rename to data/models/anthropic_claude-2.1.json diff --git a/data/anthropic_claude-3-5-haiku-20241022.json b/data/models/anthropic_claude-3-5-haiku-20241022.json similarity index 100% rename from data/anthropic_claude-3-5-haiku-20241022.json rename to data/models/anthropic_claude-3-5-haiku-20241022.json index 40d89ab29be589f81577a4b7da1462f24631eedf..d77bc43873534d2017192f9adf45a50bd095f489 100644 --- a/data/anthropic_claude-3-5-haiku-20241022.json +++ b/data/models/anthropic_claude-3-5-haiku-20241022.json @@ -7,8 +7,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/anthropic_claude-3-5-haiku-20241022/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/anthropic_claude-3-5-haiku-20241022/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -522,8 +522,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/anthropic_claude-3-5-haiku-20241022/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/anthropic_claude-3-5-haiku-20241022/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/anthropic_claude-3-5-sonnet-20240620.json b/data/models/anthropic_claude-3-5-sonnet-20240620.json similarity index 100% rename from data/anthropic_claude-3-5-sonnet-20240620.json rename to data/models/anthropic_claude-3-5-sonnet-20240620.json diff --git a/data/anthropic_claude-3-5-sonnet-20241022.json b/data/models/anthropic_claude-3-5-sonnet-20241022.json similarity index 100% rename from data/anthropic_claude-3-5-sonnet-20241022.json rename to data/models/anthropic_claude-3-5-sonnet-20241022.json diff --git a/data/anthropic_claude-3-7-sonnet-20250219.json b/data/models/anthropic_claude-3-7-sonnet-20250219.json similarity index 100% rename from data/anthropic_claude-3-7-sonnet-20250219.json rename to data/models/anthropic_claude-3-7-sonnet-20250219.json diff --git a/data/anthropic_claude-3-haiku-20240307.json b/data/models/anthropic_claude-3-haiku-20240307.json similarity index 100% rename from data/anthropic_claude-3-haiku-20240307.json rename to data/models/anthropic_claude-3-haiku-20240307.json diff --git a/data/anthropic_claude-3-opus-20240229.json b/data/models/anthropic_claude-3-opus-20240229.json similarity index 100% rename from data/anthropic_claude-3-opus-20240229.json rename to data/models/anthropic_claude-3-opus-20240229.json diff --git a/data/anthropic_claude-3-sonnet-20240229.json b/data/models/anthropic_claude-3-sonnet-20240229.json similarity index 100% rename from data/anthropic_claude-3-sonnet-20240229.json rename to data/models/anthropic_claude-3-sonnet-20240229.json diff --git a/data/anthropic_claude-3.7-sonnet.json b/data/models/anthropic_claude-3.7-sonnet.json similarity index 100% rename from data/anthropic_claude-3.7-sonnet.json rename to data/models/anthropic_claude-3.7-sonnet.json diff --git a/data/anthropic_claude-haiku-4.5.json b/data/models/anthropic_claude-haiku-4.5.json similarity index 99% rename from data/anthropic_claude-haiku-4.5.json rename to data/models/anthropic_claude-haiku-4.5.json index d650e1c5480c1672cd640e832f6a73b85ae2e9a7..e264d494135c2a9e41ab0ec25ddb1450437254d8 100644 --- a/data/anthropic_claude-haiku-4.5.json +++ b/data/models/anthropic_claude-haiku-4.5.json @@ -4,8 +4,8 @@ "id": "anthropic/claude-haiku-4.5", "developer": "Anthropic", "additional_details": { - "agent_name": "Claude Code", - "agent_organization": "Anthropic" + "agent_name": "Goose", + "agent_organization": "Block" } }, "evaluations": [ @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__claude-haiku-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__claude-haiku-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 28.3, + "score": 13.9, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/openhands__claude-haiku-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/claude-code__claude-haiku-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 13.9, + "score": 27.5, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/goose__claude-haiku-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__claude-haiku-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-11", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,7 +265,7 @@ "max_score": 100.0 }, "score_details": { - "score": 35.5, + "score": 28.3, "uncertainty": { "standard_error": { "value": 2.9 @@ -275,7 +275,7 @@ }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -306,7 +306,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/claude-code__claude-haiku-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/goose__claude-haiku-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -330,7 +330,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-12-11", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -339,17 +339,17 @@ "max_score": 100.0 }, "score_details": { - "score": 27.5, + "score": 35.5, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -366,7 +366,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/anthropic_claude-instant-1.2.json b/data/models/anthropic_claude-instant-1.2.json similarity index 100% rename from data/anthropic_claude-instant-1.2.json rename to data/models/anthropic_claude-instant-1.2.json diff --git a/data/anthropic_claude-opus-4-1-20250805.json b/data/models/anthropic_claude-opus-4-1-20250805.json similarity index 100% rename from data/anthropic_claude-opus-4-1-20250805.json rename to data/models/anthropic_claude-opus-4-1-20250805.json diff --git a/data/anthropic_claude-opus-4-20250514-thinking-10k.json b/data/models/anthropic_claude-opus-4-20250514-thinking-10k.json similarity index 100% rename from data/anthropic_claude-opus-4-20250514-thinking-10k.json rename to data/models/anthropic_claude-opus-4-20250514-thinking-10k.json diff --git a/data/anthropic_claude-opus-4-20250514.json b/data/models/anthropic_claude-opus-4-20250514.json similarity index 100% rename from data/anthropic_claude-opus-4-20250514.json rename to data/models/anthropic_claude-opus-4-20250514.json diff --git a/data/anthropic_claude-opus-4-5.json b/data/models/anthropic_claude-opus-4-5.json similarity index 99% rename from data/anthropic_claude-opus-4-5.json rename to data/models/anthropic_claude-opus-4-5.json index f441521a2e20d0c8f8fe17763040e639c34a3f49..4e4da6a5a225e683bd80d8e74b30768f496388d6 100644 --- a/data/anthropic_claude-opus-4-5.json +++ b/data/models/anthropic_claude-opus-4-5.json @@ -4,8 +4,8 @@ "id": "anthropic/claude-opus-4-5", "developer": "Anthropic", "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } }, "evaluations": [ @@ -146,7 +146,7 @@ } }, { - "evaluation_id": "appworld/test_normal/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "appworld/test_normal/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -178,23 +178,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.64, + "score": 0.61, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "3.43", - "total_run_cost": "343.32", - "average_steps": "20.06", - "percent_finished": "0.82" + "average_agent_cost": "11.32", + "total_run_cost": "1132.47", + "average_steps": "21.99", + "percent_finished": "0.83" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -206,15 +206,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "appworld/test_normal/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "appworld/test_normal/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -246,23 +246,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.61, + "score": 0.64, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "11.32", - "total_run_cost": "1132.47", - "average_steps": "21.99", - "percent_finished": "0.83" + "average_agent_cost": "3.43", + "total_run_cost": "343.32", + "average_steps": "20.06", + "percent_finished": "0.82" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -274,8 +274,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -418,7 +418,7 @@ } }, { - "evaluation_id": "browsecompplus/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "browsecompplus/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -450,23 +450,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5294, + "score": 0.61, "uncertainty": { - "num_samples": 51 + "num_samples": 100 }, "details": { - "average_agent_cost": "11.66", - "total_run_cost": "594.68", - "average_steps": "31.04", - "percent_finished": "0.8431" + "average_agent_cost": "6.3", + "total_run_cost": "630.56", + "average_steps": "24.16", + "percent_finished": "1.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -478,15 +478,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "browsecompplus/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "browsecompplus/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -518,23 +518,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.49, + "score": 0.5294, "uncertainty": { - "num_samples": 100 + "num_samples": 51 }, "details": { - "average_agent_cost": "7.09", - "total_run_cost": "709.54", - "average_steps": "21.66", - "percent_finished": "0.93" + "average_agent_cost": "11.66", + "total_run_cost": "594.68", + "average_steps": "31.04", + "percent_finished": "0.8431" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -546,15 +546,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "browsecompplus/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "browsecompplus/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -601,8 +601,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -614,15 +614,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "browsecompplus/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "browsecompplus/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -654,23 +654,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.61, + "score": 0.49, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "6.3", - "total_run_cost": "630.56", - "average_steps": "24.16", - "percent_finished": "1.0" + "average_agent_cost": "7.09", + "total_run_cost": "709.54", + "average_steps": "21.66", + "percent_finished": "0.93" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -682,8 +682,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -758,7 +758,7 @@ } }, { - "evaluation_id": "swe-bench/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "swe-bench/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -790,14 +790,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.8072, + "score": 0.6061, "uncertainty": { - "num_samples": 83 + "num_samples": 99 }, "details": { - "average_agent_cost": "2.96", - "total_run_cost": "245.78", - "average_steps": "34.1", + "average_agent_cost": "3.97", + "total_run_cost": "393.16", + "average_steps": "43.44", "percent_finished": "1.0" } }, @@ -805,8 +805,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -818,15 +818,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "tau-bench-2/airline/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "swe-bench/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -839,33 +839,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_airline", + "benchmark": "swe-bench", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/airline", + "evaluation_name": "swe-bench", "source_data": { - "dataset_name": "tau-bench-2/airline", + "dataset_name": "swe-bench", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", + "evaluation_description": "SWE-bench benchmark evaluation", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.72, + "score": 0.7423, "uncertainty": { - "num_samples": 50 + "num_samples": 97 }, "details": { - "average_agent_cost": "0.78", - "total_run_cost": "39.67", - "average_steps": "11.88", + "average_agent_cost": "5.6", + "total_run_cost": "543.62", + "average_steps": "31.76", "percent_finished": "1.0" } }, @@ -873,8 +873,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -886,15 +886,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "swe-bench/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -907,33 +907,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_airline", + "benchmark": "swe-bench", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/airline", + "evaluation_name": "swe-bench", "source_data": { - "dataset_name": "tau-bench-2/airline", + "dataset_name": "swe-bench", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", + "evaluation_description": "SWE-bench benchmark evaluation", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.66, + "score": 0.8072, "uncertainty": { - "num_samples": 50 + "num_samples": 83 }, "details": { - "average_agent_cost": "0.47", - "total_run_cost": "24.23", - "average_steps": "10.0", + "average_agent_cost": "2.96", + "total_run_cost": "245.78", + "average_steps": "34.1", "percent_finished": "1.0" } }, @@ -941,8 +941,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -954,15 +954,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "tau-bench-2/airline/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "swe-bench/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -975,33 +975,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_airline", + "benchmark": "swe-bench", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/airline", + "evaluation_name": "swe-bench", "source_data": { - "dataset_name": "tau-bench-2/airline", + "dataset_name": "swe-bench", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", + "evaluation_description": "SWE-bench benchmark evaluation", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.74, + "score": 0.65, "uncertainty": { - "num_samples": 50 + "num_samples": 100 }, "details": { - "average_agent_cost": "0.72", - "total_run_cost": "36.55", - "average_steps": "12.22", + "average_agent_cost": "4.85", + "total_run_cost": "485.22", + "average_steps": "39.13", "percent_finished": "1.0" } }, @@ -1009,8 +1009,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1022,15 +1022,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "swe-bench/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1043,33 +1043,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "swe-bench", + "benchmark": "tau-bench-2_airline", "evaluation_results": [ { - "evaluation_name": "swe-bench", + "evaluation_name": "tau-bench-2/airline", "source_data": { - "dataset_name": "swe-bench", + "dataset_name": "tau-bench-2/airline", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "SWE-bench benchmark evaluation", + "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6061, + "score": 0.66, "uncertainty": { - "num_samples": 99 + "num_samples": 50 }, "details": { - "average_agent_cost": "3.97", - "total_run_cost": "393.16", - "average_steps": "43.44", + "average_agent_cost": "0.47", + "total_run_cost": "24.23", + "average_steps": "10.0", "percent_finished": "1.0" } }, @@ -1098,7 +1098,7 @@ } }, { - "evaluation_id": "swe-bench/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1111,33 +1111,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "swe-bench", + "benchmark": "tau-bench-2_airline", "evaluation_results": [ { - "evaluation_name": "swe-bench", + "evaluation_name": "tau-bench-2/airline", "source_data": { - "dataset_name": "swe-bench", + "dataset_name": "tau-bench-2/airline", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "SWE-bench benchmark evaluation", + "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.65, + "score": 0.74, "uncertainty": { - "num_samples": 100 + "num_samples": 50 }, "details": { - "average_agent_cost": "4.85", - "total_run_cost": "485.22", - "average_steps": "39.13", + "average_agent_cost": "0.72", + "total_run_cost": "36.55", + "average_steps": "12.22", "percent_finished": "1.0" } }, @@ -1145,8 +1145,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -1158,15 +1158,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "tau-bench-2/airline/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1198,14 +1198,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.66, + "score": 0.72, "uncertainty": { "num_samples": 50 }, "details": { - "average_agent_cost": "1.3", - "total_run_cost": "65.66", - "average_steps": "11.5", + "average_agent_cost": "0.78", + "total_run_cost": "39.67", + "average_steps": "11.88", "percent_finished": "1.0" } }, @@ -1213,8 +1213,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1226,8 +1226,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1302,7 +1302,7 @@ } }, { - "evaluation_id": "swe-bench/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1315,33 +1315,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "swe-bench", + "benchmark": "tau-bench-2_retail", "evaluation_results": [ { - "evaluation_name": "swe-bench", + "evaluation_name": "tau-bench-2/retail", "source_data": { - "dataset_name": "swe-bench", + "dataset_name": "tau-bench-2/retail", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "SWE-bench benchmark evaluation", + "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7423, + "score": 0.83, "uncertainty": { - "num_samples": 97 + "num_samples": 100 }, "details": { - "average_agent_cost": "5.6", - "total_run_cost": "543.62", - "average_steps": "31.76", + "average_agent_cost": "1.6", + "total_run_cost": "161.14", + "average_steps": "12.54", "percent_finished": "1.0" } }, @@ -1370,7 +1370,7 @@ } }, { - "evaluation_id": "tau-bench-2/retail/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1417,8 +1417,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -1430,15 +1430,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "tau-bench-2/retail/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1485,8 +1485,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -1498,15 +1498,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "tau-bench-2/retail/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1538,14 +1538,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.83, + "score": 0.78, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "1.6", - "total_run_cost": "161.14", - "average_steps": "12.54", + "average_agent_cost": "0.67", + "total_run_cost": "68.24", + "average_steps": "11.71", "percent_finished": "1.0" } }, @@ -1553,8 +1553,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1566,15 +1566,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1587,33 +1587,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_telecom", + "benchmark": "tau-bench-2_retail", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/telecom", + "evaluation_name": "tau-bench-2/retail", "source_data": { - "dataset_name": "tau-bench-2/telecom", + "dataset_name": "tau-bench-2/retail", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (telecom subset)", + "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.58, + "score": 0.85, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "1.06", - "total_run_cost": "114.62", - "average_steps": "13.77", + "average_agent_cost": "0.55", + "total_run_cost": "56.18", + "average_steps": "12.54", "percent_finished": "1.0" } }, @@ -1621,8 +1621,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -1634,15 +1634,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "tau-bench-2/retail/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1655,33 +1655,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_retail", + "benchmark": "tau-bench-2_airline", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/retail", + "evaluation_name": "tau-bench-2/airline", "source_data": { - "dataset_name": "tau-bench-2/retail", + "dataset_name": "tau-bench-2/airline", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)", + "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.85, + "score": 0.66, "uncertainty": { - "num_samples": 100 + "num_samples": 50 }, "details": { - "average_agent_cost": "0.55", - "total_run_cost": "56.18", - "average_steps": "12.54", + "average_agent_cost": "1.3", + "total_run_cost": "65.66", + "average_steps": "11.5", "percent_finished": "1.0" } }, @@ -1689,8 +1689,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -1702,15 +1702,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "tau-bench-2/retail/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1723,33 +1723,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_retail", + "benchmark": "tau-bench-2_telecom", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/retail", + "evaluation_name": "tau-bench-2/telecom", "source_data": { - "dataset_name": "tau-bench-2/retail", + "dataset_name": "tau-bench-2/telecom", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)", + "evaluation_description": "Tau Bench 2 benchmark evaluation (telecom subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.78, + "score": 0.76, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.67", - "total_run_cost": "68.24", - "average_steps": "11.71", + "average_agent_cost": "2.45", + "total_run_cost": "255.97", + "average_steps": "18.71", "percent_finished": "1.0" } }, @@ -1757,8 +1757,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -1770,15 +1770,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1815,9 +1815,9 @@ "num_samples": 100 }, "details": { - "average_agent_cost": "2.45", - "total_run_cost": "255.97", - "average_steps": "18.71", + "average_agent_cost": "0.92", + "total_run_cost": "102.01", + "average_steps": "17.22", "percent_finished": "1.0" } }, @@ -1825,8 +1825,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -1838,15 +1838,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1878,14 +1878,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.84, + "score": 0.76, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "1.25", - "total_run_cost": "136.84", - "average_steps": "17.15", + "average_agent_cost": "0.92", + "total_run_cost": "102.01", + "average_steps": "17.22", "percent_finished": "1.0" } }, @@ -1893,8 +1893,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -1906,15 +1906,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1946,14 +1946,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.76, + "score": 0.84, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.92", - "total_run_cost": "102.01", - "average_steps": "17.22", + "average_agent_cost": "1.25", + "total_run_cost": "136.84", + "average_steps": "17.15", "percent_finished": "1.0" } }, @@ -1961,8 +1961,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -1974,15 +1974,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2014,14 +2014,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.76, + "score": 0.58, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.92", - "total_run_cost": "102.01", - "average_steps": "17.22", + "average_agent_cost": "1.06", + "total_run_cost": "114.62", + "average_steps": "13.77", "percent_finished": "1.0" } }, @@ -2029,8 +2029,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -2042,8 +2042,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } diff --git a/data/anthropic_claude-opus-4.1.json b/data/models/anthropic_claude-opus-4.1.json similarity index 99% rename from data/anthropic_claude-opus-4.1.json rename to data/models/anthropic_claude-opus-4.1.json index b9584e3380d9b950b8982e86d2b8d65bc3dee3cc..dffd22cc4ffeffdda816bc746307e1e62ff77222 100644 --- a/data/anthropic_claude-opus-4.1.json +++ b/data/models/anthropic_claude-opus-4.1.json @@ -4,13 +4,13 @@ "id": "anthropic/claude-opus-4.1", "developer": "Anthropic", "additional_details": { - "agent_name": "Mini-SWE-Agent", - "agent_organization": "Princeton" + "agent_name": "Terminus 2", + "agent_organization": "Terminal Bench" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/openhands__claude-opus-4.1/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-opus-4.1/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 36.9, + "score": 35.1, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.1/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.1/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 38.0, + "score": 34.8, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.1/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__claude-opus-4.1/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 34.8, + "score": 36.9, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-opus-4.1/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.1/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +265,17 @@ "max_score": 100.0 }, "score_details": { - "score": 35.1, + "score": 38.0, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.6 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/anthropic_claude-opus-4.5.json b/data/models/anthropic_claude-opus-4.5.json similarity index 99% rename from data/anthropic_claude-opus-4.5.json rename to data/models/anthropic_claude-opus-4.5.json index df080d7413012fa24728035c1abd3bed2bde4431..db911fda50bfaa4ee79d41805d7cf44e7a3058cc 100644 --- a/data/anthropic_claude-opus-4.5.json +++ b/data/models/anthropic_claude-opus-4.5.json @@ -4,13 +4,13 @@ "id": "anthropic/claude-opus-4.5", "developer": "Anthropic", "additional_details": { - "agent_name": "Mux", - "agent_organization": "Coder" + "agent_name": "Goose", + "agent_organization": "Block" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/goose__claude-opus-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/letta-code__claude-opus-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-11", + "evaluation_timestamp": "2025-12-17", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 54.3, + "score": 59.1, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 2.4 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/letta-code__claude-opus-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/opencode__claude-opus-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-17", + "evaluation_timestamp": "2026-01-12", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,11 @@ "max_score": 100.0 }, "score_details": { - "score": 59.1, - "uncertainty": { - "standard_error": { - "value": 2.4 - }, - "num_samples": 435 - } + "score": 51.7 }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenCode\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +138,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenCode\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +226,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/openhands__claude-opus-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mux__claude-opus-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +250,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-01-04", + "evaluation_timestamp": "2026-01-17", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +259,11 @@ "max_score": 100.0 }, "score_details": { - "score": 51.9, - "uncertainty": { - "standard_error": { - "value": 2.9 - }, - "num_samples": 435 - } + "score": 58.4 }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +280,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -306,7 +294,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/opencode__claude-opus-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__claude-opus-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -330,7 +318,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-01-12", + "evaluation_timestamp": "2026-01-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -339,11 +327,17 @@ "max_score": 100.0 }, "score_details": { - "score": 51.7 + "score": 51.9, + "uncertainty": { + "standard_error": { + "value": 2.9 + }, + "num_samples": 435 + } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenCode\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -360,7 +354,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenCode\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -374,7 +368,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/droid__claude-opus-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -398,7 +392,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-18", + "evaluation_timestamp": "2025-12-11", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -407,17 +401,17 @@ "max_score": 100.0 }, "score_details": { - "score": 52.1, + "score": 63.1, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -434,7 +428,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -448,7 +442,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/droid__claude-opus-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -472,7 +466,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-11", + "evaluation_timestamp": "2025-12-18", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -481,17 +475,17 @@ "max_score": 100.0 }, "score_details": { - "score": 63.1, + "score": 52.1, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -508,7 +502,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -522,7 +516,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mux__claude-opus-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/goose__claude-opus-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -546,7 +540,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-01-17", + "evaluation_timestamp": "2025-12-11", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -555,11 +549,17 @@ "max_score": 100.0 }, "score_details": { - "score": 58.4 + "score": 54.3, + "uncertainty": { + "standard_error": { + "value": 2.6 + }, + "num_samples": 435 + } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -576,7 +576,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/anthropic_claude-opus-4.6.json b/data/models/anthropic_claude-opus-4.6.json similarity index 99% rename from data/anthropic_claude-opus-4.6.json rename to data/models/anthropic_claude-opus-4.6.json index 58c5a7af3843a3b3a3cf98ed6a920cfdd0ab8a8f..7886ec8ae8cad49aaa26033d604cb1ebebddeda0 100644 --- a/data/anthropic_claude-opus-4.6.json +++ b/data/models/anthropic_claude-opus-4.6.json @@ -4,13 +4,13 @@ "id": "anthropic/claude-opus-4.6", "developer": "Anthropic", "additional_details": { - "agent_name": "TongAgents", - "agent_organization": "Bigai" + "agent_name": "Droid", + "agent_organization": "Factory" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/mux__claude-opus-4.6/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.6/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-13", + "evaluation_timestamp": "2026-02-07", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 66.5, + "score": 58.0, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.6/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/tongagents__claude-opus-4.6/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-07", + "evaluation_timestamp": "2026-02-22", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 58.0, + "score": 71.9, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"TongAgents\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"TongAgents\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/droid__claude-opus-4.6/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.6/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-05", + "evaluation_timestamp": "2026-02-06", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 69.9, + "score": 62.9, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/crux__claude-opus-4.6/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-kira__claude-opus-4.6/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-23", + "evaluation_timestamp": "2026-02-22", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,11 +265,17 @@ "max_score": 100.0 }, "score_details": { - "score": 66.9 + "score": 74.7, + "uncertainty": { + "standard_error": { + "value": 2.6 + }, + "num_samples": 435 + } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -286,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -300,7 +306,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.6/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mux__claude-opus-4.6/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -324,7 +330,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-06", + "evaluation_timestamp": "2026-02-13", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -333,17 +339,17 @@ "max_score": 100.0 }, "score_details": { - "score": 62.9, + "score": 66.5, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -360,7 +366,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -374,7 +380,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-kira__claude-opus-4.6/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/crux__claude-opus-4.6/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -398,7 +404,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-22", + "evaluation_timestamp": "2026-02-23", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -407,17 +413,11 @@ "max_score": 100.0 }, "score_details": { - "score": 74.7, - "uncertainty": { - "standard_error": { - "value": 2.6 - }, - "num_samples": 435 - } + "score": 66.9 }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -434,7 +434,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -448,7 +448,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/tongagents__claude-opus-4.6/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/droid__claude-opus-4.6/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -472,7 +472,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-22", + "evaluation_timestamp": "2026-02-05", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -481,17 +481,17 @@ "max_score": 100.0 }, "score_details": { - "score": 71.9, + "score": 69.9, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"TongAgents\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -508,7 +508,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"TongAgents\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/anthropic_claude-sonnet-4-20250514-thinking-10k.json b/data/models/anthropic_claude-sonnet-4-20250514-thinking-10k.json similarity index 100% rename from data/anthropic_claude-sonnet-4-20250514-thinking-10k.json rename to data/models/anthropic_claude-sonnet-4-20250514-thinking-10k.json diff --git a/data/anthropic_claude-sonnet-4-20250514.json b/data/models/anthropic_claude-sonnet-4-20250514.json similarity index 100% rename from data/anthropic_claude-sonnet-4-20250514.json rename to data/models/anthropic_claude-sonnet-4-20250514.json index b9ad033a7ea75f99c7213a0abf9a6fcf771340ca..4792ff1b3b42b04e906eb39316e2865188e5cf63 100644 --- a/data/anthropic_claude-sonnet-4-20250514.json +++ b/data/models/anthropic_claude-sonnet-4-20250514.json @@ -9,8 +9,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/anthropic_claude-sonnet-4-20250514/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/anthropic_claude-sonnet-4-20250514/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -524,8 +524,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/anthropic_claude-sonnet-4-20250514/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/anthropic_claude-sonnet-4-20250514/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/anthropic_claude-sonnet-4-5-20250929.json b/data/models/anthropic_claude-sonnet-4-5-20250929.json similarity index 100% rename from data/anthropic_claude-sonnet-4-5-20250929.json rename to data/models/anthropic_claude-sonnet-4-5-20250929.json diff --git a/data/anthropic_claude-sonnet-4.5.json b/data/models/anthropic_claude-sonnet-4.5.json similarity index 100% rename from data/anthropic_claude-sonnet-4.5.json rename to data/models/anthropic_claude-sonnet-4.5.json index 3d0f90f7f9e5c9de85f6be04657526b4cff2d177..606e72f15a5601ecd2d0774a197b069b075f2ca1 100644 --- a/data/anthropic_claude-sonnet-4.5.json +++ b/data/models/anthropic_claude-sonnet-4.5.json @@ -10,7 +10,7 @@ }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/claude-code__claude-sonnet-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/camel-ai__claude-sonnet-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-12-24", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 40.1, + "score": 46.5, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 2.4 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"CAMEL-AI\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"CAMEL-AI\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/camel-ai__claude-sonnet-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/maya__claude-sonnet-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-24", + "evaluation_timestamp": "2026-01-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,11 @@ "max_score": 100.0 }, "score_details": { - "score": 46.5, - "uncertainty": { - "standard_error": { - "value": 2.4 - }, - "num_samples": 435 - } + "score": 42.7 }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"CAMEL-AI\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"MAYA\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +212,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"CAMEL-AI\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"MAYA\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +226,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-sonnet-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/goose__claude-sonnet-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +250,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-12-11", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +259,17 @@ "max_score": 100.0 }, "score_details": { - "score": 42.5, + "score": 43.1, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.6 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +286,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -306,7 +300,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/maya__claude-sonnet-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/claude-code__claude-sonnet-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -330,7 +324,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-01-04", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -339,11 +333,17 @@ "max_score": 100.0 }, "score_details": { - "score": 42.7 + "score": 40.1, + "uncertainty": { + "standard_error": { + "value": 2.9 + }, + "num_samples": 435 + } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"MAYA\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -360,7 +360,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"MAYA\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -374,7 +374,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/goose__claude-sonnet-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-sonnet-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -398,7 +398,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-11", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -407,17 +407,17 @@ "max_score": 100.0 }, "score_details": { - "score": 43.1, + "score": 42.5, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -434,7 +434,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/anthropic_claude-v1.3.json b/data/models/anthropic_claude-v1.3.json similarity index 100% rename from data/anthropic_claude-v1.3.json rename to data/models/anthropic_claude-v1.3.json diff --git a/data/apple_DCLM-7B.json b/data/models/apple_DCLM-7B.json similarity index 100% rename from data/apple_DCLM-7B.json rename to data/models/apple_DCLM-7B.json diff --git a/data/applied-compute_Applied_Compute__Small.json b/data/models/applied-compute_Applied_Compute__Small.json similarity index 100% rename from data/applied-compute_Applied_Compute__Small.json rename to data/models/applied-compute_Applied_Compute__Small.json diff --git a/data/appvoid_arco-2-instruct.json b/data/models/appvoid_arco-2-instruct.json similarity index 100% rename from data/appvoid_arco-2-instruct.json rename to data/models/appvoid_arco-2-instruct.json diff --git a/data/appvoid_arco-2.json b/data/models/appvoid_arco-2.json similarity index 100% rename from data/appvoid_arco-2.json rename to data/models/appvoid_arco-2.json diff --git a/data/arcee-ai_Arcee-Blitz.json b/data/models/arcee-ai_Arcee-Blitz.json similarity index 100% rename from data/arcee-ai_Arcee-Blitz.json rename to data/models/arcee-ai_Arcee-Blitz.json diff --git a/data/arcee-ai_Arcee-Maestro-7B-Preview.json b/data/models/arcee-ai_Arcee-Maestro-7B-Preview.json similarity index 100% rename from data/arcee-ai_Arcee-Maestro-7B-Preview.json rename to data/models/arcee-ai_Arcee-Maestro-7B-Preview.json diff --git a/data/arcee-ai_Arcee-Nova.json b/data/models/arcee-ai_Arcee-Nova.json similarity index 100% rename from data/arcee-ai_Arcee-Nova.json rename to data/models/arcee-ai_Arcee-Nova.json diff --git a/data/arcee-ai_Arcee-Spark.json b/data/models/arcee-ai_Arcee-Spark.json similarity index 100% rename from data/arcee-ai_Arcee-Spark.json rename to data/models/arcee-ai_Arcee-Spark.json diff --git a/data/arcee-ai_Llama-3.1-SuperNova-Lite.json b/data/models/arcee-ai_Llama-3.1-SuperNova-Lite.json similarity index 100% rename from data/arcee-ai_Llama-3.1-SuperNova-Lite.json rename to data/models/arcee-ai_Llama-3.1-SuperNova-Lite.json diff --git a/data/arcee-ai_Llama-Spark.json b/data/models/arcee-ai_Llama-Spark.json similarity index 100% rename from data/arcee-ai_Llama-Spark.json rename to data/models/arcee-ai_Llama-Spark.json diff --git a/data/arcee-ai_SuperNova-Medius.json b/data/models/arcee-ai_SuperNova-Medius.json similarity index 100% rename from data/arcee-ai_SuperNova-Medius.json rename to data/models/arcee-ai_SuperNova-Medius.json diff --git a/data/arcee-ai_Virtuoso-Lite.json b/data/models/arcee-ai_Virtuoso-Lite.json similarity index 100% rename from data/arcee-ai_Virtuoso-Lite.json rename to data/models/arcee-ai_Virtuoso-Lite.json diff --git a/data/arcee-ai_Virtuoso-Small-v2.json b/data/models/arcee-ai_Virtuoso-Small-v2.json similarity index 100% rename from data/arcee-ai_Virtuoso-Small-v2.json rename to data/models/arcee-ai_Virtuoso-Small-v2.json diff --git a/data/arcee-ai_Virtuoso-Small.json b/data/models/arcee-ai_Virtuoso-Small.json similarity index 100% rename from data/arcee-ai_Virtuoso-Small.json rename to data/models/arcee-ai_Virtuoso-Small.json diff --git a/data/arcee-ai_raspberry-3B.json b/data/models/arcee-ai_raspberry-3B.json similarity index 100% rename from data/arcee-ai_raspberry-3B.json rename to data/models/arcee-ai_raspberry-3B.json diff --git a/data/argilla-warehouse_Llama-3.1-8B-MagPie-Ultra.json b/data/models/argilla-warehouse_Llama-3.1-8B-MagPie-Ultra.json similarity index 100% rename from data/argilla-warehouse_Llama-3.1-8B-MagPie-Ultra.json rename to data/models/argilla-warehouse_Llama-3.1-8B-MagPie-Ultra.json diff --git a/data/argilla_notus-7b-v1.json b/data/models/argilla_notus-7b-v1.json similarity index 100% rename from data/argilla_notus-7b-v1.json rename to data/models/argilla_notus-7b-v1.json diff --git a/data/argilla_notux-8x7b-v1.json b/data/models/argilla_notux-8x7b-v1.json similarity index 100% rename from data/argilla_notux-8x7b-v1.json rename to data/models/argilla_notux-8x7b-v1.json diff --git a/data/arisin_orca-platypus-13B-slerp.json b/data/models/arisin_orca-platypus-13B-slerp.json similarity index 100% rename from data/arisin_orca-platypus-13B-slerp.json rename to data/models/arisin_orca-platypus-13B-slerp.json diff --git a/data/ark_ep-20250603132404-cgpjm.json b/data/models/ark_ep-20250603132404-cgpjm.json similarity index 100% rename from data/ark_ep-20250603132404-cgpjm.json rename to data/models/ark_ep-20250603132404-cgpjm.json diff --git a/data/arshiaafshani_Arsh-V1.json b/data/models/arshiaafshani_Arsh-V1.json similarity index 100% rename from data/arshiaafshani_Arsh-V1.json rename to data/models/arshiaafshani_Arsh-V1.json diff --git a/data/asharsha30_LLAMA_Harsha_8_B_ORDP_10k.json b/data/models/asharsha30_LLAMA_Harsha_8_B_ORDP_10k.json similarity index 100% rename from data/asharsha30_LLAMA_Harsha_8_B_ORDP_10k.json rename to data/models/asharsha30_LLAMA_Harsha_8_B_ORDP_10k.json diff --git a/data/ashercn97_a1-v0.0.1.json b/data/models/ashercn97_a1-v0.0.1.json similarity index 100% rename from data/ashercn97_a1-v0.0.1.json rename to data/models/ashercn97_a1-v0.0.1.json diff --git a/data/ashercn97_a1-v002.json b/data/models/ashercn97_a1-v002.json similarity index 100% rename from data/ashercn97_a1-v002.json rename to data/models/ashercn97_a1-v002.json diff --git a/data/assskelad_smollm2-360M-sft_SmallThoughts.json b/data/models/assskelad_smollm2-360M-sft_SmallThoughts.json similarity index 100% rename from data/assskelad_smollm2-360M-sft_SmallThoughts.json rename to data/models/assskelad_smollm2-360M-sft_SmallThoughts.json diff --git a/data/athirdpath_Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit.json b/data/models/athirdpath_Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit.json similarity index 100% rename from data/athirdpath_Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit.json rename to data/models/athirdpath_Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit.json diff --git a/data/automerger_YamshadowExperiment28-7B.json b/data/models/automerger_YamshadowExperiment28-7B.json similarity index 100% rename from data/automerger_YamshadowExperiment28-7B.json rename to data/models/automerger_YamshadowExperiment28-7B.json diff --git a/data/avemio_GRAG-NEMO-12B-ORPO-HESSIAN-AI.json b/data/models/avemio_GRAG-NEMO-12B-ORPO-HESSIAN-AI.json similarity index 100% rename from data/avemio_GRAG-NEMO-12B-ORPO-HESSIAN-AI.json rename to data/models/avemio_GRAG-NEMO-12B-ORPO-HESSIAN-AI.json diff --git a/data/awnr_Mistral-7B-v0.1-signtensors-1-over-2.json b/data/models/awnr_Mistral-7B-v0.1-signtensors-1-over-2.json similarity index 100% rename from data/awnr_Mistral-7B-v0.1-signtensors-1-over-2.json rename to data/models/awnr_Mistral-7B-v0.1-signtensors-1-over-2.json diff --git a/data/awnr_Mistral-7B-v0.1-signtensors-1-over-4.json b/data/models/awnr_Mistral-7B-v0.1-signtensors-1-over-4.json similarity index 100% rename from data/awnr_Mistral-7B-v0.1-signtensors-1-over-4.json rename to data/models/awnr_Mistral-7B-v0.1-signtensors-1-over-4.json diff --git a/data/awnr_Mistral-7B-v0.1-signtensors-3-over-8.json b/data/models/awnr_Mistral-7B-v0.1-signtensors-3-over-8.json similarity index 100% rename from data/awnr_Mistral-7B-v0.1-signtensors-3-over-8.json rename to data/models/awnr_Mistral-7B-v0.1-signtensors-3-over-8.json diff --git a/data/awnr_Mistral-7B-v0.1-signtensors-5-over-16.json b/data/models/awnr_Mistral-7B-v0.1-signtensors-5-over-16.json similarity index 100% rename from data/awnr_Mistral-7B-v0.1-signtensors-5-over-16.json rename to data/models/awnr_Mistral-7B-v0.1-signtensors-5-over-16.json diff --git a/data/awnr_Mistral-7B-v0.1-signtensors-7-over-16.json b/data/models/awnr_Mistral-7B-v0.1-signtensors-7-over-16.json similarity index 100% rename from data/awnr_Mistral-7B-v0.1-signtensors-7-over-16.json rename to data/models/awnr_Mistral-7B-v0.1-signtensors-7-over-16.json diff --git a/data/aws-prototyping_MegaBeam-Mistral-7B-512k.json b/data/models/aws-prototyping_MegaBeam-Mistral-7B-512k.json similarity index 100% rename from data/aws-prototyping_MegaBeam-Mistral-7B-512k.json rename to data/models/aws-prototyping_MegaBeam-Mistral-7B-512k.json diff --git a/data/axolotl-ai-co_romulus-mistral-nemo-12b-simpo.json b/data/models/axolotl-ai-co_romulus-mistral-nemo-12b-simpo.json similarity index 100% rename from data/axolotl-ai-co_romulus-mistral-nemo-12b-simpo.json rename to data/models/axolotl-ai-co_romulus-mistral-nemo-12b-simpo.json diff --git a/data/baconnier_Napoleon_24B_V0.0.json b/data/models/baconnier_Napoleon_24B_V0.0.json similarity index 100% rename from data/baconnier_Napoleon_24B_V0.0.json rename to data/models/baconnier_Napoleon_24B_V0.0.json diff --git a/data/baconnier_Napoleon_24B_V0.2.json b/data/models/baconnier_Napoleon_24B_V0.2.json similarity index 100% rename from data/baconnier_Napoleon_24B_V0.2.json rename to data/models/baconnier_Napoleon_24B_V0.2.json diff --git a/data/baebee_7B-Cetacea.json b/data/models/baebee_7B-Cetacea.json similarity index 100% rename from data/baebee_7B-Cetacea.json rename to data/models/baebee_7B-Cetacea.json diff --git a/data/baebee_mergekit-model_stock-nzjnheg.json b/data/models/baebee_mergekit-model_stock-nzjnheg.json similarity index 100% rename from data/baebee_mergekit-model_stock-nzjnheg.json rename to data/models/baebee_mergekit-model_stock-nzjnheg.json diff --git a/data/baebee_mergekit-ties-fnjenli.json b/data/models/baebee_mergekit-ties-fnjenli.json similarity index 100% rename from data/baebee_mergekit-ties-fnjenli.json rename to data/models/baebee_mergekit-ties-fnjenli.json diff --git a/data/bamec66557_MISCHIEVOUS-12B-Mix_0.1v.json b/data/models/bamec66557_MISCHIEVOUS-12B-Mix_0.1v.json similarity index 100% rename from data/bamec66557_MISCHIEVOUS-12B-Mix_0.1v.json rename to data/models/bamec66557_MISCHIEVOUS-12B-Mix_0.1v.json diff --git a/data/bamec66557_MISCHIEVOUS-12B-Mix_0.2v.json b/data/models/bamec66557_MISCHIEVOUS-12B-Mix_0.2v.json similarity index 100% rename from data/bamec66557_MISCHIEVOUS-12B-Mix_0.2v.json rename to data/models/bamec66557_MISCHIEVOUS-12B-Mix_0.2v.json diff --git a/data/bamec66557_MISCHIEVOUS-12B-Mix_0.3v.json b/data/models/bamec66557_MISCHIEVOUS-12B-Mix_0.3v.json similarity index 100% rename from data/bamec66557_MISCHIEVOUS-12B-Mix_0.3v.json rename to data/models/bamec66557_MISCHIEVOUS-12B-Mix_0.3v.json diff --git a/data/bamec66557_MISCHIEVOUS-12B-Mix_0.4v.json b/data/models/bamec66557_MISCHIEVOUS-12B-Mix_0.4v.json similarity index 100% rename from data/bamec66557_MISCHIEVOUS-12B-Mix_0.4v.json rename to data/models/bamec66557_MISCHIEVOUS-12B-Mix_0.4v.json diff --git a/data/bamec66557_MISCHIEVOUS-12B-Mix_0.5v.json b/data/models/bamec66557_MISCHIEVOUS-12B-Mix_0.5v.json similarity index 100% rename from data/bamec66557_MISCHIEVOUS-12B-Mix_0.5v.json rename to data/models/bamec66557_MISCHIEVOUS-12B-Mix_0.5v.json diff --git a/data/bamec66557_MISCHIEVOUS-12B-Mix_0.6v.json b/data/models/bamec66557_MISCHIEVOUS-12B-Mix_0.6v.json similarity index 100% rename from data/bamec66557_MISCHIEVOUS-12B-Mix_0.6v.json rename to data/models/bamec66557_MISCHIEVOUS-12B-Mix_0.6v.json diff --git a/data/bamec66557_MISCHIEVOUS-12B-Mix_III_IV_V.json b/data/models/bamec66557_MISCHIEVOUS-12B-Mix_III_IV_V.json similarity index 100% rename from data/bamec66557_MISCHIEVOUS-12B-Mix_III_IV_V.json rename to data/models/bamec66557_MISCHIEVOUS-12B-Mix_III_IV_V.json diff --git a/data/bamec66557_MISCHIEVOUS-12B-Mix_III_ex_V.json b/data/models/bamec66557_MISCHIEVOUS-12B-Mix_III_ex_V.json similarity index 100% rename from data/bamec66557_MISCHIEVOUS-12B-Mix_III_ex_V.json rename to data/models/bamec66557_MISCHIEVOUS-12B-Mix_III_ex_V.json diff --git a/data/bamec66557_MISCHIEVOUS-12B-Mix_Neo.json b/data/models/bamec66557_MISCHIEVOUS-12B-Mix_Neo.json similarity index 100% rename from data/bamec66557_MISCHIEVOUS-12B-Mix_Neo.json rename to data/models/bamec66557_MISCHIEVOUS-12B-Mix_Neo.json diff --git a/data/bamec66557_MISCHIEVOUS-12B.json b/data/models/bamec66557_MISCHIEVOUS-12B.json similarity index 100% rename from data/bamec66557_MISCHIEVOUS-12B.json rename to data/models/bamec66557_MISCHIEVOUS-12B.json diff --git a/data/bamec66557_Mistral-Nemo-VICIOUS_MESH-12B-2407.json b/data/models/bamec66557_Mistral-Nemo-VICIOUS_MESH-12B-2407.json similarity index 100% rename from data/bamec66557_Mistral-Nemo-VICIOUS_MESH-12B-2407.json rename to data/models/bamec66557_Mistral-Nemo-VICIOUS_MESH-12B-2407.json diff --git a/data/bamec66557_NameLess-12B-prob.json b/data/models/bamec66557_NameLess-12B-prob.json similarity index 100% rename from data/bamec66557_NameLess-12B-prob.json rename to data/models/bamec66557_NameLess-12B-prob.json diff --git a/data/bamec66557_VICIOUS_MESH-12B-0.1v.json b/data/models/bamec66557_VICIOUS_MESH-12B-0.1v.json similarity index 100% rename from data/bamec66557_VICIOUS_MESH-12B-0.1v.json rename to data/models/bamec66557_VICIOUS_MESH-12B-0.1v.json diff --git a/data/bamec66557_VICIOUS_MESH-12B-0.X.ver.json b/data/models/bamec66557_VICIOUS_MESH-12B-0.X.ver.json similarity index 100% rename from data/bamec66557_VICIOUS_MESH-12B-0.X.ver.json rename to data/models/bamec66557_VICIOUS_MESH-12B-0.X.ver.json diff --git a/data/bamec66557_VICIOUS_MESH-12B-ALPHA.json b/data/models/bamec66557_VICIOUS_MESH-12B-ALPHA.json similarity index 100% rename from data/bamec66557_VICIOUS_MESH-12B-ALPHA.json rename to data/models/bamec66557_VICIOUS_MESH-12B-ALPHA.json diff --git a/data/bamec66557_VICIOUS_MESH-12B-BETA.json b/data/models/bamec66557_VICIOUS_MESH-12B-BETA.json similarity index 100% rename from data/bamec66557_VICIOUS_MESH-12B-BETA.json rename to data/models/bamec66557_VICIOUS_MESH-12B-BETA.json diff --git a/data/bamec66557_VICIOUS_MESH-12B-DELTA.json b/data/models/bamec66557_VICIOUS_MESH-12B-DELTA.json similarity index 100% rename from data/bamec66557_VICIOUS_MESH-12B-DELTA.json rename to data/models/bamec66557_VICIOUS_MESH-12B-DELTA.json diff --git a/data/bamec66557_VICIOUS_MESH-12B-DIGAMMA.json b/data/models/bamec66557_VICIOUS_MESH-12B-DIGAMMA.json similarity index 100% rename from data/bamec66557_VICIOUS_MESH-12B-DIGAMMA.json rename to data/models/bamec66557_VICIOUS_MESH-12B-DIGAMMA.json diff --git a/data/bamec66557_VICIOUS_MESH-12B-EPSILON.json b/data/models/bamec66557_VICIOUS_MESH-12B-EPSILON.json similarity index 100% rename from data/bamec66557_VICIOUS_MESH-12B-EPSILON.json rename to data/models/bamec66557_VICIOUS_MESH-12B-EPSILON.json diff --git a/data/bamec66557_VICIOUS_MESH-12B-GAMMA.json b/data/models/bamec66557_VICIOUS_MESH-12B-GAMMA.json similarity index 100% rename from data/bamec66557_VICIOUS_MESH-12B-GAMMA.json rename to data/models/bamec66557_VICIOUS_MESH-12B-GAMMA.json diff --git a/data/bamec66557_VICIOUS_MESH-12B-NEMO.json b/data/models/bamec66557_VICIOUS_MESH-12B-NEMO.json similarity index 100% rename from data/bamec66557_VICIOUS_MESH-12B-NEMO.json rename to data/models/bamec66557_VICIOUS_MESH-12B-NEMO.json diff --git a/data/bamec66557_VICIOUS_MESH-12B-OMEGA.json b/data/models/bamec66557_VICIOUS_MESH-12B-OMEGA.json similarity index 100% rename from data/bamec66557_VICIOUS_MESH-12B-OMEGA.json rename to data/models/bamec66557_VICIOUS_MESH-12B-OMEGA.json diff --git a/data/bamec66557_VICIOUS_MESH-12B-UNION.json b/data/models/bamec66557_VICIOUS_MESH-12B-UNION.json similarity index 100% rename from data/bamec66557_VICIOUS_MESH-12B-UNION.json rename to data/models/bamec66557_VICIOUS_MESH-12B-UNION.json diff --git a/data/bamec66557_VICIOUS_MESH-12B.json b/data/models/bamec66557_VICIOUS_MESH-12B.json similarity index 100% rename from data/bamec66557_VICIOUS_MESH-12B.json rename to data/models/bamec66557_VICIOUS_MESH-12B.json diff --git a/data/bamec66557_VICIOUS_MESH-12B_Razor.json b/data/models/bamec66557_VICIOUS_MESH-12B_Razor.json similarity index 100% rename from data/bamec66557_VICIOUS_MESH-12B_Razor.json rename to data/models/bamec66557_VICIOUS_MESH-12B_Razor.json diff --git a/data/bamec66557_mergekit-model_stock-zdaysvi.json b/data/models/bamec66557_mergekit-model_stock-zdaysvi.json similarity index 100% rename from data/bamec66557_mergekit-model_stock-zdaysvi.json rename to data/models/bamec66557_mergekit-model_stock-zdaysvi.json diff --git a/data/bamec66557_mergekit-ties-sinbkow.json b/data/models/bamec66557_mergekit-ties-sinbkow.json similarity index 100% rename from data/bamec66557_mergekit-ties-sinbkow.json rename to data/models/bamec66557_mergekit-ties-sinbkow.json diff --git a/data/belztjti_dffghgjh.json b/data/models/belztjti_dffghgjh.json similarity index 100% rename from data/belztjti_dffghgjh.json rename to data/models/belztjti_dffghgjh.json diff --git a/data/belztjti_dtfgv.json b/data/models/belztjti_dtfgv.json similarity index 100% rename from data/belztjti_dtfgv.json rename to data/models/belztjti_dtfgv.json diff --git a/data/benhaotang_phi4-qwq-sky-t1.json b/data/models/benhaotang_phi4-qwq-sky-t1.json similarity index 100% rename from data/benhaotang_phi4-qwq-sky-t1.json rename to data/models/benhaotang_phi4-qwq-sky-t1.json diff --git a/data/beomi_gemma-mling-7b.json b/data/models/beomi_gemma-mling-7b.json similarity index 100% rename from data/beomi_gemma-mling-7b.json rename to data/models/beomi_gemma-mling-7b.json diff --git a/data/beowolx_CodeNinja-1.0-OpenChat-7B.json b/data/models/beowolx_CodeNinja-1.0-OpenChat-7B.json similarity index 100% rename from data/beowolx_CodeNinja-1.0-OpenChat-7B.json rename to data/models/beowolx_CodeNinja-1.0-OpenChat-7B.json diff --git a/data/berkeley-nest_Starling-LM-7B-alpha.json b/data/models/berkeley-nest_Starling-LM-7B-alpha.json similarity index 100% rename from data/berkeley-nest_Starling-LM-7B-alpha.json rename to data/models/berkeley-nest_Starling-LM-7B-alpha.json diff --git a/data/berkeley-nest_Starling-RM-7B-alpha.json b/data/models/berkeley-nest_Starling-RM-7B-alpha.json similarity index 100% rename from data/berkeley-nest_Starling-RM-7B-alpha.json rename to data/models/berkeley-nest_Starling-RM-7B-alpha.json diff --git a/data/bfuzzy1_Gunny.json b/data/models/bfuzzy1_Gunny.json similarity index 100% rename from data/bfuzzy1_Gunny.json rename to data/models/bfuzzy1_Gunny.json diff --git a/data/bfuzzy1_acheron-c.json b/data/models/bfuzzy1_acheron-c.json similarity index 100% rename from data/bfuzzy1_acheron-c.json rename to data/models/bfuzzy1_acheron-c.json diff --git a/data/bfuzzy1_acheron-d.json b/data/models/bfuzzy1_acheron-d.json similarity index 100% rename from data/bfuzzy1_acheron-d.json rename to data/models/bfuzzy1_acheron-d.json diff --git a/data/bfuzzy1_acheron-m.json b/data/models/bfuzzy1_acheron-m.json similarity index 100% rename from data/bfuzzy1_acheron-m.json rename to data/models/bfuzzy1_acheron-m.json diff --git a/data/bfuzzy1_acheron-m1a-llama.json b/data/models/bfuzzy1_acheron-m1a-llama.json similarity index 100% rename from data/bfuzzy1_acheron-m1a-llama.json rename to data/models/bfuzzy1_acheron-m1a-llama.json diff --git a/data/bfuzzy1_acheron.json b/data/models/bfuzzy1_acheron.json similarity index 100% rename from data/bfuzzy1_acheron.json rename to data/models/bfuzzy1_acheron.json diff --git a/data/bfuzzy1_llambses-1.json b/data/models/bfuzzy1_llambses-1.json similarity index 100% rename from data/bfuzzy1_llambses-1.json rename to data/models/bfuzzy1_llambses-1.json diff --git a/data/bhuvneshsaini_merged_model.json b/data/models/bhuvneshsaini_merged_model.json similarity index 100% rename from data/bhuvneshsaini_merged_model.json rename to data/models/bhuvneshsaini_merged_model.json diff --git a/data/bigcode_starcoder2-15b.json b/data/models/bigcode_starcoder2-15b.json similarity index 100% rename from data/bigcode_starcoder2-15b.json rename to data/models/bigcode_starcoder2-15b.json diff --git a/data/bigcode_starcoder2-3b.json b/data/models/bigcode_starcoder2-3b.json similarity index 100% rename from data/bigcode_starcoder2-3b.json rename to data/models/bigcode_starcoder2-3b.json diff --git a/data/bigcode_starcoder2-7b.json b/data/models/bigcode_starcoder2-7b.json similarity index 100% rename from data/bigcode_starcoder2-7b.json rename to data/models/bigcode_starcoder2-7b.json diff --git a/data/bigscience_BLOOM-176B.json b/data/models/bigscience_BLOOM-176B.json similarity index 100% rename from data/bigscience_BLOOM-176B.json rename to data/models/bigscience_BLOOM-176B.json diff --git a/data/bigscience_T0pp-11B.json b/data/models/bigscience_T0pp-11B.json similarity index 100% rename from data/bigscience_T0pp-11B.json rename to data/models/bigscience_T0pp-11B.json diff --git a/data/bigscience_bloom-1b1.json b/data/models/bigscience_bloom-1b1.json similarity index 100% rename from data/bigscience_bloom-1b1.json rename to data/models/bigscience_bloom-1b1.json diff --git a/data/bigscience_bloom-1b7.json b/data/models/bigscience_bloom-1b7.json similarity index 100% rename from data/bigscience_bloom-1b7.json rename to data/models/bigscience_bloom-1b7.json diff --git a/data/bigscience_bloom-3b.json b/data/models/bigscience_bloom-3b.json similarity index 100% rename from data/bigscience_bloom-3b.json rename to data/models/bigscience_bloom-3b.json diff --git a/data/bigscience_bloom-560m.json b/data/models/bigscience_bloom-560m.json similarity index 100% rename from data/bigscience_bloom-560m.json rename to data/models/bigscience_bloom-560m.json diff --git a/data/bigscience_bloom-7b1.json b/data/models/bigscience_bloom-7b1.json similarity index 100% rename from data/bigscience_bloom-7b1.json rename to data/models/bigscience_bloom-7b1.json diff --git a/data/bluuwhale_L3-SthenoMaid-8B-V1.json b/data/models/bluuwhale_L3-SthenoMaid-8B-V1.json similarity index 100% rename from data/bluuwhale_L3-SthenoMaid-8B-V1.json rename to data/models/bluuwhale_L3-SthenoMaid-8B-V1.json diff --git a/data/bond005_meno-tiny-0.1.json b/data/models/bond005_meno-tiny-0.1.json similarity index 100% rename from data/bond005_meno-tiny-0.1.json rename to data/models/bond005_meno-tiny-0.1.json diff --git a/data/bosonai_Higgs-Llama-3-70B.json b/data/models/bosonai_Higgs-Llama-3-70B.json similarity index 100% rename from data/bosonai_Higgs-Llama-3-70B.json rename to data/models/bosonai_Higgs-Llama-3-70B.json diff --git a/data/braindao_DeepSeek-R1-Distill-Qwen-1.5B-Blunt.json b/data/models/braindao_DeepSeek-R1-Distill-Qwen-1.5B-Blunt.json similarity index 100% rename from data/braindao_DeepSeek-R1-Distill-Qwen-1.5B-Blunt.json rename to data/models/braindao_DeepSeek-R1-Distill-Qwen-1.5B-Blunt.json diff --git a/data/braindao_DeepSeek-R1-Distill-Qwen-1.5B-Reflective.json b/data/models/braindao_DeepSeek-R1-Distill-Qwen-1.5B-Reflective.json similarity index 100% rename from data/braindao_DeepSeek-R1-Distill-Qwen-1.5B-Reflective.json rename to data/models/braindao_DeepSeek-R1-Distill-Qwen-1.5B-Reflective.json diff --git a/data/braindao_DeepSeek-R1-Distill-Qwen-14B-ABUB-ST.json b/data/models/braindao_DeepSeek-R1-Distill-Qwen-14B-ABUB-ST.json similarity index 100% rename from data/braindao_DeepSeek-R1-Distill-Qwen-14B-ABUB-ST.json rename to data/models/braindao_DeepSeek-R1-Distill-Qwen-14B-ABUB-ST.json diff --git a/data/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective.json b/data/models/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective.json similarity index 100% rename from data/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective.json rename to data/models/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective.json diff --git a/data/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt.json b/data/models/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt.json similarity index 100% rename from data/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt.json rename to data/models/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt.json diff --git a/data/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective.json b/data/models/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective.json similarity index 100% rename from data/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective.json rename to data/models/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective.json diff --git a/data/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored.json b/data/models/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored.json similarity index 100% rename from data/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored.json rename to data/models/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored.json diff --git a/data/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt.json b/data/models/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt.json similarity index 100% rename from data/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt.json rename to data/models/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt.json diff --git a/data/braindao_DeepSeek-R1-Distill-Qwen-14B-Reflective.json b/data/models/braindao_DeepSeek-R1-Distill-Qwen-14B-Reflective.json similarity index 100% rename from data/braindao_DeepSeek-R1-Distill-Qwen-14B-Reflective.json rename to data/models/braindao_DeepSeek-R1-Distill-Qwen-14B-Reflective.json diff --git a/data/braindao_DeepSeek-R1-Distill-Qwen-14B.json b/data/models/braindao_DeepSeek-R1-Distill-Qwen-14B.json similarity index 100% rename from data/braindao_DeepSeek-R1-Distill-Qwen-14B.json rename to data/models/braindao_DeepSeek-R1-Distill-Qwen-14B.json diff --git a/data/braindao_DeepSeek-R1-Distill-Qwen-7B-Blunt.json b/data/models/braindao_DeepSeek-R1-Distill-Qwen-7B-Blunt.json similarity index 100% rename from data/braindao_DeepSeek-R1-Distill-Qwen-7B-Blunt.json rename to data/models/braindao_DeepSeek-R1-Distill-Qwen-7B-Blunt.json diff --git a/data/braindao_DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored.json b/data/models/braindao_DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored.json similarity index 100% rename from data/braindao_DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored.json rename to data/models/braindao_DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored.json diff --git a/data/braindao_DeepSeek-R1-Distill-Qwen-7B-Reflective.json b/data/models/braindao_DeepSeek-R1-Distill-Qwen-7B-Reflective.json similarity index 100% rename from data/braindao_DeepSeek-R1-Distill-Qwen-7B-Reflective.json rename to data/models/braindao_DeepSeek-R1-Distill-Qwen-7B-Reflective.json diff --git a/data/braindao_DeepSeek-R1-Distill-Qwen-7B.json b/data/models/braindao_DeepSeek-R1-Distill-Qwen-7B.json similarity index 100% rename from data/braindao_DeepSeek-R1-Distill-Qwen-7B.json rename to data/models/braindao_DeepSeek-R1-Distill-Qwen-7B.json diff --git a/data/braindao_Qwen2.5-14B-Instruct.json b/data/models/braindao_Qwen2.5-14B-Instruct.json similarity index 100% rename from data/braindao_Qwen2.5-14B-Instruct.json rename to data/models/braindao_Qwen2.5-14B-Instruct.json diff --git a/data/braindao_Qwen2.5-14B.json b/data/models/braindao_Qwen2.5-14B.json similarity index 100% rename from data/braindao_Qwen2.5-14B.json rename to data/models/braindao_Qwen2.5-14B.json diff --git a/data/braindao_iq-code-evmind-0.5b.json b/data/models/braindao_iq-code-evmind-0.5b.json similarity index 100% rename from data/braindao_iq-code-evmind-0.5b.json rename to data/models/braindao_iq-code-evmind-0.5b.json diff --git a/data/brgx53_3Bgeneral-ECE-PRYMMAL-Martial.json b/data/models/brgx53_3Bgeneral-ECE-PRYMMAL-Martial.json similarity index 100% rename from data/brgx53_3Bgeneral-ECE-PRYMMAL-Martial.json rename to data/models/brgx53_3Bgeneral-ECE-PRYMMAL-Martial.json diff --git a/data/brgx53_3Bgeneralv2-ECE-PRYMMAL-Martial.json b/data/models/brgx53_3Bgeneralv2-ECE-PRYMMAL-Martial.json similarity index 100% rename from data/brgx53_3Bgeneralv2-ECE-PRYMMAL-Martial.json rename to data/models/brgx53_3Bgeneralv2-ECE-PRYMMAL-Martial.json diff --git a/data/brgx53_3Blareneg-ECE-PRYMMAL-Martial.json b/data/models/brgx53_3Blareneg-ECE-PRYMMAL-Martial.json similarity index 100% rename from data/brgx53_3Blareneg-ECE-PRYMMAL-Martial.json rename to data/models/brgx53_3Blareneg-ECE-PRYMMAL-Martial.json diff --git a/data/brgx53_3Blarenegv2-ECE-PRYMMAL-Martial.json b/data/models/brgx53_3Blarenegv2-ECE-PRYMMAL-Martial.json similarity index 100% rename from data/brgx53_3Blarenegv2-ECE-PRYMMAL-Martial.json rename to data/models/brgx53_3Blarenegv2-ECE-PRYMMAL-Martial.json diff --git a/data/brgx53_Barracuda-PRYMMAL-ECE-TW3.json b/data/models/brgx53_Barracuda-PRYMMAL-ECE-TW3.json similarity index 100% rename from data/brgx53_Barracuda-PRYMMAL-ECE-TW3.json rename to data/models/brgx53_Barracuda-PRYMMAL-ECE-TW3.json diff --git a/data/brgx53_LaConfiance-PRYMMAL-ECE-TW3.json b/data/models/brgx53_LaConfiance-PRYMMAL-ECE-TW3.json similarity index 100% rename from data/brgx53_LaConfiance-PRYMMAL-ECE-TW3.json rename to data/models/brgx53_LaConfiance-PRYMMAL-ECE-TW3.json diff --git a/data/bunnycore_Best-Mix-Llama-3.1-8B.json b/data/models/bunnycore_Best-Mix-Llama-3.1-8B.json similarity index 100% rename from data/bunnycore_Best-Mix-Llama-3.1-8B.json rename to data/models/bunnycore_Best-Mix-Llama-3.1-8B.json diff --git a/data/bunnycore_Blabbertron-1.0.json b/data/models/bunnycore_Blabbertron-1.0.json similarity index 100% rename from data/bunnycore_Blabbertron-1.0.json rename to data/models/bunnycore_Blabbertron-1.0.json diff --git a/data/bunnycore_Blabbertron-1.1.json b/data/models/bunnycore_Blabbertron-1.1.json similarity index 100% rename from data/bunnycore_Blabbertron-1.1.json rename to data/models/bunnycore_Blabbertron-1.1.json diff --git a/data/bunnycore_CyberCore-Qwen-2.1-7B.json b/data/models/bunnycore_CyberCore-Qwen-2.1-7B.json similarity index 100% rename from data/bunnycore_CyberCore-Qwen-2.1-7B.json rename to data/models/bunnycore_CyberCore-Qwen-2.1-7B.json diff --git a/data/bunnycore_DeepQwen-3B-LCoT-SCE.json b/data/models/bunnycore_DeepQwen-3B-LCoT-SCE.json similarity index 100% rename from data/bunnycore_DeepQwen-3B-LCoT-SCE.json rename to data/models/bunnycore_DeepQwen-3B-LCoT-SCE.json diff --git a/data/bunnycore_DeepSeek-R1-Distill-Qwen-7B-RRP-Ex.json b/data/models/bunnycore_DeepSeek-R1-Distill-Qwen-7B-RRP-Ex.json similarity index 100% rename from data/bunnycore_DeepSeek-R1-Distill-Qwen-7B-RRP-Ex.json rename to data/models/bunnycore_DeepSeek-R1-Distill-Qwen-7B-RRP-Ex.json diff --git a/data/bunnycore_DeepThinker-7B-Sce-v1.json b/data/models/bunnycore_DeepThinker-7B-Sce-v1.json similarity index 100% rename from data/bunnycore_DeepThinker-7B-Sce-v1.json rename to data/models/bunnycore_DeepThinker-7B-Sce-v1.json diff --git a/data/bunnycore_DeepThinker-7B-Sce-v2.json b/data/models/bunnycore_DeepThinker-7B-Sce-v2.json similarity index 100% rename from data/bunnycore_DeepThinker-7B-Sce-v2.json rename to data/models/bunnycore_DeepThinker-7B-Sce-v2.json diff --git a/data/bunnycore_FuseCyberMix-Qwen-2.5-7B-Instruct.json b/data/models/bunnycore_FuseCyberMix-Qwen-2.5-7B-Instruct.json similarity index 100% rename from data/bunnycore_FuseCyberMix-Qwen-2.5-7B-Instruct.json rename to data/models/bunnycore_FuseCyberMix-Qwen-2.5-7B-Instruct.json diff --git a/data/bunnycore_FuseQwQen-7B.json b/data/models/bunnycore_FuseQwQen-7B.json similarity index 100% rename from data/bunnycore_FuseQwQen-7B.json rename to data/models/bunnycore_FuseQwQen-7B.json diff --git a/data/bunnycore_FwF-Qwen-7B-0.1.json b/data/models/bunnycore_FwF-Qwen-7B-0.1.json similarity index 100% rename from data/bunnycore_FwF-Qwen-7B-0.1.json rename to data/models/bunnycore_FwF-Qwen-7B-0.1.json diff --git a/data/bunnycore_FwF-Qwen-7B-0.2.json b/data/models/bunnycore_FwF-Qwen-7B-0.2.json similarity index 100% rename from data/bunnycore_FwF-Qwen-7B-0.2.json rename to data/models/bunnycore_FwF-Qwen-7B-0.2.json diff --git a/data/bunnycore_Gemma-2-2B-Smart.json b/data/models/bunnycore_Gemma-2-2B-Smart.json similarity index 100% rename from data/bunnycore_Gemma-2-2B-Smart.json rename to data/models/bunnycore_Gemma-2-2B-Smart.json diff --git a/data/bunnycore_Gemma2-9B-TitanFusion.json b/data/models/bunnycore_Gemma2-9B-TitanFusion.json similarity index 100% rename from data/bunnycore_Gemma2-9B-TitanFusion.json rename to data/models/bunnycore_Gemma2-9B-TitanFusion.json diff --git a/data/bunnycore_HyperLlama-3.1-8B.json b/data/models/bunnycore_HyperLlama-3.1-8B.json similarity index 100% rename from data/bunnycore_HyperLlama-3.1-8B.json rename to data/models/bunnycore_HyperLlama-3.1-8B.json diff --git a/data/bunnycore_Llama-3.1-8B-TitanFusion-Mix.json b/data/models/bunnycore_Llama-3.1-8B-TitanFusion-Mix.json similarity index 100% rename from data/bunnycore_Llama-3.1-8B-TitanFusion-Mix.json rename to data/models/bunnycore_Llama-3.1-8B-TitanFusion-Mix.json diff --git a/data/bunnycore_Llama-3.1-8B-TitanFusion-v3.json b/data/models/bunnycore_Llama-3.1-8B-TitanFusion-v3.json similarity index 100% rename from data/bunnycore_Llama-3.1-8B-TitanFusion-v3.json rename to data/models/bunnycore_Llama-3.1-8B-TitanFusion-v3.json diff --git a/data/bunnycore_Llama-3.2-3B-All-Mix.json b/data/models/bunnycore_Llama-3.2-3B-All-Mix.json similarity index 100% rename from data/bunnycore_Llama-3.2-3B-All-Mix.json rename to data/models/bunnycore_Llama-3.2-3B-All-Mix.json diff --git a/data/bunnycore_Llama-3.2-3B-Bespoke-Thought.json b/data/models/bunnycore_Llama-3.2-3B-Bespoke-Thought.json similarity index 100% rename from data/bunnycore_Llama-3.2-3B-Bespoke-Thought.json rename to data/models/bunnycore_Llama-3.2-3B-Bespoke-Thought.json diff --git a/data/bunnycore_Llama-3.2-3B-Booval.json b/data/models/bunnycore_Llama-3.2-3B-Booval.json similarity index 100% rename from data/bunnycore_Llama-3.2-3B-Booval.json rename to data/models/bunnycore_Llama-3.2-3B-Booval.json diff --git a/data/bunnycore_Llama-3.2-3B-Deep-Test.json b/data/models/bunnycore_Llama-3.2-3B-Deep-Test.json similarity index 100% rename from data/bunnycore_Llama-3.2-3B-Deep-Test.json rename to data/models/bunnycore_Llama-3.2-3B-Deep-Test.json diff --git a/data/bunnycore_Llama-3.2-3B-Della.json b/data/models/bunnycore_Llama-3.2-3B-Della.json similarity index 100% rename from data/bunnycore_Llama-3.2-3B-Della.json rename to data/models/bunnycore_Llama-3.2-3B-Della.json diff --git a/data/bunnycore_Llama-3.2-3B-Long-Think.json b/data/models/bunnycore_Llama-3.2-3B-Long-Think.json similarity index 100% rename from data/bunnycore_Llama-3.2-3B-Long-Think.json rename to data/models/bunnycore_Llama-3.2-3B-Long-Think.json diff --git a/data/bunnycore_Llama-3.2-3B-Mix-Skill.json b/data/models/bunnycore_Llama-3.2-3B-Mix-Skill.json similarity index 100% rename from data/bunnycore_Llama-3.2-3B-Mix-Skill.json rename to data/models/bunnycore_Llama-3.2-3B-Mix-Skill.json diff --git a/data/bunnycore_Llama-3.2-3B-ProdigyPlus.json b/data/models/bunnycore_Llama-3.2-3B-ProdigyPlus.json similarity index 100% rename from data/bunnycore_Llama-3.2-3B-ProdigyPlus.json rename to data/models/bunnycore_Llama-3.2-3B-ProdigyPlus.json diff --git a/data/bunnycore_Llama-3.2-3B-ProdigyPlusPlus.json b/data/models/bunnycore_Llama-3.2-3B-ProdigyPlusPlus.json similarity index 100% rename from data/bunnycore_Llama-3.2-3B-ProdigyPlusPlus.json rename to data/models/bunnycore_Llama-3.2-3B-ProdigyPlusPlus.json diff --git a/data/bunnycore_Llama-3.2-3B-RP-DeepThink.json b/data/models/bunnycore_Llama-3.2-3B-RP-DeepThink.json similarity index 100% rename from data/bunnycore_Llama-3.2-3B-RP-DeepThink.json rename to data/models/bunnycore_Llama-3.2-3B-RP-DeepThink.json diff --git a/data/bunnycore_Llama-3.2-3B-RRStock.json b/data/models/bunnycore_Llama-3.2-3B-RRStock.json similarity index 100% rename from data/bunnycore_Llama-3.2-3B-RRStock.json rename to data/models/bunnycore_Llama-3.2-3B-RRStock.json diff --git a/data/bunnycore_Llama-3.2-3B-ToxicKod.json b/data/models/bunnycore_Llama-3.2-3B-ToxicKod.json similarity index 100% rename from data/bunnycore_Llama-3.2-3B-ToxicKod.json rename to data/models/bunnycore_Llama-3.2-3B-ToxicKod.json diff --git a/data/bunnycore_Llama-3.2-3b-RP-Toxic-Fuse.json b/data/models/bunnycore_Llama-3.2-3b-RP-Toxic-Fuse.json similarity index 100% rename from data/bunnycore_Llama-3.2-3b-RP-Toxic-Fuse.json rename to data/models/bunnycore_Llama-3.2-3b-RP-Toxic-Fuse.json diff --git a/data/bunnycore_Maestro-S1k-7B-Sce.json b/data/models/bunnycore_Maestro-S1k-7B-Sce.json similarity index 100% rename from data/bunnycore_Maestro-S1k-7B-Sce.json rename to data/models/bunnycore_Maestro-S1k-7B-Sce.json diff --git a/data/bunnycore_Phi-3.5-mini-TitanFusion-0.1.json b/data/models/bunnycore_Phi-3.5-mini-TitanFusion-0.1.json similarity index 100% rename from data/bunnycore_Phi-3.5-mini-TitanFusion-0.1.json rename to data/models/bunnycore_Phi-3.5-mini-TitanFusion-0.1.json diff --git a/data/bunnycore_Phi-4-Model-Stock-v2.json b/data/models/bunnycore_Phi-4-Model-Stock-v2.json similarity index 100% rename from data/bunnycore_Phi-4-Model-Stock-v2.json rename to data/models/bunnycore_Phi-4-Model-Stock-v2.json diff --git a/data/bunnycore_Phi-4-Model-Stock-v3.json b/data/models/bunnycore_Phi-4-Model-Stock-v3.json similarity index 100% rename from data/bunnycore_Phi-4-Model-Stock-v3.json rename to data/models/bunnycore_Phi-4-Model-Stock-v3.json diff --git a/data/bunnycore_Phi-4-Model-Stock-v4.json b/data/models/bunnycore_Phi-4-Model-Stock-v4.json similarity index 100% rename from data/bunnycore_Phi-4-Model-Stock-v4.json rename to data/models/bunnycore_Phi-4-Model-Stock-v4.json diff --git a/data/bunnycore_Phi-4-Model-Stock.json b/data/models/bunnycore_Phi-4-Model-Stock.json similarity index 100% rename from data/bunnycore_Phi-4-Model-Stock.json rename to data/models/bunnycore_Phi-4-Model-Stock.json diff --git a/data/bunnycore_Phi-4-RP-v0.json b/data/models/bunnycore_Phi-4-RP-v0.json similarity index 100% rename from data/bunnycore_Phi-4-RP-v0.json rename to data/models/bunnycore_Phi-4-RP-v0.json diff --git a/data/bunnycore_Phi-4-RR-Shoup.json b/data/models/bunnycore_Phi-4-RR-Shoup.json similarity index 100% rename from data/bunnycore_Phi-4-RR-Shoup.json rename to data/models/bunnycore_Phi-4-RR-Shoup.json diff --git a/data/bunnycore_Phi-4-RStock-v0.1.json b/data/models/bunnycore_Phi-4-RStock-v0.1.json similarity index 100% rename from data/bunnycore_Phi-4-RStock-v0.1.json rename to data/models/bunnycore_Phi-4-RStock-v0.1.json diff --git a/data/bunnycore_Phi-4-ReasoningRP.json b/data/models/bunnycore_Phi-4-ReasoningRP.json similarity index 100% rename from data/bunnycore_Phi-4-ReasoningRP.json rename to data/models/bunnycore_Phi-4-ReasoningRP.json diff --git a/data/bunnycore_Phi-4-Sce-exp-v0.1.json b/data/models/bunnycore_Phi-4-Sce-exp-v0.1.json similarity index 100% rename from data/bunnycore_Phi-4-Sce-exp-v0.1.json rename to data/models/bunnycore_Phi-4-Sce-exp-v0.1.json diff --git a/data/bunnycore_Phi-4-Stock-Ex.json b/data/models/bunnycore_Phi-4-Stock-Ex.json similarity index 100% rename from data/bunnycore_Phi-4-Stock-Ex.json rename to data/models/bunnycore_Phi-4-Stock-Ex.json diff --git a/data/bunnycore_Phi-4-Stock-RP.json b/data/models/bunnycore_Phi-4-Stock-RP.json similarity index 100% rename from data/bunnycore_Phi-4-Stock-RP.json rename to data/models/bunnycore_Phi-4-Stock-RP.json diff --git a/data/bunnycore_Phi-4-Trim-Exp1.json b/data/models/bunnycore_Phi-4-Trim-Exp1.json similarity index 100% rename from data/bunnycore_Phi-4-Trim-Exp1.json rename to data/models/bunnycore_Phi-4-Trim-Exp1.json diff --git a/data/bunnycore_Phi-Seek-4-Sce-V1.json b/data/models/bunnycore_Phi-Seek-4-Sce-V1.json similarity index 100% rename from data/bunnycore_Phi-Seek-4-Sce-V1.json rename to data/models/bunnycore_Phi-Seek-4-Sce-V1.json diff --git a/data/bunnycore_Qandora-2.5-7B-Creative.json b/data/models/bunnycore_Qandora-2.5-7B-Creative.json similarity index 100% rename from data/bunnycore_Qandora-2.5-7B-Creative.json rename to data/models/bunnycore_Qandora-2.5-7B-Creative.json diff --git a/data/bunnycore_QandoraExp-7B-Persona.json b/data/models/bunnycore_QandoraExp-7B-Persona.json similarity index 100% rename from data/bunnycore_QandoraExp-7B-Persona.json rename to data/models/bunnycore_QandoraExp-7B-Persona.json diff --git a/data/bunnycore_QandoraExp-7B-v2.json b/data/models/bunnycore_QandoraExp-7B-v2.json similarity index 100% rename from data/bunnycore_QandoraExp-7B-v2.json rename to data/models/bunnycore_QandoraExp-7B-v2.json diff --git a/data/bunnycore_QandoraExp-7B.json b/data/models/bunnycore_QandoraExp-7B.json similarity index 100% rename from data/bunnycore_QandoraExp-7B.json rename to data/models/bunnycore_QandoraExp-7B.json diff --git a/data/bunnycore_QwQen-3B-LCoT-R1.json b/data/models/bunnycore_QwQen-3B-LCoT-R1.json similarity index 100% rename from data/bunnycore_QwQen-3B-LCoT-R1.json rename to data/models/bunnycore_QwQen-3B-LCoT-R1.json diff --git a/data/bunnycore_QwQen-3B-LCoT.json b/data/models/bunnycore_QwQen-3B-LCoT.json similarity index 100% rename from data/bunnycore_QwQen-3B-LCoT.json rename to data/models/bunnycore_QwQen-3B-LCoT.json diff --git a/data/bunnycore_Qwen-2.5-7B-Deep-Sky-T1.json b/data/models/bunnycore_Qwen-2.5-7B-Deep-Sky-T1.json similarity index 100% rename from data/bunnycore_Qwen-2.5-7B-Deep-Sky-T1.json rename to data/models/bunnycore_Qwen-2.5-7B-Deep-Sky-T1.json diff --git a/data/bunnycore_Qwen-2.5-7B-Deep-Stock-v1.json b/data/models/bunnycore_Qwen-2.5-7B-Deep-Stock-v1.json similarity index 100% rename from data/bunnycore_Qwen-2.5-7B-Deep-Stock-v1.json rename to data/models/bunnycore_Qwen-2.5-7B-Deep-Stock-v1.json diff --git a/data/bunnycore_Qwen-2.5-7B-Deep-Stock-v4.json b/data/models/bunnycore_Qwen-2.5-7B-Deep-Stock-v4.json similarity index 100% rename from data/bunnycore_Qwen-2.5-7B-Deep-Stock-v4.json rename to data/models/bunnycore_Qwen-2.5-7B-Deep-Stock-v4.json diff --git a/data/bunnycore_Qwen-2.5-7B-Deep-Stock-v5.json b/data/models/bunnycore_Qwen-2.5-7B-Deep-Stock-v5.json similarity index 100% rename from data/bunnycore_Qwen-2.5-7B-Deep-Stock-v5.json rename to data/models/bunnycore_Qwen-2.5-7B-Deep-Stock-v5.json diff --git a/data/bunnycore_Qwen-2.5-7B-Exp-Sce.json b/data/models/bunnycore_Qwen-2.5-7B-Exp-Sce.json similarity index 100% rename from data/bunnycore_Qwen-2.5-7B-Exp-Sce.json rename to data/models/bunnycore_Qwen-2.5-7B-Exp-Sce.json diff --git a/data/bunnycore_Qwen-2.5-7B-R1-Stock.json b/data/models/bunnycore_Qwen-2.5-7B-R1-Stock.json similarity index 100% rename from data/bunnycore_Qwen-2.5-7B-R1-Stock.json rename to data/models/bunnycore_Qwen-2.5-7B-R1-Stock.json diff --git a/data/bunnycore_Qwen-2.5-7B-Stock-Deep-Bespoke.json b/data/models/bunnycore_Qwen-2.5-7B-Stock-Deep-Bespoke.json similarity index 100% rename from data/bunnycore_Qwen-2.5-7B-Stock-Deep-Bespoke.json rename to data/models/bunnycore_Qwen-2.5-7B-Stock-Deep-Bespoke.json diff --git a/data/bunnycore_Qwen-2.5-7b-S1k.json b/data/models/bunnycore_Qwen-2.5-7b-S1k.json similarity index 100% rename from data/bunnycore_Qwen-2.5-7b-S1k.json rename to data/models/bunnycore_Qwen-2.5-7b-S1k.json diff --git a/data/bunnycore_Qwen2.5-1.5B-Model-Stock.json b/data/models/bunnycore_Qwen2.5-1.5B-Model-Stock.json similarity index 100% rename from data/bunnycore_Qwen2.5-1.5B-Model-Stock.json rename to data/models/bunnycore_Qwen2.5-1.5B-Model-Stock.json diff --git a/data/bunnycore_Qwen2.5-3B-Model-Stock-v2.json b/data/models/bunnycore_Qwen2.5-3B-Model-Stock-v2.json similarity index 100% rename from data/bunnycore_Qwen2.5-3B-Model-Stock-v2.json rename to data/models/bunnycore_Qwen2.5-3B-Model-Stock-v2.json diff --git a/data/bunnycore_Qwen2.5-3B-Model-Stock-v3.1.json b/data/models/bunnycore_Qwen2.5-3B-Model-Stock-v3.1.json similarity index 100% rename from data/bunnycore_Qwen2.5-3B-Model-Stock-v3.1.json rename to data/models/bunnycore_Qwen2.5-3B-Model-Stock-v3.1.json diff --git a/data/bunnycore_Qwen2.5-3B-Model-Stock-v3.2.json b/data/models/bunnycore_Qwen2.5-3B-Model-Stock-v3.2.json similarity index 100% rename from data/bunnycore_Qwen2.5-3B-Model-Stock-v3.2.json rename to data/models/bunnycore_Qwen2.5-3B-Model-Stock-v3.2.json diff --git a/data/bunnycore_Qwen2.5-3B-Model-Stock-v4.1.json b/data/models/bunnycore_Qwen2.5-3B-Model-Stock-v4.1.json similarity index 100% rename from data/bunnycore_Qwen2.5-3B-Model-Stock-v4.1.json rename to data/models/bunnycore_Qwen2.5-3B-Model-Stock-v4.1.json diff --git a/data/bunnycore_Qwen2.5-3B-Model-Stock.json b/data/models/bunnycore_Qwen2.5-3B-Model-Stock.json similarity index 100% rename from data/bunnycore_Qwen2.5-3B-Model-Stock.json rename to data/models/bunnycore_Qwen2.5-3B-Model-Stock.json diff --git a/data/bunnycore_Qwen2.5-3B-RP-Mix.json b/data/models/bunnycore_Qwen2.5-3B-RP-Mix.json similarity index 100% rename from data/bunnycore_Qwen2.5-3B-RP-Mix.json rename to data/models/bunnycore_Qwen2.5-3B-RP-Mix.json diff --git a/data/bunnycore_Qwen2.5-3B-RP-Thinker-V2.json b/data/models/bunnycore_Qwen2.5-3B-RP-Thinker-V2.json similarity index 100% rename from data/bunnycore_Qwen2.5-3B-RP-Thinker-V2.json rename to data/models/bunnycore_Qwen2.5-3B-RP-Thinker-V2.json diff --git a/data/bunnycore_Qwen2.5-3B-RP-Thinker.json b/data/models/bunnycore_Qwen2.5-3B-RP-Thinker.json similarity index 100% rename from data/bunnycore_Qwen2.5-3B-RP-Thinker.json rename to data/models/bunnycore_Qwen2.5-3B-RP-Thinker.json diff --git a/data/bunnycore_Qwen2.5-7B-CyberRombos.json b/data/models/bunnycore_Qwen2.5-7B-CyberRombos.json similarity index 100% rename from data/bunnycore_Qwen2.5-7B-CyberRombos.json rename to data/models/bunnycore_Qwen2.5-7B-CyberRombos.json diff --git a/data/bunnycore_Qwen2.5-7B-Fuse-Exp.json b/data/models/bunnycore_Qwen2.5-7B-Fuse-Exp.json similarity index 100% rename from data/bunnycore_Qwen2.5-7B-Fuse-Exp.json rename to data/models/bunnycore_Qwen2.5-7B-Fuse-Exp.json diff --git a/data/bunnycore_Qwen2.5-7B-Instruct-Fusion.json b/data/models/bunnycore_Qwen2.5-7B-Instruct-Fusion.json similarity index 100% rename from data/bunnycore_Qwen2.5-7B-Instruct-Fusion.json rename to data/models/bunnycore_Qwen2.5-7B-Instruct-Fusion.json diff --git a/data/bunnycore_Qwen2.5-7B-Instruct-Merge-Stock-v0.1.json b/data/models/bunnycore_Qwen2.5-7B-Instruct-Merge-Stock-v0.1.json similarity index 100% rename from data/bunnycore_Qwen2.5-7B-Instruct-Merge-Stock-v0.1.json rename to data/models/bunnycore_Qwen2.5-7B-Instruct-Merge-Stock-v0.1.json diff --git a/data/bunnycore_Qwen2.5-7B-MixStock-Sce-V0.3.json b/data/models/bunnycore_Qwen2.5-7B-MixStock-Sce-V0.3.json similarity index 100% rename from data/bunnycore_Qwen2.5-7B-MixStock-Sce-V0.3.json rename to data/models/bunnycore_Qwen2.5-7B-MixStock-Sce-V0.3.json diff --git a/data/bunnycore_Qwen2.5-7B-MixStock-V0.1.json b/data/models/bunnycore_Qwen2.5-7B-MixStock-V0.1.json similarity index 100% rename from data/bunnycore_Qwen2.5-7B-MixStock-V0.1.json rename to data/models/bunnycore_Qwen2.5-7B-MixStock-V0.1.json diff --git a/data/bunnycore_Qwen2.5-7B-R1-Bespoke-Stock.json b/data/models/bunnycore_Qwen2.5-7B-R1-Bespoke-Stock.json similarity index 100% rename from data/bunnycore_Qwen2.5-7B-R1-Bespoke-Stock.json rename to data/models/bunnycore_Qwen2.5-7B-R1-Bespoke-Stock.json diff --git a/data/bunnycore_Qwen2.5-7B-R1-Bespoke-Task.json b/data/models/bunnycore_Qwen2.5-7B-R1-Bespoke-Task.json similarity index 100% rename from data/bunnycore_Qwen2.5-7B-R1-Bespoke-Task.json rename to data/models/bunnycore_Qwen2.5-7B-R1-Bespoke-Task.json diff --git a/data/bunnycore_Qwen2.5-7B-RRP-1M-Thinker.json b/data/models/bunnycore_Qwen2.5-7B-RRP-1M-Thinker.json similarity index 100% rename from data/bunnycore_Qwen2.5-7B-RRP-1M-Thinker.json rename to data/models/bunnycore_Qwen2.5-7B-RRP-1M-Thinker.json diff --git a/data/bunnycore_Qwen2.5-7B-RRP-1M.json b/data/models/bunnycore_Qwen2.5-7B-RRP-1M.json similarity index 100% rename from data/bunnycore_Qwen2.5-7B-RRP-1M.json rename to data/models/bunnycore_Qwen2.5-7B-RRP-1M.json diff --git a/data/bunnycore_Qwen2.5-7B-RRP-ID.json b/data/models/bunnycore_Qwen2.5-7B-RRP-ID.json similarity index 100% rename from data/bunnycore_Qwen2.5-7B-RRP-ID.json rename to data/models/bunnycore_Qwen2.5-7B-RRP-ID.json diff --git a/data/bunnycore_Qwen2.5-7B-Sky-R1-Mini.json b/data/models/bunnycore_Qwen2.5-7B-Sky-R1-Mini.json similarity index 100% rename from data/bunnycore_Qwen2.5-7B-Sky-R1-Mini.json rename to data/models/bunnycore_Qwen2.5-7B-Sky-R1-Mini.json diff --git a/data/bunnycore_QwenMosaic-7B.json b/data/models/bunnycore_QwenMosaic-7B.json similarity index 100% rename from data/bunnycore_QwenMosaic-7B.json rename to data/models/bunnycore_QwenMosaic-7B.json diff --git a/data/bunnycore_Smol-Llama-3.2-3B.json b/data/models/bunnycore_Smol-Llama-3.2-3B.json similarity index 100% rename from data/bunnycore_Smol-Llama-3.2-3B.json rename to data/models/bunnycore_Smol-Llama-3.2-3B.json diff --git a/data/bunnycore_SmolLM2-1.7-Persona.json b/data/models/bunnycore_SmolLM2-1.7-Persona.json similarity index 100% rename from data/bunnycore_SmolLM2-1.7-Persona.json rename to data/models/bunnycore_SmolLM2-1.7-Persona.json diff --git a/data/bunnycore_SmolLM2-1.7B-roleplay-lora.json b/data/models/bunnycore_SmolLM2-1.7B-roleplay-lora.json similarity index 100% rename from data/bunnycore_SmolLM2-1.7B-roleplay-lora.json rename to data/models/bunnycore_SmolLM2-1.7B-roleplay-lora.json diff --git a/data/bunnycore_Tulu-3.1-8B-SuperNova.json b/data/models/bunnycore_Tulu-3.1-8B-SuperNova.json similarity index 100% rename from data/bunnycore_Tulu-3.1-8B-SuperNova.json rename to data/models/bunnycore_Tulu-3.1-8B-SuperNova.json diff --git a/data/byroneverson_Mistral-Small-Instruct-2409-abliterated.json b/data/models/byroneverson_Mistral-Small-Instruct-2409-abliterated.json similarity index 100% rename from data/byroneverson_Mistral-Small-Instruct-2409-abliterated.json rename to data/models/byroneverson_Mistral-Small-Instruct-2409-abliterated.json diff --git a/data/byroneverson_Yi-1.5-9B-Chat-16K-abliterated.json b/data/models/byroneverson_Yi-1.5-9B-Chat-16K-abliterated.json similarity index 100% rename from data/byroneverson_Yi-1.5-9B-Chat-16K-abliterated.json rename to data/models/byroneverson_Yi-1.5-9B-Chat-16K-abliterated.json diff --git a/data/byroneverson_Yi-1.5-9B-Chat-abliterated.json b/data/models/byroneverson_Yi-1.5-9B-Chat-abliterated.json similarity index 100% rename from data/byroneverson_Yi-1.5-9B-Chat-abliterated.json rename to data/models/byroneverson_Yi-1.5-9B-Chat-abliterated.json diff --git a/data/bytedance_doubao-seed-1-6-thinking-250615.json b/data/models/bytedance_doubao-seed-1-6-thinking-250615.json similarity index 100% rename from data/bytedance_doubao-seed-1-6-thinking-250615.json rename to data/models/bytedance_doubao-seed-1-6-thinking-250615.json diff --git a/data/c10x_Q-Pluse.json b/data/models/c10x_Q-Pluse.json similarity index 100% rename from data/c10x_Q-Pluse.json rename to data/models/c10x_Q-Pluse.json diff --git a/data/c10x_longthinker.json b/data/models/c10x_longthinker.json similarity index 100% rename from data/c10x_longthinker.json rename to data/models/c10x_longthinker.json diff --git a/data/carsenk_flippa-v6.json b/data/models/carsenk_flippa-v6.json similarity index 100% rename from data/carsenk_flippa-v6.json rename to data/models/carsenk_flippa-v6.json diff --git a/data/carsenk_phi3.5_mini_exp_825_uncensored.json b/data/models/carsenk_phi3.5_mini_exp_825_uncensored.json similarity index 100% rename from data/carsenk_phi3.5_mini_exp_825_uncensored.json rename to data/models/carsenk_phi3.5_mini_exp_825_uncensored.json diff --git a/data/cat-searcher_gemma-2-9b-it-sppo-iter-1-evol-1.json b/data/models/cat-searcher_gemma-2-9b-it-sppo-iter-1-evol-1.json similarity index 100% rename from data/cat-searcher_gemma-2-9b-it-sppo-iter-1-evol-1.json rename to data/models/cat-searcher_gemma-2-9b-it-sppo-iter-1-evol-1.json diff --git a/data/cat-searcher_gemma-2-9b-it-sppo-iter-1.json b/data/models/cat-searcher_gemma-2-9b-it-sppo-iter-1.json similarity index 100% rename from data/cat-searcher_gemma-2-9b-it-sppo-iter-1.json rename to data/models/cat-searcher_gemma-2-9b-it-sppo-iter-1.json diff --git a/data/cckm_tinymistral_950m.json b/data/models/cckm_tinymistral_950m.json similarity index 100% rename from data/cckm_tinymistral_950m.json rename to data/models/cckm_tinymistral_950m.json diff --git a/data/cgato_TheSalt-L3-8b-v0.3.2.json b/data/models/cgato_TheSalt-L3-8b-v0.3.2.json similarity index 100% rename from data/cgato_TheSalt-L3-8b-v0.3.2.json rename to data/models/cgato_TheSalt-L3-8b-v0.3.2.json diff --git a/data/chargoddard_prometheus-2-llama-3-8b.json b/data/models/chargoddard_prometheus-2-llama-3-8b.json similarity index 100% rename from data/chargoddard_prometheus-2-llama-3-8b.json rename to data/models/chargoddard_prometheus-2-llama-3-8b.json diff --git a/data/chujiezheng_Llama-3-Instruct-8B-SimPO-ExPO.json b/data/models/chujiezheng_Llama-3-Instruct-8B-SimPO-ExPO.json similarity index 100% rename from data/chujiezheng_Llama-3-Instruct-8B-SimPO-ExPO.json rename to data/models/chujiezheng_Llama-3-Instruct-8B-SimPO-ExPO.json diff --git a/data/chujiezheng_Mistral7B-PairRM-SPPO-ExPO.json b/data/models/chujiezheng_Mistral7B-PairRM-SPPO-ExPO.json similarity index 100% rename from data/chujiezheng_Mistral7B-PairRM-SPPO-ExPO.json rename to data/models/chujiezheng_Mistral7B-PairRM-SPPO-ExPO.json diff --git a/data/cjvt_GaMS-1B.json b/data/models/cjvt_GaMS-1B.json similarity index 100% rename from data/cjvt_GaMS-1B.json rename to data/models/cjvt_GaMS-1B.json diff --git a/data/cloudyu_Llama-3-70Bx2-MOE.json b/data/models/cloudyu_Llama-3-70Bx2-MOE.json similarity index 100% rename from data/cloudyu_Llama-3-70Bx2-MOE.json rename to data/models/cloudyu_Llama-3-70Bx2-MOE.json diff --git a/data/cloudyu_Llama-3.2-3Bx4.json b/data/models/cloudyu_Llama-3.2-3Bx4.json similarity index 100% rename from data/cloudyu_Llama-3.2-3Bx4.json rename to data/models/cloudyu_Llama-3.2-3Bx4.json diff --git a/data/cloudyu_Mixtral_11Bx2_MoE_19B.json b/data/models/cloudyu_Mixtral_11Bx2_MoE_19B.json similarity index 100% rename from data/cloudyu_Mixtral_11Bx2_MoE_19B.json rename to data/models/cloudyu_Mixtral_11Bx2_MoE_19B.json diff --git a/data/cloudyu_Mixtral_34Bx2_MoE_60B.json b/data/models/cloudyu_Mixtral_34Bx2_MoE_60B.json similarity index 100% rename from data/cloudyu_Mixtral_34Bx2_MoE_60B.json rename to data/models/cloudyu_Mixtral_34Bx2_MoE_60B.json diff --git a/data/cloudyu_Mixtral_7Bx2_MoE.json b/data/models/cloudyu_Mixtral_7Bx2_MoE.json similarity index 100% rename from data/cloudyu_Mixtral_7Bx2_MoE.json rename to data/models/cloudyu_Mixtral_7Bx2_MoE.json diff --git a/data/cloudyu_S1-Llama-3.2-3Bx4-MoE.json b/data/models/cloudyu_S1-Llama-3.2-3Bx4-MoE.json similarity index 100% rename from data/cloudyu_S1-Llama-3.2-3Bx4-MoE.json rename to data/models/cloudyu_S1-Llama-3.2-3Bx4-MoE.json diff --git a/data/cloudyu_Yi-34Bx2-MoE-60B-DPO.json b/data/models/cloudyu_Yi-34Bx2-MoE-60B-DPO.json similarity index 100% rename from data/cloudyu_Yi-34Bx2-MoE-60B-DPO.json rename to data/models/cloudyu_Yi-34Bx2-MoE-60B-DPO.json diff --git a/data/cluebbers_Llama-3.1-8B-paraphrase-type-generation-apty-ipo.json b/data/models/cluebbers_Llama-3.1-8B-paraphrase-type-generation-apty-ipo.json similarity index 100% rename from data/cluebbers_Llama-3.1-8B-paraphrase-type-generation-apty-ipo.json rename to data/models/cluebbers_Llama-3.1-8B-paraphrase-type-generation-apty-ipo.json diff --git a/data/cluebbers_Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid.json b/data/models/cluebbers_Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid.json similarity index 100% rename from data/cluebbers_Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid.json rename to data/models/cluebbers_Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid.json diff --git a/data/cluebbers_Llama-3.1-8B-paraphrase-type-generation-etpc.json b/data/models/cluebbers_Llama-3.1-8B-paraphrase-type-generation-etpc.json similarity index 100% rename from data/cluebbers_Llama-3.1-8B-paraphrase-type-generation-etpc.json rename to data/models/cluebbers_Llama-3.1-8B-paraphrase-type-generation-etpc.json diff --git a/data/cognitivecomputations_Dolphin3.0-Llama3.1-8B.json b/data/models/cognitivecomputations_Dolphin3.0-Llama3.1-8B.json similarity index 100% rename from data/cognitivecomputations_Dolphin3.0-Llama3.1-8B.json rename to data/models/cognitivecomputations_Dolphin3.0-Llama3.1-8B.json diff --git a/data/cognitivecomputations_Dolphin3.0-Llama3.2-1B.json b/data/models/cognitivecomputations_Dolphin3.0-Llama3.2-1B.json similarity index 100% rename from data/cognitivecomputations_Dolphin3.0-Llama3.2-1B.json rename to data/models/cognitivecomputations_Dolphin3.0-Llama3.2-1B.json diff --git a/data/cognitivecomputations_Dolphin3.0-Qwen2.5-0.5B.json b/data/models/cognitivecomputations_Dolphin3.0-Qwen2.5-0.5B.json similarity index 100% rename from data/cognitivecomputations_Dolphin3.0-Qwen2.5-0.5B.json rename to data/models/cognitivecomputations_Dolphin3.0-Qwen2.5-0.5B.json diff --git a/data/cognitivecomputations_Dolphin3.0-R1-Mistral-24B.json b/data/models/cognitivecomputations_Dolphin3.0-R1-Mistral-24B.json similarity index 100% rename from data/cognitivecomputations_Dolphin3.0-R1-Mistral-24B.json rename to data/models/cognitivecomputations_Dolphin3.0-R1-Mistral-24B.json diff --git a/data/cognitivecomputations_dolphin-2.9-llama3-8b.json b/data/models/cognitivecomputations_dolphin-2.9-llama3-8b.json similarity index 100% rename from data/cognitivecomputations_dolphin-2.9-llama3-8b.json rename to data/models/cognitivecomputations_dolphin-2.9-llama3-8b.json diff --git a/data/cognitivecomputations_dolphin-2.9.1-llama-3-70b.json b/data/models/cognitivecomputations_dolphin-2.9.1-llama-3-70b.json similarity index 100% rename from data/cognitivecomputations_dolphin-2.9.1-llama-3-70b.json rename to data/models/cognitivecomputations_dolphin-2.9.1-llama-3-70b.json diff --git a/data/cognitivecomputations_dolphin-2.9.1-yi-1.5-34b.json b/data/models/cognitivecomputations_dolphin-2.9.1-yi-1.5-34b.json similarity index 100% rename from data/cognitivecomputations_dolphin-2.9.1-yi-1.5-34b.json rename to data/models/cognitivecomputations_dolphin-2.9.1-yi-1.5-34b.json diff --git a/data/cognitivecomputations_dolphin-2.9.1-yi-1.5-9b.json b/data/models/cognitivecomputations_dolphin-2.9.1-yi-1.5-9b.json similarity index 100% rename from data/cognitivecomputations_dolphin-2.9.1-yi-1.5-9b.json rename to data/models/cognitivecomputations_dolphin-2.9.1-yi-1.5-9b.json diff --git a/data/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium-abliterated.json b/data/models/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium-abliterated.json similarity index 99% rename from data/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium-abliterated.json rename to data/models/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium-abliterated.json index 5a05bbf781307c09b622fb50fc09f0f1a3fd17e9..4942452bc652e1b0662275861da63864a9a53556 100644 --- a/data/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium-abliterated.json +++ b/data/models/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium-abliterated.json @@ -5,7 +5,7 @@ "developer": "cognitivecomputations", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "MistralForCausalLM", "params_billions": "13.96" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4124 + "score": 0.3613 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6383 + "score": 0.6123 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.182 + "score": 0.1239 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3289 + "score": 0.328 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4349 + "score": 0.4112 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4525 + "score": 0.4494 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3613 + "score": 0.4124 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6123 + "score": 0.6383 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1239 + "score": 0.182 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.328 + "score": 0.3289 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4112 + "score": 0.4349 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4494 + "score": 0.4525 } } ], diff --git a/data/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium.json b/data/models/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium.json similarity index 100% rename from data/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium.json rename to data/models/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium.json diff --git a/data/cognitivecomputations_dolphin-2.9.2-qwen2-72b.json b/data/models/cognitivecomputations_dolphin-2.9.2-qwen2-72b.json similarity index 100% rename from data/cognitivecomputations_dolphin-2.9.2-qwen2-72b.json rename to data/models/cognitivecomputations_dolphin-2.9.2-qwen2-72b.json diff --git a/data/cognitivecomputations_dolphin-2.9.2-qwen2-7b.json b/data/models/cognitivecomputations_dolphin-2.9.2-qwen2-7b.json similarity index 100% rename from data/cognitivecomputations_dolphin-2.9.2-qwen2-7b.json rename to data/models/cognitivecomputations_dolphin-2.9.2-qwen2-7b.json diff --git a/data/cognitivecomputations_dolphin-2.9.3-Yi-1.5-34B-32k.json b/data/models/cognitivecomputations_dolphin-2.9.3-Yi-1.5-34B-32k.json similarity index 100% rename from data/cognitivecomputations_dolphin-2.9.3-Yi-1.5-34B-32k.json rename to data/models/cognitivecomputations_dolphin-2.9.3-Yi-1.5-34B-32k.json diff --git a/data/cognitivecomputations_dolphin-2.9.3-mistral-7B-32k.json b/data/models/cognitivecomputations_dolphin-2.9.3-mistral-7B-32k.json similarity index 100% rename from data/cognitivecomputations_dolphin-2.9.3-mistral-7B-32k.json rename to data/models/cognitivecomputations_dolphin-2.9.3-mistral-7B-32k.json diff --git a/data/cognitivecomputations_dolphin-2.9.3-mistral-nemo-12b.json b/data/models/cognitivecomputations_dolphin-2.9.3-mistral-nemo-12b.json similarity index 100% rename from data/cognitivecomputations_dolphin-2.9.3-mistral-nemo-12b.json rename to data/models/cognitivecomputations_dolphin-2.9.3-mistral-nemo-12b.json diff --git a/data/cognitivecomputations_dolphin-2.9.4-gemma2-2b.json b/data/models/cognitivecomputations_dolphin-2.9.4-gemma2-2b.json similarity index 100% rename from data/cognitivecomputations_dolphin-2.9.4-gemma2-2b.json rename to data/models/cognitivecomputations_dolphin-2.9.4-gemma2-2b.json diff --git a/data/cognitivecomputations_dolphin-2.9.4-llama3.1-8b.json b/data/models/cognitivecomputations_dolphin-2.9.4-llama3.1-8b.json similarity index 100% rename from data/cognitivecomputations_dolphin-2.9.4-llama3.1-8b.json rename to data/models/cognitivecomputations_dolphin-2.9.4-llama3.1-8b.json diff --git a/data/cohere_Cohere-Command-beta-52.4B.json b/data/models/cohere_Cohere-Command-beta-52.4B.json similarity index 100% rename from data/cohere_Cohere-Command-beta-52.4B.json rename to data/models/cohere_Cohere-Command-beta-52.4B.json diff --git a/data/cohere_Cohere-Command-beta-6.1B.json b/data/models/cohere_Cohere-Command-beta-6.1B.json similarity index 100% rename from data/cohere_Cohere-Command-beta-6.1B.json rename to data/models/cohere_Cohere-Command-beta-6.1B.json diff --git a/data/cohere_Cohere-large-v20220720-13.1B.json b/data/models/cohere_Cohere-large-v20220720-13.1B.json similarity index 100% rename from data/cohere_Cohere-large-v20220720-13.1B.json rename to data/models/cohere_Cohere-large-v20220720-13.1B.json diff --git a/data/cohere_Cohere-medium-v20220720-6.1B.json b/data/models/cohere_Cohere-medium-v20220720-6.1B.json similarity index 100% rename from data/cohere_Cohere-medium-v20220720-6.1B.json rename to data/models/cohere_Cohere-medium-v20220720-6.1B.json diff --git a/data/cohere_Cohere-medium-v20221108-6.1B.json b/data/models/cohere_Cohere-medium-v20221108-6.1B.json similarity index 100% rename from data/cohere_Cohere-medium-v20221108-6.1B.json rename to data/models/cohere_Cohere-medium-v20221108-6.1B.json diff --git a/data/cohere_Cohere-small-v20220720-410M.json b/data/models/cohere_Cohere-small-v20220720-410M.json similarity index 100% rename from data/cohere_Cohere-small-v20220720-410M.json rename to data/models/cohere_Cohere-small-v20220720-410M.json diff --git a/data/cohere_Cohere-xlarge-v20220609-52.4B.json b/data/models/cohere_Cohere-xlarge-v20220609-52.4B.json similarity index 100% rename from data/cohere_Cohere-xlarge-v20220609-52.4B.json rename to data/models/cohere_Cohere-xlarge-v20220609-52.4B.json diff --git a/data/cohere_Cohere-xlarge-v20221108-52.4B.json b/data/models/cohere_Cohere-xlarge-v20221108-52.4B.json similarity index 100% rename from data/cohere_Cohere-xlarge-v20221108-52.4B.json rename to data/models/cohere_Cohere-xlarge-v20221108-52.4B.json diff --git a/data/cohere_aya-expanse-32b.json b/data/models/cohere_aya-expanse-32b.json similarity index 100% rename from data/cohere_aya-expanse-32b.json rename to data/models/cohere_aya-expanse-32b.json diff --git a/data/cohere_command-a-03-2025.json b/data/models/cohere_command-a-03-2025.json similarity index 100% rename from data/cohere_command-a-03-2025.json rename to data/models/cohere_command-a-03-2025.json diff --git a/data/cohere_command-light.json b/data/models/cohere_command-light.json similarity index 100% rename from data/cohere_command-light.json rename to data/models/cohere_command-light.json diff --git a/data/cohere_command-r-plus.json b/data/models/cohere_command-r-plus.json similarity index 100% rename from data/cohere_command-r-plus.json rename to data/models/cohere_command-r-plus.json diff --git a/data/cohere_command-r.json b/data/models/cohere_command-r.json similarity index 100% rename from data/cohere_command-r.json rename to data/models/cohere_command-r.json diff --git a/data/cohere_command-xlarge-beta.json b/data/models/cohere_command-xlarge-beta.json similarity index 100% rename from data/cohere_command-xlarge-beta.json rename to data/models/cohere_command-xlarge-beta.json diff --git a/data/cohere_command.json b/data/models/cohere_command.json similarity index 100% rename from data/cohere_command.json rename to data/models/cohere_command.json diff --git a/data/collaiborateorg_Collaiborator-MEDLLM-Llama-3-8B-v2.json b/data/models/collaiborateorg_Collaiborator-MEDLLM-Llama-3-8B-v2.json similarity index 100% rename from data/collaiborateorg_Collaiborator-MEDLLM-Llama-3-8B-v2.json rename to data/models/collaiborateorg_Collaiborator-MEDLLM-Llama-3-8B-v2.json diff --git a/data/cpayne1303_cp2024-instruct.json b/data/models/cpayne1303_cp2024-instruct.json similarity index 100% rename from data/cpayne1303_cp2024-instruct.json rename to data/models/cpayne1303_cp2024-instruct.json diff --git a/data/cpayne1303_cp2024.json b/data/models/cpayne1303_cp2024.json similarity index 100% rename from data/cpayne1303_cp2024.json rename to data/models/cpayne1303_cp2024.json diff --git a/data/cpayne1303_llama-43m-beta.json b/data/models/cpayne1303_llama-43m-beta.json similarity index 100% rename from data/cpayne1303_llama-43m-beta.json rename to data/models/cpayne1303_llama-43m-beta.json diff --git a/data/cpayne1303_smallcp2024.json b/data/models/cpayne1303_smallcp2024.json similarity index 100% rename from data/cpayne1303_smallcp2024.json rename to data/models/cpayne1303_smallcp2024.json diff --git a/data/crestf411_MN-Slush.json b/data/models/crestf411_MN-Slush.json similarity index 100% rename from data/crestf411_MN-Slush.json rename to data/models/crestf411_MN-Slush.json diff --git a/data/cstr_llama3.1-8b-spaetzle-v90.json b/data/models/cstr_llama3.1-8b-spaetzle-v90.json similarity index 100% rename from data/cstr_llama3.1-8b-spaetzle-v90.json rename to data/models/cstr_llama3.1-8b-spaetzle-v90.json diff --git a/data/cyberagent_calm3-22b-chat.json b/data/models/cyberagent_calm3-22b-chat.json similarity index 100% rename from data/cyberagent_calm3-22b-chat.json rename to data/models/cyberagent_calm3-22b-chat.json diff --git a/data/darkc0de_BuddyGlassNeverSleeps.json b/data/models/darkc0de_BuddyGlassNeverSleeps.json similarity index 100% rename from data/darkc0de_BuddyGlassNeverSleeps.json rename to data/models/darkc0de_BuddyGlassNeverSleeps.json diff --git a/data/darkc0de_BuddyGlassUncensored2025.2.json b/data/models/darkc0de_BuddyGlassUncensored2025.2.json similarity index 100% rename from data/darkc0de_BuddyGlassUncensored2025.2.json rename to data/models/darkc0de_BuddyGlassUncensored2025.2.json diff --git a/data/darkc0de_BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp.json b/data/models/darkc0de_BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp.json similarity index 100% rename from data/darkc0de_BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp.json rename to data/models/darkc0de_BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp.json diff --git a/data/databricks_dbrx-base.json b/data/models/databricks_dbrx-base.json similarity index 100% rename from data/databricks_dbrx-base.json rename to data/models/databricks_dbrx-base.json diff --git a/data/databricks_dbrx-instruct.json b/data/models/databricks_dbrx-instruct.json similarity index 100% rename from data/databricks_dbrx-instruct.json rename to data/models/databricks_dbrx-instruct.json diff --git a/data/databricks_dolly-v1-6b.json b/data/models/databricks_dolly-v1-6b.json similarity index 100% rename from data/databricks_dolly-v1-6b.json rename to data/models/databricks_dolly-v1-6b.json diff --git a/data/databricks_dolly-v2-12b.json b/data/models/databricks_dolly-v2-12b.json similarity index 100% rename from data/databricks_dolly-v2-12b.json rename to data/models/databricks_dolly-v2-12b.json diff --git a/data/databricks_dolly-v2-3b.json b/data/models/databricks_dolly-v2-3b.json similarity index 100% rename from data/databricks_dolly-v2-3b.json rename to data/models/databricks_dolly-v2-3b.json diff --git a/data/databricks_dolly-v2-7b.json b/data/models/databricks_dolly-v2-7b.json similarity index 100% rename from data/databricks_dolly-v2-7b.json rename to data/models/databricks_dolly-v2-7b.json diff --git a/data/davidkim205_Rhea-72b-v0.5.json b/data/models/davidkim205_Rhea-72b-v0.5.json similarity index 100% rename from data/davidkim205_Rhea-72b-v0.5.json rename to data/models/davidkim205_Rhea-72b-v0.5.json diff --git a/data/davidkim205_nox-solar-10.7b-v4.json b/data/models/davidkim205_nox-solar-10.7b-v4.json similarity index 100% rename from data/davidkim205_nox-solar-10.7b-v4.json rename to data/models/davidkim205_nox-solar-10.7b-v4.json diff --git a/data/deepseek-ai_DeepSeek-R1-Distill-Llama-70B.json b/data/models/deepseek-ai_DeepSeek-R1-Distill-Llama-70B.json similarity index 100% rename from data/deepseek-ai_DeepSeek-R1-Distill-Llama-70B.json rename to data/models/deepseek-ai_DeepSeek-R1-Distill-Llama-70B.json diff --git a/data/deepseek-ai_DeepSeek-R1-Distill-Llama-8B.json b/data/models/deepseek-ai_DeepSeek-R1-Distill-Llama-8B.json similarity index 100% rename from data/deepseek-ai_DeepSeek-R1-Distill-Llama-8B.json rename to data/models/deepseek-ai_DeepSeek-R1-Distill-Llama-8B.json diff --git a/data/deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B.json b/data/models/deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B.json similarity index 100% rename from data/deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B.json rename to data/models/deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B.json diff --git a/data/deepseek-ai_DeepSeek-R1-Distill-Qwen-14B.json b/data/models/deepseek-ai_DeepSeek-R1-Distill-Qwen-14B.json similarity index 100% rename from data/deepseek-ai_DeepSeek-R1-Distill-Qwen-14B.json rename to data/models/deepseek-ai_DeepSeek-R1-Distill-Qwen-14B.json diff --git a/data/deepseek-ai_DeepSeek-R1-Distill-Qwen-32B.json b/data/models/deepseek-ai_DeepSeek-R1-Distill-Qwen-32B.json similarity index 100% rename from data/deepseek-ai_DeepSeek-R1-Distill-Qwen-32B.json rename to data/models/deepseek-ai_DeepSeek-R1-Distill-Qwen-32B.json diff --git a/data/deepseek-ai_DeepSeek-R1-Distill-Qwen-7B.json b/data/models/deepseek-ai_DeepSeek-R1-Distill-Qwen-7B.json similarity index 100% rename from data/deepseek-ai_DeepSeek-R1-Distill-Qwen-7B.json rename to data/models/deepseek-ai_DeepSeek-R1-Distill-Qwen-7B.json diff --git a/data/deepseek-ai_deepseek-llm-67b-chat.json b/data/models/deepseek-ai_deepseek-llm-67b-chat.json similarity index 100% rename from data/deepseek-ai_deepseek-llm-67b-chat.json rename to data/models/deepseek-ai_deepseek-llm-67b-chat.json diff --git a/data/deepseek-ai_deepseek-llm-7b-base.json b/data/models/deepseek-ai_deepseek-llm-7b-base.json similarity index 100% rename from data/deepseek-ai_deepseek-llm-7b-base.json rename to data/models/deepseek-ai_deepseek-llm-7b-base.json diff --git a/data/deepseek-ai_deepseek-llm-7b-chat.json b/data/models/deepseek-ai_deepseek-llm-7b-chat.json similarity index 100% rename from data/deepseek-ai_deepseek-llm-7b-chat.json rename to data/models/deepseek-ai_deepseek-llm-7b-chat.json diff --git a/data/deepseek-ai_deepseek-moe-16b-base.json b/data/models/deepseek-ai_deepseek-moe-16b-base.json similarity index 100% rename from data/deepseek-ai_deepseek-moe-16b-base.json rename to data/models/deepseek-ai_deepseek-moe-16b-base.json diff --git a/data/deepseek-ai_deepseek-moe-16b-chat.json b/data/models/deepseek-ai_deepseek-moe-16b-chat.json similarity index 100% rename from data/deepseek-ai_deepseek-moe-16b-chat.json rename to data/models/deepseek-ai_deepseek-moe-16b-chat.json diff --git a/data/deepseek-ai_deepseek-r1-0528.json b/data/models/deepseek-ai_deepseek-r1-0528.json similarity index 100% rename from data/deepseek-ai_deepseek-r1-0528.json rename to data/models/deepseek-ai_deepseek-r1-0528.json diff --git a/data/deepseek-ai_deepseek-v3.json b/data/models/deepseek-ai_deepseek-v3.json similarity index 100% rename from data/deepseek-ai_deepseek-v3.json rename to data/models/deepseek-ai_deepseek-v3.json diff --git a/data/deepseek_chat-v3-0324.json b/data/models/deepseek_chat-v3-0324.json similarity index 100% rename from data/deepseek_chat-v3-0324.json rename to data/models/deepseek_chat-v3-0324.json diff --git a/data/deepseek_deepseek-r1-0528.json b/data/models/deepseek_deepseek-r1-0528.json similarity index 100% rename from data/deepseek_deepseek-r1-0528.json rename to data/models/deepseek_deepseek-r1-0528.json index 22f7f1e7159f38877f415a9a6f07905fabcb5d33..712c8d82096fdfa393951ac4421e5d6c18ee722d 100644 --- a/data/deepseek_deepseek-r1-0528.json +++ b/data/models/deepseek_deepseek-r1-0528.json @@ -10,8 +10,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/deepseek_deepseek-r1-0528/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/deepseek_deepseek-r1-0528/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -525,8 +525,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/deepseek_deepseek-r1-0528/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/deepseek_deepseek-r1-0528/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/deepseek_deepseek-v3.1.json b/data/models/deepseek_deepseek-v3.1.json similarity index 100% rename from data/deepseek_deepseek-v3.1.json rename to data/models/deepseek_deepseek-v3.1.json index 52e144fa11c26943b339cba2c9fa28b1d9098c61..28271d97eb42a16e44db35269bdfee0d33d2475a 100644 --- a/data/deepseek_deepseek-v3.1.json +++ b/data/models/deepseek_deepseek-v3.1.json @@ -7,8 +7,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/deepseek_deepseek-v3.1/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/deepseek_deepseek-v3.1/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -522,8 +522,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/deepseek_deepseek-v3.1/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/deepseek_deepseek-v3.1/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/deepseek_deepseek-v3.2.json b/data/models/deepseek_deepseek-v3.2.json similarity index 100% rename from data/deepseek_deepseek-v3.2.json rename to data/models/deepseek_deepseek-v3.2.json diff --git a/data/deepseek_ep-20250214004308-p7n89.json b/data/models/deepseek_ep-20250214004308-p7n89.json similarity index 100% rename from data/deepseek_ep-20250214004308-p7n89.json rename to data/models/deepseek_ep-20250214004308-p7n89.json diff --git a/data/deepseek_ep-20250228232227-z44x5.json b/data/models/deepseek_ep-20250228232227-z44x5.json similarity index 100% rename from data/deepseek_ep-20250228232227-z44x5.json rename to data/models/deepseek_ep-20250228232227-z44x5.json diff --git a/data/deepseek_ep-20250603132404-cgpjm.json b/data/models/deepseek_ep-20250603132404-cgpjm.json similarity index 100% rename from data/deepseek_ep-20250603132404-cgpjm.json rename to data/models/deepseek_ep-20250603132404-cgpjm.json diff --git a/data/dfurman_CalmeRys-78B-Orpo-v0.1.json b/data/models/dfurman_CalmeRys-78B-Orpo-v0.1.json similarity index 100% rename from data/dfurman_CalmeRys-78B-Orpo-v0.1.json rename to data/models/dfurman_CalmeRys-78B-Orpo-v0.1.json diff --git a/data/dfurman_Llama-3-70B-Orpo-v0.1.json b/data/models/dfurman_Llama-3-70B-Orpo-v0.1.json similarity index 100% rename from data/dfurman_Llama-3-70B-Orpo-v0.1.json rename to data/models/dfurman_Llama-3-70B-Orpo-v0.1.json diff --git a/data/dfurman_Llama-3-8B-Orpo-v0.1.json b/data/models/dfurman_Llama-3-8B-Orpo-v0.1.json similarity index 99% rename from data/dfurman_Llama-3-8B-Orpo-v0.1.json rename to data/models/dfurman_Llama-3-8B-Orpo-v0.1.json index 645928a26f09395b1513ab598bb415bff5505656..795fa88e22654a896cb3162376f7a81e84579c64 100644 --- a/data/dfurman_Llama-3-8B-Orpo-v0.1.json +++ b/data/models/dfurman_Llama-3-8B-Orpo-v0.1.json @@ -5,8 +5,8 @@ "developer": "dfurman", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", + "precision": "float16", + "architecture": "?", "params_billions": "8.03" } }, @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2835 + "score": 0.3 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3842 + "score": 0.3853 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0521 + "score": 0.0415 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2609 + "score": 0.2617 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3566 + "score": 0.3579 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2298 + "score": 0.2281 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3 + "score": 0.2835 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3853 + "score": 0.3842 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0415 + "score": 0.0521 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2617 + "score": 0.2609 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3579 + "score": 0.3566 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2281 + "score": 0.2298 } } ], diff --git a/data/dfurman_Qwen2-72B-Orpo-v0.1.json b/data/models/dfurman_Qwen2-72B-Orpo-v0.1.json similarity index 100% rename from data/dfurman_Qwen2-72B-Orpo-v0.1.json rename to data/models/dfurman_Qwen2-72B-Orpo-v0.1.json diff --git a/data/dicta-il_dictalm2.0-instruct.json b/data/models/dicta-il_dictalm2.0-instruct.json similarity index 100% rename from data/dicta-il_dictalm2.0-instruct.json rename to data/models/dicta-il_dictalm2.0-instruct.json diff --git a/data/dicta-il_dictalm2.0.json b/data/models/dicta-il_dictalm2.0.json similarity index 100% rename from data/dicta-il_dictalm2.0.json rename to data/models/dicta-il_dictalm2.0.json diff --git a/data/distilbert_distilgpt2.json b/data/models/distilbert_distilgpt2.json similarity index 100% rename from data/distilbert_distilgpt2.json rename to data/models/distilbert_distilgpt2.json diff --git a/data/divyanshukunwar_SASTRI_1_9B.json b/data/models/divyanshukunwar_SASTRI_1_9B.json similarity index 100% rename from data/divyanshukunwar_SASTRI_1_9B.json rename to data/models/divyanshukunwar_SASTRI_1_9B.json diff --git a/data/djuna-test-lab_TEST-L3.2-ReWish-3B-ties-w-base.json b/data/models/djuna-test-lab_TEST-L3.2-ReWish-3B-ties-w-base.json similarity index 100% rename from data/djuna-test-lab_TEST-L3.2-ReWish-3B-ties-w-base.json rename to data/models/djuna-test-lab_TEST-L3.2-ReWish-3B-ties-w-base.json diff --git a/data/djuna-test-lab_TEST-L3.2-ReWish-3B.json b/data/models/djuna-test-lab_TEST-L3.2-ReWish-3B.json similarity index 100% rename from data/djuna-test-lab_TEST-L3.2-ReWish-3B.json rename to data/models/djuna-test-lab_TEST-L3.2-ReWish-3B.json diff --git a/data/djuna_G2-BigGSHT-27B-2.json b/data/models/djuna_G2-BigGSHT-27B-2.json similarity index 100% rename from data/djuna_G2-BigGSHT-27B-2.json rename to data/models/djuna_G2-BigGSHT-27B-2.json diff --git a/data/djuna_G2-GSHT.json b/data/models/djuna_G2-GSHT.json similarity index 100% rename from data/djuna_G2-GSHT.json rename to data/models/djuna_G2-GSHT.json diff --git a/data/djuna_Gemma-2-gemmama-9b.json b/data/models/djuna_Gemma-2-gemmama-9b.json similarity index 100% rename from data/djuna_Gemma-2-gemmama-9b.json rename to data/models/djuna_Gemma-2-gemmama-9b.json diff --git a/data/djuna_L3.1-ForStHS.json b/data/models/djuna_L3.1-ForStHS.json similarity index 100% rename from data/djuna_L3.1-ForStHS.json rename to data/models/djuna_L3.1-ForStHS.json diff --git a/data/djuna_L3.1-Promissum_Mane-8B-Della-1.5-calc.json b/data/models/djuna_L3.1-Promissum_Mane-8B-Della-1.5-calc.json similarity index 100% rename from data/djuna_L3.1-Promissum_Mane-8B-Della-1.5-calc.json rename to data/models/djuna_L3.1-Promissum_Mane-8B-Della-1.5-calc.json diff --git a/data/djuna_L3.1-Promissum_Mane-8B-Della-calc.json b/data/models/djuna_L3.1-Promissum_Mane-8B-Della-calc.json similarity index 100% rename from data/djuna_L3.1-Promissum_Mane-8B-Della-calc.json rename to data/models/djuna_L3.1-Promissum_Mane-8B-Della-calc.json diff --git a/data/djuna_L3.1-Purosani-2-8B.json b/data/models/djuna_L3.1-Purosani-2-8B.json similarity index 100% rename from data/djuna_L3.1-Purosani-2-8B.json rename to data/models/djuna_L3.1-Purosani-2-8B.json diff --git a/data/djuna_L3.1-Suze-Vume-calc.json b/data/models/djuna_L3.1-Suze-Vume-calc.json similarity index 100% rename from data/djuna_L3.1-Suze-Vume-calc.json rename to data/models/djuna_L3.1-Suze-Vume-calc.json diff --git a/data/djuna_MN-Chinofun-12B-2.json b/data/models/djuna_MN-Chinofun-12B-2.json similarity index 100% rename from data/djuna_MN-Chinofun-12B-2.json rename to data/models/djuna_MN-Chinofun-12B-2.json diff --git a/data/djuna_MN-Chinofun-12B-3.json b/data/models/djuna_MN-Chinofun-12B-3.json similarity index 100% rename from data/djuna_MN-Chinofun-12B-3.json rename to data/models/djuna_MN-Chinofun-12B-3.json diff --git a/data/djuna_MN-Chinofun-12B-4.json b/data/models/djuna_MN-Chinofun-12B-4.json similarity index 100% rename from data/djuna_MN-Chinofun-12B-4.json rename to data/models/djuna_MN-Chinofun-12B-4.json diff --git a/data/djuna_MN-Chinofun.json b/data/models/djuna_MN-Chinofun.json similarity index 100% rename from data/djuna_MN-Chinofun.json rename to data/models/djuna_MN-Chinofun.json diff --git a/data/djuna_Q2.5-Partron-7B.json b/data/models/djuna_Q2.5-Partron-7B.json similarity index 100% rename from data/djuna_Q2.5-Partron-7B.json rename to data/models/djuna_Q2.5-Partron-7B.json diff --git a/data/djuna_Q2.5-Veltha-14B-0.5.json b/data/models/djuna_Q2.5-Veltha-14B-0.5.json similarity index 100% rename from data/djuna_Q2.5-Veltha-14B-0.5.json rename to data/models/djuna_Q2.5-Veltha-14B-0.5.json diff --git a/data/djuna_Q2.5-Veltha-14B.json b/data/models/djuna_Q2.5-Veltha-14B.json similarity index 100% rename from data/djuna_Q2.5-Veltha-14B.json rename to data/models/djuna_Q2.5-Veltha-14B.json diff --git a/data/dnhkng_RYS-Llama-3-8B-Instruct.json b/data/models/dnhkng_RYS-Llama-3-8B-Instruct.json similarity index 100% rename from data/dnhkng_RYS-Llama-3-8B-Instruct.json rename to data/models/dnhkng_RYS-Llama-3-8B-Instruct.json diff --git a/data/dnhkng_RYS-Llama-3-Huge-Instruct.json b/data/models/dnhkng_RYS-Llama-3-Huge-Instruct.json similarity index 100% rename from data/dnhkng_RYS-Llama-3-Huge-Instruct.json rename to data/models/dnhkng_RYS-Llama-3-Huge-Instruct.json diff --git a/data/dnhkng_RYS-Llama-3-Large-Instruct.json b/data/models/dnhkng_RYS-Llama-3-Large-Instruct.json similarity index 100% rename from data/dnhkng_RYS-Llama-3-Large-Instruct.json rename to data/models/dnhkng_RYS-Llama-3-Large-Instruct.json diff --git a/data/dnhkng_RYS-Llama-3.1-8B-Instruct.json b/data/models/dnhkng_RYS-Llama-3.1-8B-Instruct.json similarity index 100% rename from data/dnhkng_RYS-Llama-3.1-8B-Instruct.json rename to data/models/dnhkng_RYS-Llama-3.1-8B-Instruct.json diff --git a/data/dnhkng_RYS-Llama3.1-Large.json b/data/models/dnhkng_RYS-Llama3.1-Large.json similarity index 100% rename from data/dnhkng_RYS-Llama3.1-Large.json rename to data/models/dnhkng_RYS-Llama3.1-Large.json diff --git a/data/dnhkng_RYS-Medium.json b/data/models/dnhkng_RYS-Medium.json similarity index 100% rename from data/dnhkng_RYS-Medium.json rename to data/models/dnhkng_RYS-Medium.json diff --git a/data/dnhkng_RYS-Phi-3-medium-4k-instruct.json b/data/models/dnhkng_RYS-Phi-3-medium-4k-instruct.json similarity index 100% rename from data/dnhkng_RYS-Phi-3-medium-4k-instruct.json rename to data/models/dnhkng_RYS-Phi-3-medium-4k-instruct.json diff --git a/data/dnhkng_RYS-XLarge-base.json b/data/models/dnhkng_RYS-XLarge-base.json similarity index 100% rename from data/dnhkng_RYS-XLarge-base.json rename to data/models/dnhkng_RYS-XLarge-base.json diff --git a/data/dnhkng_RYS-XLarge.json b/data/models/dnhkng_RYS-XLarge.json similarity index 100% rename from data/dnhkng_RYS-XLarge.json rename to data/models/dnhkng_RYS-XLarge.json diff --git a/data/dnhkng_RYS-XLarge2.json b/data/models/dnhkng_RYS-XLarge2.json similarity index 100% rename from data/dnhkng_RYS-XLarge2.json rename to data/models/dnhkng_RYS-XLarge2.json diff --git a/data/dreamgen_WizardLM-2-7B.json b/data/models/dreamgen_WizardLM-2-7B.json similarity index 100% rename from data/dreamgen_WizardLM-2-7B.json rename to data/models/dreamgen_WizardLM-2-7B.json diff --git a/data/dustinwloring1988_Reflexis-8b-chat-v1.json b/data/models/dustinwloring1988_Reflexis-8b-chat-v1.json similarity index 100% rename from data/dustinwloring1988_Reflexis-8b-chat-v1.json rename to data/models/dustinwloring1988_Reflexis-8b-chat-v1.json diff --git a/data/dustinwloring1988_Reflexis-8b-chat-v2.json b/data/models/dustinwloring1988_Reflexis-8b-chat-v2.json similarity index 100% rename from data/dustinwloring1988_Reflexis-8b-chat-v2.json rename to data/models/dustinwloring1988_Reflexis-8b-chat-v2.json diff --git a/data/dustinwloring1988_Reflexis-8b-chat-v3.json b/data/models/dustinwloring1988_Reflexis-8b-chat-v3.json similarity index 100% rename from data/dustinwloring1988_Reflexis-8b-chat-v3.json rename to data/models/dustinwloring1988_Reflexis-8b-chat-v3.json diff --git a/data/dustinwloring1988_Reflexis-8b-chat-v4.json b/data/models/dustinwloring1988_Reflexis-8b-chat-v4.json similarity index 100% rename from data/dustinwloring1988_Reflexis-8b-chat-v4.json rename to data/models/dustinwloring1988_Reflexis-8b-chat-v4.json diff --git a/data/dustinwloring1988_Reflexis-8b-chat-v5.json b/data/models/dustinwloring1988_Reflexis-8b-chat-v5.json similarity index 100% rename from data/dustinwloring1988_Reflexis-8b-chat-v5.json rename to data/models/dustinwloring1988_Reflexis-8b-chat-v5.json diff --git a/data/dustinwloring1988_Reflexis-8b-chat-v6.json b/data/models/dustinwloring1988_Reflexis-8b-chat-v6.json similarity index 100% rename from data/dustinwloring1988_Reflexis-8b-chat-v6.json rename to data/models/dustinwloring1988_Reflexis-8b-chat-v6.json diff --git a/data/dustinwloring1988_Reflexis-8b-chat-v7.json b/data/models/dustinwloring1988_Reflexis-8b-chat-v7.json similarity index 100% rename from data/dustinwloring1988_Reflexis-8b-chat-v7.json rename to data/models/dustinwloring1988_Reflexis-8b-chat-v7.json diff --git a/data/duyhv1411_Llama-3.2-1B-en-vi.json b/data/models/duyhv1411_Llama-3.2-1B-en-vi.json similarity index 100% rename from data/duyhv1411_Llama-3.2-1B-en-vi.json rename to data/models/duyhv1411_Llama-3.2-1B-en-vi.json diff --git a/data/duyhv1411_Llama-3.2-3B-en-vi.json b/data/models/duyhv1411_Llama-3.2-3B-en-vi.json similarity index 100% rename from data/duyhv1411_Llama-3.2-3B-en-vi.json rename to data/models/duyhv1411_Llama-3.2-3B-en-vi.json diff --git a/data/dwikitheduck_gemma-2-2b-id-inst.json b/data/models/dwikitheduck_gemma-2-2b-id-inst.json similarity index 100% rename from data/dwikitheduck_gemma-2-2b-id-inst.json rename to data/models/dwikitheduck_gemma-2-2b-id-inst.json diff --git a/data/dwikitheduck_gemma-2-2b-id-instruct.json b/data/models/dwikitheduck_gemma-2-2b-id-instruct.json similarity index 100% rename from data/dwikitheduck_gemma-2-2b-id-instruct.json rename to data/models/dwikitheduck_gemma-2-2b-id-instruct.json diff --git a/data/dwikitheduck_gemma-2-2b-id.json b/data/models/dwikitheduck_gemma-2-2b-id.json similarity index 100% rename from data/dwikitheduck_gemma-2-2b-id.json rename to data/models/dwikitheduck_gemma-2-2b-id.json diff --git a/data/dwikitheduck_gen-inst-1.json b/data/models/dwikitheduck_gen-inst-1.json similarity index 100% rename from data/dwikitheduck_gen-inst-1.json rename to data/models/dwikitheduck_gen-inst-1.json diff --git a/data/dwikitheduck_gen-try1-notemp.json b/data/models/dwikitheduck_gen-try1-notemp.json similarity index 100% rename from data/dwikitheduck_gen-try1-notemp.json rename to data/models/dwikitheduck_gen-try1-notemp.json diff --git a/data/dwikitheduck_gen-try1.json b/data/models/dwikitheduck_gen-try1.json similarity index 100% rename from data/dwikitheduck_gen-try1.json rename to data/models/dwikitheduck_gen-try1.json diff --git a/data/dzakwan_dzakwan-MoE-4x7b-Beta.json b/data/models/dzakwan_dzakwan-MoE-4x7b-Beta.json similarity index 100% rename from data/dzakwan_dzakwan-MoE-4x7b-Beta.json rename to data/models/dzakwan_dzakwan-MoE-4x7b-Beta.json diff --git a/data/ehristoforu_Falcon3-8B-Franken-Basestruct.json b/data/models/ehristoforu_Falcon3-8B-Franken-Basestruct.json similarity index 100% rename from data/ehristoforu_Falcon3-8B-Franken-Basestruct.json rename to data/models/ehristoforu_Falcon3-8B-Franken-Basestruct.json diff --git a/data/ehristoforu_Falcon3-MoE-2x7B-Insruct.json b/data/models/ehristoforu_Falcon3-MoE-2x7B-Insruct.json similarity index 100% rename from data/ehristoforu_Falcon3-MoE-2x7B-Insruct.json rename to data/models/ehristoforu_Falcon3-MoE-2x7B-Insruct.json diff --git a/data/ehristoforu_Gemma2-9B-it-psy10k-mental_health.json b/data/models/ehristoforu_Gemma2-9B-it-psy10k-mental_health.json similarity index 100% rename from data/ehristoforu_Gemma2-9B-it-psy10k-mental_health.json rename to data/models/ehristoforu_Gemma2-9B-it-psy10k-mental_health.json diff --git a/data/ehristoforu_Gemma2-9b-it-train6.json b/data/models/ehristoforu_Gemma2-9b-it-train6.json similarity index 100% rename from data/ehristoforu_Gemma2-9b-it-train6.json rename to data/models/ehristoforu_Gemma2-9b-it-train6.json diff --git a/data/ehristoforu_HappyLlama1.json b/data/models/ehristoforu_HappyLlama1.json similarity index 100% rename from data/ehristoforu_HappyLlama1.json rename to data/models/ehristoforu_HappyLlama1.json diff --git a/data/ehristoforu_QwenQwen2.5-7B-IT-Dare.json b/data/models/ehristoforu_QwenQwen2.5-7B-IT-Dare.json similarity index 100% rename from data/ehristoforu_QwenQwen2.5-7B-IT-Dare.json rename to data/models/ehristoforu_QwenQwen2.5-7B-IT-Dare.json diff --git a/data/ehristoforu_QwenQwen2.5-7B-IT.json b/data/models/ehristoforu_QwenQwen2.5-7B-IT.json similarity index 100% rename from data/ehristoforu_QwenQwen2.5-7B-IT.json rename to data/models/ehristoforu_QwenQwen2.5-7B-IT.json diff --git a/data/ehristoforu_RQwen-v0.1.json b/data/models/ehristoforu_RQwen-v0.1.json similarity index 100% rename from data/ehristoforu_RQwen-v0.1.json rename to data/models/ehristoforu_RQwen-v0.1.json diff --git a/data/ehristoforu_RQwen-v0.2.json b/data/models/ehristoforu_RQwen-v0.2.json similarity index 100% rename from data/ehristoforu_RQwen-v0.2.json rename to data/models/ehristoforu_RQwen-v0.2.json diff --git a/data/ehristoforu_SoRu-0009.json b/data/models/ehristoforu_SoRu-0009.json similarity index 100% rename from data/ehristoforu_SoRu-0009.json rename to data/models/ehristoforu_SoRu-0009.json diff --git a/data/ehristoforu_coolqwen-3b-it.json b/data/models/ehristoforu_coolqwen-3b-it.json similarity index 100% rename from data/ehristoforu_coolqwen-3b-it.json rename to data/models/ehristoforu_coolqwen-3b-it.json diff --git a/data/ehristoforu_della-70b-test-v1.json b/data/models/ehristoforu_della-70b-test-v1.json similarity index 100% rename from data/ehristoforu_della-70b-test-v1.json rename to data/models/ehristoforu_della-70b-test-v1.json diff --git a/data/ehristoforu_falcon3-ultraset.json b/data/models/ehristoforu_falcon3-ultraset.json similarity index 100% rename from data/ehristoforu_falcon3-ultraset.json rename to data/models/ehristoforu_falcon3-ultraset.json diff --git a/data/ehristoforu_fd-lora-merged-16x32.json b/data/models/ehristoforu_fd-lora-merged-16x32.json similarity index 100% rename from data/ehristoforu_fd-lora-merged-16x32.json rename to data/models/ehristoforu_fd-lora-merged-16x32.json diff --git a/data/ehristoforu_fd-lora-merged-64x128.json b/data/models/ehristoforu_fd-lora-merged-64x128.json similarity index 100% rename from data/ehristoforu_fd-lora-merged-64x128.json rename to data/models/ehristoforu_fd-lora-merged-64x128.json diff --git a/data/ehristoforu_fp4-14b-it-v1.json b/data/models/ehristoforu_fp4-14b-it-v1.json similarity index 100% rename from data/ehristoforu_fp4-14b-it-v1.json rename to data/models/ehristoforu_fp4-14b-it-v1.json diff --git a/data/ehristoforu_fp4-14b-v1-fix.json b/data/models/ehristoforu_fp4-14b-v1-fix.json similarity index 100% rename from data/ehristoforu_fp4-14b-v1-fix.json rename to data/models/ehristoforu_fp4-14b-v1-fix.json diff --git a/data/ehristoforu_fq2.5-7b-it-normalize_false.json b/data/models/ehristoforu_fq2.5-7b-it-normalize_false.json similarity index 100% rename from data/ehristoforu_fq2.5-7b-it-normalize_false.json rename to data/models/ehristoforu_fq2.5-7b-it-normalize_false.json diff --git a/data/ehristoforu_fq2.5-7b-it-normalize_true.json b/data/models/ehristoforu_fq2.5-7b-it-normalize_true.json similarity index 100% rename from data/ehristoforu_fq2.5-7b-it-normalize_true.json rename to data/models/ehristoforu_fq2.5-7b-it-normalize_true.json diff --git a/data/ehristoforu_frqwen2.5-from7b-duable4layers-it.json b/data/models/ehristoforu_frqwen2.5-from7b-duable4layers-it.json similarity index 100% rename from data/ehristoforu_frqwen2.5-from7b-duable4layers-it.json rename to data/models/ehristoforu_frqwen2.5-from7b-duable4layers-it.json diff --git a/data/ehristoforu_frqwen2.5-from7b-it.json b/data/models/ehristoforu_frqwen2.5-from7b-it.json similarity index 100% rename from data/ehristoforu_frqwen2.5-from7b-it.json rename to data/models/ehristoforu_frqwen2.5-from7b-it.json diff --git a/data/ehristoforu_mllama-3.1-8b-instruct.json b/data/models/ehristoforu_mllama-3.1-8b-instruct.json similarity index 100% rename from data/ehristoforu_mllama-3.1-8b-instruct.json rename to data/models/ehristoforu_mllama-3.1-8b-instruct.json diff --git a/data/ehristoforu_mllama-3.1-8b-it.json b/data/models/ehristoforu_mllama-3.1-8b-it.json similarity index 100% rename from data/ehristoforu_mllama-3.1-8b-it.json rename to data/models/ehristoforu_mllama-3.1-8b-it.json diff --git a/data/ehristoforu_moremerge-upscaled.json b/data/models/ehristoforu_moremerge-upscaled.json similarity index 100% rename from data/ehristoforu_moremerge-upscaled.json rename to data/models/ehristoforu_moremerge-upscaled.json diff --git a/data/ehristoforu_moremerge.json b/data/models/ehristoforu_moremerge.json similarity index 100% rename from data/ehristoforu_moremerge.json rename to data/models/ehristoforu_moremerge.json diff --git a/data/ehristoforu_phi-4-25b.json b/data/models/ehristoforu_phi-4-25b.json similarity index 100% rename from data/ehristoforu_phi-4-25b.json rename to data/models/ehristoforu_phi-4-25b.json diff --git a/data/ehristoforu_qwen2.5-test-32b-it.json b/data/models/ehristoforu_qwen2.5-test-32b-it.json similarity index 100% rename from data/ehristoforu_qwen2.5-test-32b-it.json rename to data/models/ehristoforu_qwen2.5-test-32b-it.json diff --git a/data/ehristoforu_qwen2.5-with-lora-think-3b-it.json b/data/models/ehristoforu_qwen2.5-with-lora-think-3b-it.json similarity index 100% rename from data/ehristoforu_qwen2.5-with-lora-think-3b-it.json rename to data/models/ehristoforu_qwen2.5-with-lora-think-3b-it.json diff --git a/data/ehristoforu_rmoe-v1.json b/data/models/ehristoforu_rmoe-v1.json similarity index 100% rename from data/ehristoforu_rmoe-v1.json rename to data/models/ehristoforu_rmoe-v1.json diff --git a/data/ehristoforu_rufalcon3-3b-it.json b/data/models/ehristoforu_rufalcon3-3b-it.json similarity index 100% rename from data/ehristoforu_rufalcon3-3b-it.json rename to data/models/ehristoforu_rufalcon3-3b-it.json diff --git a/data/ehristoforu_ruphi-4b.json b/data/models/ehristoforu_ruphi-4b.json similarity index 100% rename from data/ehristoforu_ruphi-4b.json rename to data/models/ehristoforu_ruphi-4b.json diff --git a/data/ehristoforu_testq-32b.json b/data/models/ehristoforu_testq-32b.json similarity index 100% rename from data/ehristoforu_testq-32b.json rename to data/models/ehristoforu_testq-32b.json diff --git a/data/ehristoforu_tmoe-v2.json b/data/models/ehristoforu_tmoe-v2.json similarity index 100% rename from data/ehristoforu_tmoe-v2.json rename to data/models/ehristoforu_tmoe-v2.json diff --git a/data/ehristoforu_tmoe.json b/data/models/ehristoforu_tmoe.json similarity index 100% rename from data/ehristoforu_tmoe.json rename to data/models/ehristoforu_tmoe.json diff --git a/data/ehristoforu_trd-7b-it.json b/data/models/ehristoforu_trd-7b-it.json similarity index 100% rename from data/ehristoforu_trd-7b-it.json rename to data/models/ehristoforu_trd-7b-it.json diff --git a/data/ehristoforu_ud-14b.json b/data/models/ehristoforu_ud-14b.json similarity index 100% rename from data/ehristoforu_ud-14b.json rename to data/models/ehristoforu_ud-14b.json diff --git a/data/eleutherai_Pythia-12B.json b/data/models/eleutherai_Pythia-12B.json similarity index 100% rename from data/eleutherai_Pythia-12B.json rename to data/models/eleutherai_Pythia-12B.json diff --git a/data/eleutherai_Pythia-6.9B.json b/data/models/eleutherai_Pythia-6.9B.json similarity index 100% rename from data/eleutherai_Pythia-6.9B.json rename to data/models/eleutherai_Pythia-6.9B.json diff --git a/data/elinas_Chronos-Gold-12B-1.0.json b/data/models/elinas_Chronos-Gold-12B-1.0.json similarity index 100% rename from data/elinas_Chronos-Gold-12B-1.0.json rename to data/models/elinas_Chronos-Gold-12B-1.0.json diff --git a/data/ell44ot_gemma-2b-def.json b/data/models/ell44ot_gemma-2b-def.json similarity index 100% rename from data/ell44ot_gemma-2b-def.json rename to data/models/ell44ot_gemma-2b-def.json diff --git a/data/euclaise_ReMask-3B.json b/data/models/euclaise_ReMask-3B.json similarity index 100% rename from data/euclaise_ReMask-3B.json rename to data/models/euclaise_ReMask-3B.json diff --git a/data/eworojoshua_vas-01.json b/data/models/eworojoshua_vas-01.json similarity index 100% rename from data/eworojoshua_vas-01.json rename to data/models/eworojoshua_vas-01.json diff --git a/data/ewre324_Thinker-Llama-3.2-3B-Instruct-Reasoning.json b/data/models/ewre324_Thinker-Llama-3.2-3B-Instruct-Reasoning.json similarity index 100% rename from data/ewre324_Thinker-Llama-3.2-3B-Instruct-Reasoning.json rename to data/models/ewre324_Thinker-Llama-3.2-3B-Instruct-Reasoning.json diff --git a/data/ewre324_Thinker-Qwen2.5-0.5B-Instruct-Reasoning.json b/data/models/ewre324_Thinker-Qwen2.5-0.5B-Instruct-Reasoning.json similarity index 100% rename from data/ewre324_Thinker-Qwen2.5-0.5B-Instruct-Reasoning.json rename to data/models/ewre324_Thinker-Qwen2.5-0.5B-Instruct-Reasoning.json diff --git a/data/ewre324_Thinker-SmolLM2-135M-Instruct-Reasoning.json b/data/models/ewre324_Thinker-SmolLM2-135M-Instruct-Reasoning.json similarity index 100% rename from data/ewre324_Thinker-SmolLM2-135M-Instruct-Reasoning.json rename to data/models/ewre324_Thinker-SmolLM2-135M-Instruct-Reasoning.json diff --git a/data/ewre324_ewre324-R1-SmolLM2-135M-Distill.json b/data/models/ewre324_ewre324-R1-SmolLM2-135M-Distill.json similarity index 100% rename from data/ewre324_ewre324-R1-SmolLM2-135M-Distill.json rename to data/models/ewre324_ewre324-R1-SmolLM2-135M-Distill.json diff --git a/data/experiment-llm_exp-3-q-r.json b/data/models/experiment-llm_exp-3-q-r.json similarity index 100% rename from data/experiment-llm_exp-3-q-r.json rename to data/models/experiment-llm_exp-3-q-r.json diff --git a/data/facebook_Self-taught-Llama-3-70B.json b/data/models/facebook_Self-taught-Llama-3-70B.json similarity index 100% rename from data/facebook_Self-taught-Llama-3-70B.json rename to data/models/facebook_Self-taught-Llama-3-70B.json diff --git a/data/facebook_Self-taught-evaluator-llama3.1-70B.json b/data/models/facebook_Self-taught-evaluator-llama3.1-70B.json similarity index 100% rename from data/facebook_Self-taught-evaluator-llama3.1-70B.json rename to data/models/facebook_Self-taught-evaluator-llama3.1-70B.json diff --git a/data/facebook_opt-1.3b.json b/data/models/facebook_opt-1.3b.json similarity index 100% rename from data/facebook_opt-1.3b.json rename to data/models/facebook_opt-1.3b.json diff --git a/data/facebook_opt-30b.json b/data/models/facebook_opt-30b.json similarity index 100% rename from data/facebook_opt-30b.json rename to data/models/facebook_opt-30b.json diff --git a/data/failspy_Llama-3-8B-Instruct-MopeyMule.json b/data/models/failspy_Llama-3-8B-Instruct-MopeyMule.json similarity index 100% rename from data/failspy_Llama-3-8B-Instruct-MopeyMule.json rename to data/models/failspy_Llama-3-8B-Instruct-MopeyMule.json diff --git a/data/failspy_Llama-3-8B-Instruct-abliterated.json b/data/models/failspy_Llama-3-8B-Instruct-abliterated.json similarity index 100% rename from data/failspy_Llama-3-8B-Instruct-abliterated.json rename to data/models/failspy_Llama-3-8B-Instruct-abliterated.json diff --git a/data/failspy_Meta-Llama-3-70B-Instruct-abliterated-v3.5.json b/data/models/failspy_Meta-Llama-3-70B-Instruct-abliterated-v3.5.json similarity index 100% rename from data/failspy_Meta-Llama-3-70B-Instruct-abliterated-v3.5.json rename to data/models/failspy_Meta-Llama-3-70B-Instruct-abliterated-v3.5.json diff --git a/data/failspy_Meta-Llama-3-8B-Instruct-abliterated-v3.json b/data/models/failspy_Meta-Llama-3-8B-Instruct-abliterated-v3.json similarity index 100% rename from data/failspy_Meta-Llama-3-8B-Instruct-abliterated-v3.json rename to data/models/failspy_Meta-Llama-3-8B-Instruct-abliterated-v3.json diff --git a/data/failspy_Phi-3-medium-4k-instruct-abliterated-v3.json b/data/models/failspy_Phi-3-medium-4k-instruct-abliterated-v3.json similarity index 100% rename from data/failspy_Phi-3-medium-4k-instruct-abliterated-v3.json rename to data/models/failspy_Phi-3-medium-4k-instruct-abliterated-v3.json diff --git a/data/failspy_llama-3-70B-Instruct-abliterated.json b/data/models/failspy_llama-3-70B-Instruct-abliterated.json similarity index 100% rename from data/failspy_llama-3-70B-Instruct-abliterated.json rename to data/models/failspy_llama-3-70B-Instruct-abliterated.json diff --git a/data/fblgit_TheBeagle-v2beta-32B-MGS.json b/data/models/fblgit_TheBeagle-v2beta-32B-MGS.json similarity index 99% rename from data/fblgit_TheBeagle-v2beta-32B-MGS.json rename to data/models/fblgit_TheBeagle-v2beta-32B-MGS.json index 57e8a0c9c4547329f51827d27aa7733799780c1a..be17afdcfffac7cd8267c81de014533270b46d3f 100644 --- a/data/fblgit_TheBeagle-v2beta-32B-MGS.json +++ b/data/models/fblgit_TheBeagle-v2beta-32B-MGS.json @@ -5,7 +5,7 @@ "developer": "fblgit", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "Qwen2ForCausalLM", "params_billions": "32.764" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4503 + "score": 0.5181 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7035 + "score": 0.7033 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3943 + "score": 0.4947 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.401 + "score": 0.3826 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5021 + "score": 0.5008 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5911 + "score": 0.5915 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5181 + "score": 0.4503 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7033 + "score": 0.7035 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4947 + "score": 0.3943 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3826 + "score": 0.401 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5008 + "score": 0.5021 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5915 + "score": 0.5911 } } ], diff --git a/data/fblgit_UNA-SimpleSmaug-34b-v1beta.json b/data/models/fblgit_UNA-SimpleSmaug-34b-v1beta.json similarity index 100% rename from data/fblgit_UNA-SimpleSmaug-34b-v1beta.json rename to data/models/fblgit_UNA-SimpleSmaug-34b-v1beta.json diff --git a/data/fblgit_UNA-TheBeagle-7b-v1.json b/data/models/fblgit_UNA-TheBeagle-7b-v1.json similarity index 100% rename from data/fblgit_UNA-TheBeagle-7b-v1.json rename to data/models/fblgit_UNA-TheBeagle-7b-v1.json diff --git a/data/fblgit_UNA-ThePitbull-21.4B-v2.json b/data/models/fblgit_UNA-ThePitbull-21.4B-v2.json similarity index 100% rename from data/fblgit_UNA-ThePitbull-21.4B-v2.json rename to data/models/fblgit_UNA-ThePitbull-21.4B-v2.json diff --git a/data/fblgit_cybertron-v4-qw7B-MGS.json b/data/models/fblgit_cybertron-v4-qw7B-MGS.json similarity index 100% rename from data/fblgit_cybertron-v4-qw7B-MGS.json rename to data/models/fblgit_cybertron-v4-qw7B-MGS.json diff --git a/data/fblgit_cybertron-v4-qw7B-UNAMGS.json b/data/models/fblgit_cybertron-v4-qw7B-UNAMGS.json similarity index 100% rename from data/fblgit_cybertron-v4-qw7B-UNAMGS.json rename to data/models/fblgit_cybertron-v4-qw7B-UNAMGS.json diff --git a/data/fblgit_juanako-7b-UNA.json b/data/models/fblgit_juanako-7b-UNA.json similarity index 100% rename from data/fblgit_juanako-7b-UNA.json rename to data/models/fblgit_juanako-7b-UNA.json diff --git a/data/fblgit_miniclaus-qw1.5B-UNAMGS-GRPO.json b/data/models/fblgit_miniclaus-qw1.5B-UNAMGS-GRPO.json similarity index 100% rename from data/fblgit_miniclaus-qw1.5B-UNAMGS-GRPO.json rename to data/models/fblgit_miniclaus-qw1.5B-UNAMGS-GRPO.json diff --git a/data/fblgit_miniclaus-qw1.5B-UNAMGS.json b/data/models/fblgit_miniclaus-qw1.5B-UNAMGS.json similarity index 100% rename from data/fblgit_miniclaus-qw1.5B-UNAMGS.json rename to data/models/fblgit_miniclaus-qw1.5B-UNAMGS.json diff --git a/data/fblgit_pancho-v1-qw25-3B-UNAMGS.json b/data/models/fblgit_pancho-v1-qw25-3B-UNAMGS.json similarity index 100% rename from data/fblgit_pancho-v1-qw25-3B-UNAMGS.json rename to data/models/fblgit_pancho-v1-qw25-3B-UNAMGS.json diff --git a/data/fblgit_una-cybertron-7b-v2-bf16.json b/data/models/fblgit_una-cybertron-7b-v2-bf16.json similarity index 100% rename from data/fblgit_una-cybertron-7b-v2-bf16.json rename to data/models/fblgit_una-cybertron-7b-v2-bf16.json diff --git a/data/fhai50032_RolePlayLake-7B.json b/data/models/fhai50032_RolePlayLake-7B.json similarity index 100% rename from data/fhai50032_RolePlayLake-7B.json rename to data/models/fhai50032_RolePlayLake-7B.json diff --git a/data/fhai50032_Unaligned-Thinker-PHI-4.json b/data/models/fhai50032_Unaligned-Thinker-PHI-4.json similarity index 100% rename from data/fhai50032_Unaligned-Thinker-PHI-4.json rename to data/models/fhai50032_Unaligned-Thinker-PHI-4.json diff --git a/data/flammenai_Llama3.1-Flammades-70B.json b/data/models/flammenai_Llama3.1-Flammades-70B.json similarity index 100% rename from data/flammenai_Llama3.1-Flammades-70B.json rename to data/models/flammenai_Llama3.1-Flammades-70B.json diff --git a/data/flammenai_Mahou-1.2a-llama3-8B.json b/data/models/flammenai_Mahou-1.2a-llama3-8B.json similarity index 100% rename from data/flammenai_Mahou-1.2a-llama3-8B.json rename to data/models/flammenai_Mahou-1.2a-llama3-8B.json diff --git a/data/flammenai_Mahou-1.2a-mistral-7B.json b/data/models/flammenai_Mahou-1.2a-mistral-7B.json similarity index 100% rename from data/flammenai_Mahou-1.2a-mistral-7B.json rename to data/models/flammenai_Mahou-1.2a-mistral-7B.json diff --git a/data/flammenai_Mahou-1.5-llama3.1-70B.json b/data/models/flammenai_Mahou-1.5-llama3.1-70B.json similarity index 100% rename from data/flammenai_Mahou-1.5-llama3.1-70B.json rename to data/models/flammenai_Mahou-1.5-llama3.1-70B.json diff --git a/data/flammenai_Mahou-1.5-mistral-nemo-12B.json b/data/models/flammenai_Mahou-1.5-mistral-nemo-12B.json similarity index 100% rename from data/flammenai_Mahou-1.5-mistral-nemo-12B.json rename to data/models/flammenai_Mahou-1.5-mistral-nemo-12B.json diff --git a/data/flammenai_flammen15-gutenberg-DPO-v1-7B.json b/data/models/flammenai_flammen15-gutenberg-DPO-v1-7B.json similarity index 100% rename from data/flammenai_flammen15-gutenberg-DPO-v1-7B.json rename to data/models/flammenai_flammen15-gutenberg-DPO-v1-7B.json diff --git a/data/fluently-lm_FluentlyLM-Prinum.json b/data/models/fluently-lm_FluentlyLM-Prinum.json similarity index 100% rename from data/fluently-lm_FluentlyLM-Prinum.json rename to data/models/fluently-lm_FluentlyLM-Prinum.json diff --git a/data/fluently-lm_Llama-TI-8B-Instruct.json b/data/models/fluently-lm_Llama-TI-8B-Instruct.json similarity index 100% rename from data/fluently-lm_Llama-TI-8B-Instruct.json rename to data/models/fluently-lm_Llama-TI-8B-Instruct.json diff --git a/data/fluently-lm_Llama-TI-8B.json b/data/models/fluently-lm_Llama-TI-8B.json similarity index 100% rename from data/fluently-lm_Llama-TI-8B.json rename to data/models/fluently-lm_Llama-TI-8B.json diff --git a/data/fluently-sets_FalconThink3-10B-IT.json b/data/models/fluently-sets_FalconThink3-10B-IT.json similarity index 100% rename from data/fluently-sets_FalconThink3-10B-IT.json rename to data/models/fluently-sets_FalconThink3-10B-IT.json diff --git a/data/fluently-sets_reasoning-1-1k-demo.json b/data/models/fluently-sets_reasoning-1-1k-demo.json similarity index 100% rename from data/fluently-sets_reasoning-1-1k-demo.json rename to data/models/fluently-sets_reasoning-1-1k-demo.json diff --git a/data/formulae_mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp.json b/data/models/formulae_mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp.json similarity index 100% rename from data/formulae_mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp.json rename to data/models/formulae_mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp.json diff --git a/data/formulae_mita-elite-v1.1-7b-2-25-2025.json b/data/models/formulae_mita-elite-v1.1-7b-2-25-2025.json similarity index 100% rename from data/formulae_mita-elite-v1.1-7b-2-25-2025.json rename to data/models/formulae_mita-elite-v1.1-7b-2-25-2025.json diff --git a/data/formulae_mita-elite-v1.1-gen2-7b-2-25-2025.json b/data/models/formulae_mita-elite-v1.1-gen2-7b-2-25-2025.json similarity index 100% rename from data/formulae_mita-elite-v1.1-gen2-7b-2-25-2025.json rename to data/models/formulae_mita-elite-v1.1-gen2-7b-2-25-2025.json diff --git a/data/formulae_mita-elite-v1.2-7b-2-26-2025.json b/data/models/formulae_mita-elite-v1.2-7b-2-26-2025.json similarity index 100% rename from data/formulae_mita-elite-v1.2-7b-2-26-2025.json rename to data/models/formulae_mita-elite-v1.2-7b-2-26-2025.json diff --git a/data/formulae_mita-gen3-7b-2-26-2025.json b/data/models/formulae_mita-gen3-7b-2-26-2025.json similarity index 100% rename from data/formulae_mita-gen3-7b-2-26-2025.json rename to data/models/formulae_mita-gen3-7b-2-26-2025.json diff --git a/data/formulae_mita-gen3-v1.2-7b-2-26-2025.json b/data/models/formulae_mita-gen3-v1.2-7b-2-26-2025.json similarity index 100% rename from data/formulae_mita-gen3-v1.2-7b-2-26-2025.json rename to data/models/formulae_mita-gen3-v1.2-7b-2-26-2025.json diff --git a/data/formulae_mita-math-v2.3-2-25-2025.json b/data/models/formulae_mita-math-v2.3-2-25-2025.json similarity index 100% rename from data/formulae_mita-math-v2.3-2-25-2025.json rename to data/models/formulae_mita-math-v2.3-2-25-2025.json diff --git a/data/formulae_mita-v1-7b.json b/data/models/formulae_mita-v1-7b.json similarity index 100% rename from data/formulae_mita-v1-7b.json rename to data/models/formulae_mita-v1-7b.json diff --git a/data/formulae_mita-v1.1-7b-2-24-2025.json b/data/models/formulae_mita-v1.1-7b-2-24-2025.json similarity index 100% rename from data/formulae_mita-v1.1-7b-2-24-2025.json rename to data/models/formulae_mita-v1.1-7b-2-24-2025.json diff --git a/data/formulae_mita-v1.2-7b-2-24-2025.json b/data/models/formulae_mita-v1.2-7b-2-24-2025.json similarity index 100% rename from data/formulae_mita-v1.2-7b-2-24-2025.json rename to data/models/formulae_mita-v1.2-7b-2-24-2025.json diff --git a/data/frameai_Loxa-4B.json b/data/models/frameai_Loxa-4B.json similarity index 100% rename from data/frameai_Loxa-4B.json rename to data/models/frameai_Loxa-4B.json diff --git a/data/freewheelin_free-evo-qwen72b-v0.8-re.json b/data/models/freewheelin_free-evo-qwen72b-v0.8-re.json similarity index 100% rename from data/freewheelin_free-evo-qwen72b-v0.8-re.json rename to data/models/freewheelin_free-evo-qwen72b-v0.8-re.json diff --git a/data/freewheelin_free-solar-evo-v0.1.json b/data/models/freewheelin_free-solar-evo-v0.1.json similarity index 100% rename from data/freewheelin_free-solar-evo-v0.1.json rename to data/models/freewheelin_free-solar-evo-v0.1.json diff --git a/data/freewheelin_free-solar-evo-v0.11.json b/data/models/freewheelin_free-solar-evo-v0.11.json similarity index 100% rename from data/freewheelin_free-solar-evo-v0.11.json rename to data/models/freewheelin_free-solar-evo-v0.11.json diff --git a/data/freewheelin_free-solar-evo-v0.13.json b/data/models/freewheelin_free-solar-evo-v0.13.json similarity index 100% rename from data/freewheelin_free-solar-evo-v0.13.json rename to data/models/freewheelin_free-solar-evo-v0.13.json diff --git a/data/fulim_FineLlama-3.1-8B.json b/data/models/fulim_FineLlama-3.1-8B.json similarity index 100% rename from data/fulim_FineLlama-3.1-8B.json rename to data/models/fulim_FineLlama-3.1-8B.json diff --git a/data/gabrielmbmb_SmolLM-1.7B-Instruct-IFEval.json b/data/models/gabrielmbmb_SmolLM-1.7B-Instruct-IFEval.json similarity index 100% rename from data/gabrielmbmb_SmolLM-1.7B-Instruct-IFEval.json rename to data/models/gabrielmbmb_SmolLM-1.7B-Instruct-IFEval.json diff --git a/data/gaverfraxz_Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA.json b/data/models/gaverfraxz_Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA.json similarity index 100% rename from data/gaverfraxz_Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA.json rename to data/models/gaverfraxz_Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA.json diff --git a/data/gaverfraxz_Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES.json b/data/models/gaverfraxz_Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES.json similarity index 100% rename from data/gaverfraxz_Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES.json rename to data/models/gaverfraxz_Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES.json diff --git a/data/gbueno86_Brinebreath-Llama-3.1-70B.json b/data/models/gbueno86_Brinebreath-Llama-3.1-70B.json similarity index 100% rename from data/gbueno86_Brinebreath-Llama-3.1-70B.json rename to data/models/gbueno86_Brinebreath-Llama-3.1-70B.json diff --git a/data/gbueno86_Meta-LLama-3-Cat-Smaug-LLama-70b.json b/data/models/gbueno86_Meta-LLama-3-Cat-Smaug-LLama-70b.json similarity index 100% rename from data/gbueno86_Meta-LLama-3-Cat-Smaug-LLama-70b.json rename to data/models/gbueno86_Meta-LLama-3-Cat-Smaug-LLama-70b.json diff --git a/data/gemini-1.5-flash-8b.json b/data/models/gemini-1.5-flash-8b.json similarity index 100% rename from data/gemini-1.5-flash-8b.json rename to data/models/gemini-1.5-flash-8b.json diff --git a/data/general-preference_GPM-Gemma-2B.json b/data/models/general-preference_GPM-Gemma-2B.json similarity index 100% rename from data/general-preference_GPM-Gemma-2B.json rename to data/models/general-preference_GPM-Gemma-2B.json diff --git a/data/general-preference_GPM-Llama-3.1-8B.json b/data/models/general-preference_GPM-Llama-3.1-8B.json similarity index 100% rename from data/general-preference_GPM-Llama-3.1-8B.json rename to data/models/general-preference_GPM-Llama-3.1-8B.json diff --git a/data/ghost-x_ghost-8b-beta-1608.json b/data/models/ghost-x_ghost-8b-beta-1608.json similarity index 100% rename from data/ghost-x_ghost-8b-beta-1608.json rename to data/models/ghost-x_ghost-8b-beta-1608.json diff --git a/data/glaiveai_Reflection-Llama-3.1-70B.json b/data/models/glaiveai_Reflection-Llama-3.1-70B.json similarity index 100% rename from data/glaiveai_Reflection-Llama-3.1-70B.json rename to data/models/glaiveai_Reflection-Llama-3.1-70B.json diff --git a/data/gmonsoon_SahabatAI-Llama-11B-Test.json b/data/models/gmonsoon_SahabatAI-Llama-11B-Test.json similarity index 100% rename from data/gmonsoon_SahabatAI-Llama-11B-Test.json rename to data/models/gmonsoon_SahabatAI-Llama-11B-Test.json diff --git a/data/gmonsoon_SahabatAI-MediChatIndo-8B-v1.json b/data/models/gmonsoon_SahabatAI-MediChatIndo-8B-v1.json similarity index 100% rename from data/gmonsoon_SahabatAI-MediChatIndo-8B-v1.json rename to data/models/gmonsoon_SahabatAI-MediChatIndo-8B-v1.json diff --git a/data/gmonsoon_SahabatAI-Rebase-8B-Test.json b/data/models/gmonsoon_SahabatAI-Rebase-8B-Test.json similarity index 100% rename from data/gmonsoon_SahabatAI-Rebase-8B-Test.json rename to data/models/gmonsoon_SahabatAI-Rebase-8B-Test.json diff --git a/data/gmonsoon_StockSeaLLMs-7B-v1.json b/data/models/gmonsoon_StockSeaLLMs-7B-v1.json similarity index 100% rename from data/gmonsoon_StockSeaLLMs-7B-v1.json rename to data/models/gmonsoon_StockSeaLLMs-7B-v1.json diff --git a/data/gmonsoon_gemma2-9b-sahabatai-v1-instruct-BaseTIES.json b/data/models/gmonsoon_gemma2-9b-sahabatai-v1-instruct-BaseTIES.json similarity index 100% rename from data/gmonsoon_gemma2-9b-sahabatai-v1-instruct-BaseTIES.json rename to data/models/gmonsoon_gemma2-9b-sahabatai-v1-instruct-BaseTIES.json diff --git a/data/godlikehhd_alpaca_data_full_2.json b/data/models/godlikehhd_alpaca_data_full_2.json similarity index 100% rename from data/godlikehhd_alpaca_data_full_2.json rename to data/models/godlikehhd_alpaca_data_full_2.json diff --git a/data/godlikehhd_alpaca_data_full_3B.json b/data/models/godlikehhd_alpaca_data_full_3B.json similarity index 100% rename from data/godlikehhd_alpaca_data_full_3B.json rename to data/models/godlikehhd_alpaca_data_full_3B.json diff --git a/data/godlikehhd_alpaca_data_ifd_max_2600.json b/data/models/godlikehhd_alpaca_data_ifd_max_2600.json similarity index 100% rename from data/godlikehhd_alpaca_data_ifd_max_2600.json rename to data/models/godlikehhd_alpaca_data_ifd_max_2600.json diff --git a/data/godlikehhd_alpaca_data_ifd_max_2600_3B.json b/data/models/godlikehhd_alpaca_data_ifd_max_2600_3B.json similarity index 100% rename from data/godlikehhd_alpaca_data_ifd_max_2600_3B.json rename to data/models/godlikehhd_alpaca_data_ifd_max_2600_3B.json diff --git a/data/godlikehhd_alpaca_data_ifd_me_max_5200.json b/data/models/godlikehhd_alpaca_data_ifd_me_max_5200.json similarity index 100% rename from data/godlikehhd_alpaca_data_ifd_me_max_5200.json rename to data/models/godlikehhd_alpaca_data_ifd_me_max_5200.json diff --git a/data/godlikehhd_alpaca_data_ifd_min_2600.json b/data/models/godlikehhd_alpaca_data_ifd_min_2600.json similarity index 100% rename from data/godlikehhd_alpaca_data_ifd_min_2600.json rename to data/models/godlikehhd_alpaca_data_ifd_min_2600.json diff --git a/data/godlikehhd_alpaca_data_ins_ans_max_5200.json b/data/models/godlikehhd_alpaca_data_ins_ans_max_5200.json similarity index 100% rename from data/godlikehhd_alpaca_data_ins_ans_max_5200.json rename to data/models/godlikehhd_alpaca_data_ins_ans_max_5200.json diff --git a/data/godlikehhd_alpaca_data_ins_max_5200.json b/data/models/godlikehhd_alpaca_data_ins_max_5200.json similarity index 100% rename from data/godlikehhd_alpaca_data_ins_max_5200.json rename to data/models/godlikehhd_alpaca_data_ins_max_5200.json diff --git a/data/godlikehhd_alpaca_data_ins_min_2600.json b/data/models/godlikehhd_alpaca_data_ins_min_2600.json similarity index 100% rename from data/godlikehhd_alpaca_data_ins_min_2600.json rename to data/models/godlikehhd_alpaca_data_ins_min_2600.json diff --git a/data/godlikehhd_alpaca_data_ins_min_5200.json b/data/models/godlikehhd_alpaca_data_ins_min_5200.json similarity index 100% rename from data/godlikehhd_alpaca_data_ins_min_5200.json rename to data/models/godlikehhd_alpaca_data_ins_min_5200.json diff --git a/data/godlikehhd_alpaca_data_sampled_ifd_5200.json b/data/models/godlikehhd_alpaca_data_sampled_ifd_5200.json similarity index 100% rename from data/godlikehhd_alpaca_data_sampled_ifd_5200.json rename to data/models/godlikehhd_alpaca_data_sampled_ifd_5200.json diff --git a/data/godlikehhd_alpaca_data_sampled_ifd_new_5200.json b/data/models/godlikehhd_alpaca_data_sampled_ifd_new_5200.json similarity index 100% rename from data/godlikehhd_alpaca_data_sampled_ifd_new_5200.json rename to data/models/godlikehhd_alpaca_data_sampled_ifd_new_5200.json diff --git a/data/godlikehhd_alpaca_data_score_max_0.1_2600.json b/data/models/godlikehhd_alpaca_data_score_max_0.1_2600.json similarity index 100% rename from data/godlikehhd_alpaca_data_score_max_0.1_2600.json rename to data/models/godlikehhd_alpaca_data_score_max_0.1_2600.json diff --git a/data/godlikehhd_alpaca_data_score_max_0.3_2600.json b/data/models/godlikehhd_alpaca_data_score_max_0.3_2600.json similarity index 100% rename from data/godlikehhd_alpaca_data_score_max_0.3_2600.json rename to data/models/godlikehhd_alpaca_data_score_max_0.3_2600.json diff --git a/data/godlikehhd_alpaca_data_score_max_0.7_2600.json b/data/models/godlikehhd_alpaca_data_score_max_0.7_2600.json similarity index 100% rename from data/godlikehhd_alpaca_data_score_max_0.7_2600.json rename to data/models/godlikehhd_alpaca_data_score_max_0.7_2600.json diff --git a/data/godlikehhd_alpaca_data_score_max_2500.json b/data/models/godlikehhd_alpaca_data_score_max_2500.json similarity index 100% rename from data/godlikehhd_alpaca_data_score_max_2500.json rename to data/models/godlikehhd_alpaca_data_score_max_2500.json diff --git a/data/godlikehhd_alpaca_data_score_max_2600_3B.json b/data/models/godlikehhd_alpaca_data_score_max_2600_3B.json similarity index 100% rename from data/godlikehhd_alpaca_data_score_max_2600_3B.json rename to data/models/godlikehhd_alpaca_data_score_max_2600_3B.json diff --git a/data/godlikehhd_alpaca_data_score_max_5200.json b/data/models/godlikehhd_alpaca_data_score_max_5200.json similarity index 100% rename from data/godlikehhd_alpaca_data_score_max_5200.json rename to data/models/godlikehhd_alpaca_data_score_max_5200.json diff --git a/data/godlikehhd_ifd_2500_qwen.json b/data/models/godlikehhd_ifd_2500_qwen.json similarity index 100% rename from data/godlikehhd_ifd_2500_qwen.json rename to data/models/godlikehhd_ifd_2500_qwen.json diff --git a/data/godlikehhd_ifd_new_correct_all_sample_2500_qwen.json b/data/models/godlikehhd_ifd_new_correct_all_sample_2500_qwen.json similarity index 100% rename from data/godlikehhd_ifd_new_correct_all_sample_2500_qwen.json rename to data/models/godlikehhd_ifd_new_correct_all_sample_2500_qwen.json diff --git a/data/godlikehhd_ifd_new_correct_sample_2500_qwen.json b/data/models/godlikehhd_ifd_new_correct_sample_2500_qwen.json similarity index 100% rename from data/godlikehhd_ifd_new_correct_sample_2500_qwen.json rename to data/models/godlikehhd_ifd_new_correct_sample_2500_qwen.json diff --git a/data/godlikehhd_ifd_new_qwen_2500.json b/data/models/godlikehhd_ifd_new_qwen_2500.json similarity index 100% rename from data/godlikehhd_ifd_new_qwen_2500.json rename to data/models/godlikehhd_ifd_new_qwen_2500.json diff --git a/data/godlikehhd_qwen-2.5-1.5b-cherry.json b/data/models/godlikehhd_qwen-2.5-1.5b-cherry.json similarity index 100% rename from data/godlikehhd_qwen-2.5-1.5b-cherry.json rename to data/models/godlikehhd_qwen-2.5-1.5b-cherry.json diff --git a/data/godlikehhd_qwen_2.5-1.5b-cherry_new.json b/data/models/godlikehhd_qwen_2.5-1.5b-cherry_new.json similarity index 100% rename from data/godlikehhd_qwen_2.5-1.5b-cherry_new.json rename to data/models/godlikehhd_qwen_2.5-1.5b-cherry_new.json diff --git a/data/godlikehhd_qwen_full_data_alpaca.json b/data/models/godlikehhd_qwen_full_data_alpaca.json similarity index 100% rename from data/godlikehhd_qwen_full_data_alpaca.json rename to data/models/godlikehhd_qwen_full_data_alpaca.json diff --git a/data/godlikehhd_qwen_ins_ans_2500.json b/data/models/godlikehhd_qwen_ins_ans_2500.json similarity index 100% rename from data/godlikehhd_qwen_ins_ans_2500.json rename to data/models/godlikehhd_qwen_ins_ans_2500.json diff --git a/data/google_Gemini_2.5_Flash.json b/data/models/google_Gemini_2.5_Flash.json similarity index 100% rename from data/google_Gemini_2.5_Flash.json rename to data/models/google_Gemini_2.5_Flash.json diff --git a/data/google_Gemini_2.5_Pro.json b/data/models/google_Gemini_2.5_Pro.json similarity index 100% rename from data/google_Gemini_2.5_Pro.json rename to data/models/google_Gemini_2.5_Pro.json diff --git a/data/google_Gemini_3.1_Pro.json b/data/models/google_Gemini_3.1_Pro.json similarity index 100% rename from data/google_Gemini_3.1_Pro.json rename to data/models/google_Gemini_3.1_Pro.json diff --git a/data/google_Gemini_3_Flash.json b/data/models/google_Gemini_3_Flash.json similarity index 100% rename from data/google_Gemini_3_Flash.json rename to data/models/google_Gemini_3_Flash.json index f3dcb98222f3a45bbec3325acba502991a42fcb5..a61c04322fdc41b3a4e6f1f4da89287f5d95639a 100644 --- a/data/google_Gemini_3_Flash.json +++ b/data/models/google_Gemini_3_Flash.json @@ -6,53 +6,6 @@ "inference_platform": "unknown" }, "evaluations": [ - { - "evaluation_id": "ace/google_gemini-3-flash/1773260200", - "retrieved_timestamp": "1773260200", - "source_metadata": { - "source_name": "Mercor ACE Leaderboard", - "source_type": "evaluation_run", - "source_organization_name": "Mercor", - "source_organization_url": "https://www.mercor.com", - "evaluator_relationship": "first_party" - }, - "eval_library": { - "name": "archipelago", - "version": "1.0.0" - }, - "benchmark": "ace", - "evaluation_results": [ - { - "evaluation_name": "Gaming Score", - "source_data": { - "dataset_name": "ace", - "source_type": "hf_dataset", - "hf_repo": "Mercor/ACE" - }, - "metric_config": { - "evaluation_description": "Gaming domain score.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.415 - }, - "generation_config": { - "additional_details": { - "run_setting": "High" - } - } - } - ], - "detailed_evaluation_results": null, - "generation_config": { - "additional_details": { - "run_setting": "High" - } - } - }, { "evaluation_id": "apex-agents/google_gemini-3-flash/1773260200", "retrieved_timestamp": "1773260200", @@ -252,6 +205,53 @@ } } }, + { + "evaluation_id": "ace/google_gemini-3-flash/1773260200", + "retrieved_timestamp": "1773260200", + "source_metadata": { + "source_name": "Mercor ACE Leaderboard", + "source_type": "evaluation_run", + "source_organization_name": "Mercor", + "source_organization_url": "https://www.mercor.com", + "evaluator_relationship": "first_party" + }, + "eval_library": { + "name": "archipelago", + "version": "1.0.0" + }, + "benchmark": "ace", + "evaluation_results": [ + { + "evaluation_name": "Gaming Score", + "source_data": { + "dataset_name": "ace", + "source_type": "hf_dataset", + "hf_repo": "Mercor/ACE" + }, + "metric_config": { + "evaluation_description": "Gaming domain score.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0, + "max_score": 1 + }, + "score_details": { + "score": 0.415 + }, + "generation_config": { + "additional_details": { + "run_setting": "High" + } + } + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": { + "run_setting": "High" + } + } + }, { "evaluation_id": "apex-v1/google_gemini-3-flash/1773260200", "retrieved_timestamp": "1773260200", diff --git a/data/google_Gemini_3_Pro.json b/data/models/google_Gemini_3_Pro.json similarity index 100% rename from data/google_Gemini_3_Pro.json rename to data/models/google_Gemini_3_Pro.json diff --git a/data/google_Palmyra-X-43B.json b/data/models/google_Palmyra-X-43B.json similarity index 100% rename from data/google_Palmyra-X-43B.json rename to data/models/google_Palmyra-X-43B.json diff --git a/data/google_T5-11B.json b/data/models/google_T5-11B.json similarity index 100% rename from data/google_T5-11B.json rename to data/models/google_T5-11B.json diff --git a/data/google_UL2-20B.json b/data/models/google_UL2-20B.json similarity index 100% rename from data/google_UL2-20B.json rename to data/models/google_UL2-20B.json diff --git a/data/google_codegemma-1.1-2b.json b/data/models/google_codegemma-1.1-2b.json similarity index 100% rename from data/google_codegemma-1.1-2b.json rename to data/models/google_codegemma-1.1-2b.json diff --git a/data/google_flame-1.0-24B-july-2024.json b/data/models/google_flame-1.0-24B-july-2024.json similarity index 100% rename from data/google_flame-1.0-24B-july-2024.json rename to data/models/google_flame-1.0-24B-july-2024.json diff --git a/data/google_flan-t5-base.json b/data/models/google_flan-t5-base.json similarity index 100% rename from data/google_flan-t5-base.json rename to data/models/google_flan-t5-base.json diff --git a/data/google_flan-t5-large.json b/data/models/google_flan-t5-large.json similarity index 100% rename from data/google_flan-t5-large.json rename to data/models/google_flan-t5-large.json diff --git a/data/google_flan-t5-small.json b/data/models/google_flan-t5-small.json similarity index 100% rename from data/google_flan-t5-small.json rename to data/models/google_flan-t5-small.json diff --git a/data/google_flan-t5-xl.json b/data/models/google_flan-t5-xl.json similarity index 100% rename from data/google_flan-t5-xl.json rename to data/models/google_flan-t5-xl.json diff --git a/data/google_flan-t5-xxl.json b/data/models/google_flan-t5-xxl.json similarity index 100% rename from data/google_flan-t5-xxl.json rename to data/models/google_flan-t5-xxl.json diff --git a/data/google_flan-ul2.json b/data/models/google_flan-ul2.json similarity index 100% rename from data/google_flan-ul2.json rename to data/models/google_flan-ul2.json diff --git a/data/google_gemini-1.0-pro-001.json b/data/models/google_gemini-1.0-pro-001.json similarity index 100% rename from data/google_gemini-1.0-pro-001.json rename to data/models/google_gemini-1.0-pro-001.json diff --git a/data/google_gemini-1.0-pro-002.json b/data/models/google_gemini-1.0-pro-002.json similarity index 100% rename from data/google_gemini-1.0-pro-002.json rename to data/models/google_gemini-1.0-pro-002.json diff --git a/data/google_gemini-1.5-flash-001.json b/data/models/google_gemini-1.5-flash-001.json similarity index 100% rename from data/google_gemini-1.5-flash-001.json rename to data/models/google_gemini-1.5-flash-001.json diff --git a/data/google_gemini-1.5-flash-002.json b/data/models/google_gemini-1.5-flash-002.json similarity index 100% rename from data/google_gemini-1.5-flash-002.json rename to data/models/google_gemini-1.5-flash-002.json diff --git a/data/google_gemini-1.5-flash-8b.json b/data/models/google_gemini-1.5-flash-8b.json similarity index 100% rename from data/google_gemini-1.5-flash-8b.json rename to data/models/google_gemini-1.5-flash-8b.json diff --git a/data/google_gemini-1.5-flash-preview-0514.json b/data/models/google_gemini-1.5-flash-preview-0514.json similarity index 100% rename from data/google_gemini-1.5-flash-preview-0514.json rename to data/models/google_gemini-1.5-flash-preview-0514.json diff --git a/data/google_gemini-1.5-pro-001.json b/data/models/google_gemini-1.5-pro-001.json similarity index 100% rename from data/google_gemini-1.5-pro-001.json rename to data/models/google_gemini-1.5-pro-001.json diff --git a/data/google_gemini-1.5-pro-002.json b/data/models/google_gemini-1.5-pro-002.json similarity index 100% rename from data/google_gemini-1.5-pro-002.json rename to data/models/google_gemini-1.5-pro-002.json diff --git a/data/google_gemini-1.5-pro-0514.json b/data/models/google_gemini-1.5-pro-0514.json similarity index 100% rename from data/google_gemini-1.5-pro-0514.json rename to data/models/google_gemini-1.5-pro-0514.json diff --git a/data/google_gemini-1.5-pro-0924.json b/data/models/google_gemini-1.5-pro-0924.json similarity index 100% rename from data/google_gemini-1.5-pro-0924.json rename to data/models/google_gemini-1.5-pro-0924.json diff --git a/data/google_gemini-1.5-pro-preview-0409.json b/data/models/google_gemini-1.5-pro-preview-0409.json similarity index 100% rename from data/google_gemini-1.5-pro-preview-0409.json rename to data/models/google_gemini-1.5-pro-preview-0409.json diff --git a/data/google_gemini-2.0-flash-001.json b/data/models/google_gemini-2.0-flash-001.json similarity index 100% rename from data/google_gemini-2.0-flash-001.json rename to data/models/google_gemini-2.0-flash-001.json diff --git a/data/google_gemini-2.0-flash-exp.json b/data/models/google_gemini-2.0-flash-exp.json similarity index 100% rename from data/google_gemini-2.0-flash-exp.json rename to data/models/google_gemini-2.0-flash-exp.json diff --git a/data/google_gemini-2.0-flash-lite-preview-02-05.json b/data/models/google_gemini-2.0-flash-lite-preview-02-05.json similarity index 100% rename from data/google_gemini-2.0-flash-lite-preview-02-05.json rename to data/models/google_gemini-2.0-flash-lite-preview-02-05.json diff --git a/data/google_gemini-2.5-flash-lite.json b/data/models/google_gemini-2.5-flash-lite.json similarity index 100% rename from data/google_gemini-2.5-flash-lite.json rename to data/models/google_gemini-2.5-flash-lite.json diff --git a/data/google_gemini-2.5-flash-preview-04-17.json b/data/models/google_gemini-2.5-flash-preview-04-17.json similarity index 100% rename from data/google_gemini-2.5-flash-preview-04-17.json rename to data/models/google_gemini-2.5-flash-preview-04-17.json diff --git a/data/google_gemini-2.5-flash-preview-05-20.json b/data/models/google_gemini-2.5-flash-preview-05-20.json similarity index 100% rename from data/google_gemini-2.5-flash-preview-05-20.json rename to data/models/google_gemini-2.5-flash-preview-05-20.json index d6b83fbece32c25f091ab739d9192e35eaf848f8..835bb2d48341cf61e066ce78dc0319eb62c74149 100644 --- a/data/google_gemini-2.5-flash-preview-05-20.json +++ b/data/models/google_gemini-2.5-flash-preview-05-20.json @@ -10,8 +10,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/google_gemini-2.5-flash-preview-05-20/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/google_gemini-2.5-flash-preview-05-20/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -525,8 +525,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/google_gemini-2.5-flash-preview-05-20/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/google_gemini-2.5-flash-preview-05-20/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/google_gemini-2.5-flash.json b/data/models/google_gemini-2.5-flash.json similarity index 99% rename from data/google_gemini-2.5-flash.json rename to data/models/google_gemini-2.5-flash.json index 93bb07cbaa8dc180b1e4c32c0952afa235726a26..100ef3ccb6413e0e23161c5f90be16667820a8bb 100644 --- a/data/google_gemini-2.5-flash.json +++ b/data/models/google_gemini-2.5-flash.json @@ -4,14 +4,14 @@ "id": "google/gemini-2.5-flash", "developer": "Google", "additional_details": { - "agent_name": "Mini-SWE-Agent", - "agent_organization": "Princeton" + "agent_name": "Gemini CLI", + "agent_organization": "Google" } }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/google_gemini-2.5-flash/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/google_gemini-2.5-flash/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -525,8 +525,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/google_gemini-2.5-flash/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/google_gemini-2.5-flash/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -1269,7 +1269,7 @@ "generation_config": null }, { - "evaluation_id": "terminal-bench-2.0/openhands__gemini-2.5-flash/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gemini-2.5-flash/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -1293,7 +1293,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -1302,17 +1302,17 @@ "max_score": 100.0 }, "score_details": { - "score": 16.4, + "score": 17.1, "uncertainty": { "standard_error": { - "value": 2.4 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Gemini 2.5 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1329,7 +1329,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Gemini 2.5 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1343,7 +1343,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-2.5-flash/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__gemini-2.5-flash/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -1367,7 +1367,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -1376,17 +1376,17 @@ "max_score": 100.0 }, "score_details": { - "score": 15.4, + "score": 16.4, "uncertainty": { "standard_error": { - "value": 2.3 + "value": 2.4 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Gemini 2.5 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1403,7 +1403,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Gemini 2.5 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1491,7 +1491,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gemini-2.5-flash/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-2.5-flash/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -1515,7 +1515,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -1524,17 +1524,17 @@ "max_score": 100.0 }, "score_details": { - "score": 17.1, + "score": 15.4, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.3 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1551,7 +1551,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/google_gemini-2.5-pro-preview-03-25.json b/data/models/google_gemini-2.5-pro-preview-03-25.json similarity index 100% rename from data/google_gemini-2.5-pro-preview-03-25.json rename to data/models/google_gemini-2.5-pro-preview-03-25.json diff --git a/data/google_gemini-2.5-pro-preview-05-06.json b/data/models/google_gemini-2.5-pro-preview-05-06.json similarity index 100% rename from data/google_gemini-2.5-pro-preview-05-06.json rename to data/models/google_gemini-2.5-pro-preview-05-06.json diff --git a/data/google_gemini-2.5-pro.json b/data/models/google_gemini-2.5-pro.json similarity index 100% rename from data/google_gemini-2.5-pro.json rename to data/models/google_gemini-2.5-pro.json index 8181fe1d84cd3d46a47a2b40f6202acb441c07ba..c78a20c000adf11c0318e63011a9dba7982505e7 100644 --- a/data/google_gemini-2.5-pro.json +++ b/data/models/google_gemini-2.5-pro.json @@ -1269,7 +1269,7 @@ "generation_config": null }, { - "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-2.5-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gemini-2.5-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -1293,7 +1293,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -1302,17 +1302,17 @@ "max_score": 100.0 }, "score_details": { - "score": 19.6, + "score": 26.1, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1329,7 +1329,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1343,7 +1343,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gemini-2.5-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-2.5-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -1367,7 +1367,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -1376,17 +1376,17 @@ "max_score": 100.0 }, "score_details": { - "score": 26.1, + "score": 19.6, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1403,7 +1403,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/google_gemini-3-flash.json b/data/models/google_gemini-3-flash.json similarity index 99% rename from data/google_gemini-3-flash.json rename to data/models/google_gemini-3-flash.json index 561efe601f065b60659270d4fa2b9e55060a245f..a020edf7d2de6c31874320ca6b56508a5cced539 100644 --- a/data/google_gemini-3-flash.json +++ b/data/models/google_gemini-3-flash.json @@ -4,13 +4,13 @@ "id": "google/gemini-3-flash", "developer": "Google", "additional_details": { - "agent_name": "Junie CLI", - "agent_organization": "JetBrains" + "agent_name": "Gemini CLI", + "agent_organization": "Google" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-3-flash/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/junie-cli__gemini-3-flash/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-01-07", + "evaluation_timestamp": "2025-12-23", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 51.7, + "score": 64.3, "uncertainty": { "standard_error": { - "value": 3.1 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Gemini 3 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Gemini 3 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-3-flash/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-3-flash/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-03-06", + "evaluation_timestamp": "2026-01-07", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 47.4, + "score": 51.7, "uncertainty": { "standard_error": { - "value": 3.0 + "value": 3.1 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 3 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 3 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/junie-cli__gemini-3-flash/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-3-flash/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-23", + "evaluation_timestamp": "2026-03-06", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +265,17 @@ "max_score": 100.0 }, "score_details": { - "score": 64.3, + "score": 47.4, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 3.0 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Gemini 3 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 3 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Gemini 3 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 3 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/google_gemini-3-pro-preview.json b/data/models/google_gemini-3-pro-preview.json similarity index 99% rename from data/google_gemini-3-pro-preview.json rename to data/models/google_gemini-3-pro-preview.json index b7beb12d40e2aea8388a1fb87d12938162db38ce..72d051735ad53e09d24aa98fe02cfc581a1e9522 100644 --- a/data/google_gemini-3-pro-preview.json +++ b/data/models/google_gemini-3-pro-preview.json @@ -4,13 +4,13 @@ "id": "google/gemini-3-pro-preview", "developer": "Google", "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } }, "evaluations": [ { - "evaluation_id": "appworld/test_normal/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "appworld/test_normal/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -42,23 +42,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.505, + "score": 0.36, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "1.88", - "total_run_cost": "188.19", - "average_steps": "21.76", - "percent_finished": "0.99" + "average_agent_cost": "3.11", + "total_run_cost": "310.55", + "average_steps": "38.01", + "percent_finished": "0.86" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -70,15 +70,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "appworld/test_normal/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "appworld/test_normal/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -110,23 +110,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.55, + "score": 0.13, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "1.3", - "total_run_cost": "130.49", - "average_steps": "22.59", - "percent_finished": "1.0" + "average_agent_cost": "2.54", + "total_run_cost": "254.25", + "average_steps": "49.13", + "percent_finished": "0.71" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -138,15 +138,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "appworld/test_normal/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "appworld/test_normal/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -178,23 +178,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.36, + "score": 0.55, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "3.11", - "total_run_cost": "310.55", - "average_steps": "38.01", - "percent_finished": "0.86" + "average_agent_cost": "1.3", + "total_run_cost": "130.49", + "average_steps": "22.59", + "percent_finished": "1.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -206,15 +206,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "appworld/test_normal/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "appworld/test_normal/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -246,23 +246,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.13, + "score": 0.582, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "2.54", - "total_run_cost": "254.25", - "average_steps": "49.13", - "percent_finished": "0.71" + "average_agent_cost": "8.7", + "total_run_cost": "869.55", + "average_steps": "33.49", + "percent_finished": "0.98" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -274,15 +274,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "appworld/test_normal/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "appworld/test_normal/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -314,23 +314,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.582, + "score": 0.505, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "8.7", - "total_run_cost": "869.55", - "average_steps": "33.49", - "percent_finished": "0.98" + "average_agent_cost": "1.88", + "total_run_cost": "188.19", + "average_steps": "21.76", + "percent_finished": "0.99" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -342,15 +342,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "browsecompplus/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "browsecompplus/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -382,23 +382,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.48, + "score": 0.57, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.44", - "total_run_cost": "44.18", - "average_steps": "7.85", - "percent_finished": "0.99" + "average_agent_cost": "2.39", + "total_run_cost": "239.0", + "average_steps": "29.63", + "percent_finished": "0.69" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -410,15 +410,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "browsecompplus/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "browsecompplus/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -450,23 +450,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.57, + "score": 0.51, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "2.39", - "total_run_cost": "239.0", - "average_steps": "29.63", - "percent_finished": "0.69" + "average_agent_cost": "2.85", + "total_run_cost": "284.68", + "average_steps": "22.88", + "percent_finished": "0.7" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -478,15 +478,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "browsecompplus/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "browsecompplus/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -518,23 +518,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.51, + "score": 0.3333, "uncertainty": { - "num_samples": 100 + "num_samples": 99 }, "details": { - "average_agent_cost": "2.85", - "total_run_cost": "284.68", - "average_steps": "22.88", - "percent_finished": "0.7" + "average_agent_cost": "0.64", + "total_run_cost": "63.79", + "average_steps": "8.45", + "percent_finished": "0.6061" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -546,15 +546,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "browsecompplus/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "browsecompplus/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -586,23 +586,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3333, + "score": 0.48, "uncertainty": { - "num_samples": 99 + "num_samples": 100 }, "details": { - "average_agent_cost": "0.64", - "total_run_cost": "63.79", - "average_steps": "8.45", - "percent_finished": "0.6061" + "average_agent_cost": "0.44", + "total_run_cost": "44.18", + "average_steps": "7.85", + "percent_finished": "0.99" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -614,8 +614,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -690,8 +690,8 @@ } }, { - "evaluation_id": "global-mmlu-lite/google_gemini-3-pro-preview/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/google_gemini-3-pro-preview/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -1205,8 +1205,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/google_gemini-3-pro-preview/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/google_gemini-3-pro-preview/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -1720,7 +1720,7 @@ "generation_config": null }, { - "evaluation_id": "swe-bench/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "swe-bench/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1752,14 +1752,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.71, + "score": 0.7576, "uncertainty": { - "num_samples": 100 + "num_samples": 99 }, "details": { - "average_agent_cost": "0.7", - "total_run_cost": "69.56", - "average_steps": "32.55", + "average_agent_cost": "2.21", + "total_run_cost": "218.76", + "average_steps": "38.1", "percent_finished": "1.0" } }, @@ -1767,8 +1767,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1780,15 +1780,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "swe-bench/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "swe-bench/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1820,14 +1820,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7576, + "score": 0.71, "uncertainty": { - "num_samples": 99 + "num_samples": 100 }, "details": { - "average_agent_cost": "2.21", - "total_run_cost": "218.76", - "average_steps": "38.1", + "average_agent_cost": "0.7", + "total_run_cost": "69.56", + "average_steps": "32.55", "percent_finished": "1.0" } }, @@ -1835,8 +1835,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -1848,15 +1848,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "swe-bench/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "swe-bench/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1888,14 +1888,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.71, + "score": 0.67, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.7", - "total_run_cost": "69.56", - "average_steps": "32.55", + "average_agent_cost": "3.68", + "total_run_cost": "367.97", + "average_steps": "43.72", "percent_finished": "1.0" } }, @@ -1903,8 +1903,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -1916,15 +1916,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "swe-bench/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "swe-bench/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1956,14 +1956,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7234, + "score": 0.71, "uncertainty": { - "num_samples": 94 + "num_samples": 100 }, "details": { - "average_agent_cost": "1.58", - "total_run_cost": "148.44", - "average_steps": "32.36", + "average_agent_cost": "0.7", + "total_run_cost": "69.56", + "average_steps": "32.55", "percent_finished": "1.0" } }, @@ -1971,8 +1971,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -1984,15 +1984,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "swe-bench/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "swe-bench/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2024,14 +2024,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.67, + "score": 0.7234, "uncertainty": { - "num_samples": 100 + "num_samples": 94 }, "details": { - "average_agent_cost": "3.68", - "total_run_cost": "367.97", - "average_steps": "43.72", + "average_agent_cost": "1.58", + "total_run_cost": "148.44", + "average_steps": "32.36", "percent_finished": "1.0" } }, @@ -2039,8 +2039,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -2052,15 +2052,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "tau-bench-2/airline/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2092,14 +2092,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.62, + "score": 0.7, "uncertainty": { "num_samples": 50 }, "details": { - "average_agent_cost": "0.21", - "total_run_cost": "11.18", - "average_steps": "10.9", + "average_agent_cost": "0.34", + "total_run_cost": "17.45", + "average_steps": "12.62", "percent_finished": "1.0" } }, @@ -2107,8 +2107,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -2120,15 +2120,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "tau-bench-2/airline/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2160,14 +2160,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7, + "score": 0.62, "uncertainty": { "num_samples": 50 }, "details": { - "average_agent_cost": "0.16", - "total_run_cost": "8.48", - "average_steps": "10.14", + "average_agent_cost": "0.21", + "total_run_cost": "11.18", + "average_steps": "10.9", "percent_finished": "1.0" } }, @@ -2175,8 +2175,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -2188,8 +2188,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -2264,7 +2264,7 @@ } }, { - "evaluation_id": "tau-bench-2/airline/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2301,9 +2301,9 @@ "num_samples": 50 }, "details": { - "average_agent_cost": "0.34", - "total_run_cost": "17.45", - "average_steps": "12.62", + "average_agent_cost": "0.16", + "total_run_cost": "8.48", + "average_steps": "10.14", "percent_finished": "1.0" } }, @@ -2311,8 +2311,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -2324,15 +2324,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "tau-bench-2/retail/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2364,14 +2364,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7805, + "score": 0.7576, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.19", - "total_run_cost": "19.38", - "average_steps": "11.18", + "average_agent_cost": "0.21", + "total_run_cost": "21.43", + "average_steps": "11.3", "percent_finished": "1.0" } }, @@ -2379,8 +2379,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -2392,8 +2392,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -2468,7 +2468,7 @@ } }, { - "evaluation_id": "tau-bench-2/retail/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2500,14 +2500,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7576, + "score": 0.82, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.21", - "total_run_cost": "21.43", - "average_steps": "11.3", + "average_agent_cost": "0.16", + "total_run_cost": "16.64", + "average_steps": "11.25", "percent_finished": "1.0" } }, @@ -2515,8 +2515,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -2528,15 +2528,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "tau-bench-2/retail/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2568,14 +2568,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.73, + "score": 0.82, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.27", - "total_run_cost": "27.48", - "average_steps": "10.62", + "average_agent_cost": "0.16", + "total_run_cost": "16.64", + "average_steps": "11.25", "percent_finished": "1.0" } }, @@ -2583,8 +2583,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -2596,15 +2596,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "tau-bench-2/retail/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2636,14 +2636,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.82, + "score": 0.73, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.16", - "total_run_cost": "16.64", - "average_steps": "11.25", + "average_agent_cost": "0.27", + "total_run_cost": "27.48", + "average_steps": "10.62", "percent_finished": "1.0" } }, @@ -2651,8 +2651,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -2664,15 +2664,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "tau-bench-2/retail/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2704,14 +2704,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.82, + "score": 0.7805, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.16", - "total_run_cost": "16.64", - "average_steps": "11.25", + "average_agent_cost": "0.19", + "total_run_cost": "19.38", + "average_steps": "11.18", "percent_finished": "1.0" } }, @@ -2719,8 +2719,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -2732,8 +2732,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -2808,7 +2808,7 @@ } }, { - "evaluation_id": "tau-bench-2/telecom/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2840,23 +2840,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.88, + "score": 0.8876, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.35", - "total_run_cost": "40.25", - "average_steps": "12.71", - "percent_finished": "1.0" + "average_agent_cost": "0.54", + "total_run_cost": "58.29", + "average_steps": "10.82", + "percent_finished": "0.89" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -2868,15 +2868,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2908,14 +2908,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.73, + "score": 0.88, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.3", - "total_run_cost": "36.75", - "average_steps": "14.84", + "average_agent_cost": "0.35", + "total_run_cost": "40.25", + "average_steps": "12.71", "percent_finished": "1.0" } }, @@ -2923,8 +2923,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -2936,8 +2936,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -3012,7 +3012,7 @@ } }, { - "evaluation_id": "tau-bench-2/telecom/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -3044,23 +3044,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.8876, + "score": 0.73, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.54", - "total_run_cost": "58.29", - "average_steps": "10.82", - "percent_finished": "0.89" + "average_agent_cost": "0.3", + "total_run_cost": "36.75", + "average_steps": "14.84", + "percent_finished": "1.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -3072,8 +3072,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } diff --git a/data/google_gemini-3-pro.json b/data/models/google_gemini-3-pro.json similarity index 99% rename from data/google_gemini-3-pro.json rename to data/models/google_gemini-3-pro.json index 85664527b8591f900bab967606174eba3c681c59..eae93e48214f42eb08f89532b3b13d405fb3768b 100644 --- a/data/google_gemini-3-pro.json +++ b/data/models/google_gemini-3-pro.json @@ -4,13 +4,13 @@ "id": "google/gemini-3-pro", "developer": "Google", "additional_details": { - "agent_name": "Terminus 2", - "agent_organization": "Terminal Bench" + "agent_name": "CodeBrain-1", + "agent_organization": "Feeling AI" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/ante__gemini-3-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/sageagent__gemini-3-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-01-06", + "evaluation_timestamp": "2026-02-23", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,7 +43,7 @@ "max_score": 100.0 }, "score_details": { - "score": 69.4, + "score": 65.2, "uncertainty": { "standard_error": { "value": 2.1 @@ -53,7 +53,7 @@ }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Ante\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"SageAgent\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Ante\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"SageAgent\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/droid__gemini-3-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/ante__gemini-3-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-24", + "evaluation_timestamp": "2026-01-06", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 61.1, + "score": 69.4, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.1 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Ante\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Ante\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/letta-code__gemini-3-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/droid__gemini-3-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-17", + "evaluation_timestamp": "2025-12-24", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 56.0, + "score": 61.1, "uncertainty": { "standard_error": { - "value": 3.0 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/codebrain-1__gemini-3-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/ii-agent__gemini-3-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-05", + "evaluation_timestamp": "2025-12-23", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +265,17 @@ "max_score": 100.0 }, "score_details": { - "score": 62.2, + "score": 61.8, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"II-Agent\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"II-Agent\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -306,7 +306,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/ii-agent__gemini-3-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/letta-code__gemini-3-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -330,7 +330,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-23", + "evaluation_timestamp": "2025-12-17", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -339,17 +339,17 @@ "max_score": 100.0 }, "score_details": { - "score": 61.8, + "score": 56.0, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 3.0 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"II-Agent\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -366,7 +366,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"II-Agent\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -380,7 +380,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/sageagent__gemini-3-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-3-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -404,7 +404,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-23", + "evaluation_timestamp": "2025-11-21", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -413,17 +413,17 @@ "max_score": 100.0 }, "score_details": { - "score": 65.2, + "score": 56.9, "uncertainty": { "standard_error": { - "value": 2.1 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"SageAgent\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -440,7 +440,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"SageAgent\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -454,7 +454,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-3-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/codebrain-1__gemini-3-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -478,7 +478,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-21", + "evaluation_timestamp": "2026-02-05", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -487,17 +487,17 @@ "max_score": 100.0 }, "score_details": { - "score": 56.9, + "score": 62.2, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.6 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -514,7 +514,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/google_gemini-3.1-pro.json b/data/models/google_gemini-3.1-pro.json similarity index 98% rename from data/google_gemini-3.1-pro.json rename to data/models/google_gemini-3.1-pro.json index 945834533f47e11456a671dc36fdab03175c7bd2..0e5f8fe75691718cffdd5684a2f8910d846de44c 100644 --- a/data/google_gemini-3.1-pro.json +++ b/data/models/google_gemini-3.1-pro.json @@ -4,13 +4,13 @@ "id": "google/gemini-3.1-pro", "developer": "Google", "additional_details": { - "agent_name": "Terminus-KIRA", - "agent_organization": "KRAFTON AI" + "agent_name": "Forge Code", + "agent_organization": "Forge Code" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/forge-code__gemini-3.1-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-kira__gemini-3.1-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-03-02", + "evaluation_timestamp": "2026-02-23", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 78.4, + "score": 74.8, "uncertainty": { "standard_error": { - "value": 1.8 + "value": 2.6 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Forge Code\" -m \"Gemini 3.1 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Gemini 3.1 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Forge Code\" -m \"Gemini 3.1 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Gemini 3.1 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-kira__gemini-3.1-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/forge-code__gemini-3.1-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-23", + "evaluation_timestamp": "2026-03-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 74.8, + "score": 78.4, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 1.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Gemini 3.1 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Forge Code\" -m \"Gemini 3.1 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Gemini 3.1 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Forge Code\" -m \"Gemini 3.1 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/google_gemma-1.1-2b-it.json b/data/models/google_gemma-1.1-2b-it.json similarity index 100% rename from data/google_gemma-1.1-2b-it.json rename to data/models/google_gemma-1.1-2b-it.json diff --git a/data/google_gemma-1.1-7b-it.json b/data/models/google_gemma-1.1-7b-it.json similarity index 100% rename from data/google_gemma-1.1-7b-it.json rename to data/models/google_gemma-1.1-7b-it.json diff --git a/data/google_gemma-2-27b-it.json b/data/models/google_gemma-2-27b-it.json similarity index 100% rename from data/google_gemma-2-27b-it.json rename to data/models/google_gemma-2-27b-it.json diff --git a/data/google_gemma-2-27b.json b/data/models/google_gemma-2-27b.json similarity index 100% rename from data/google_gemma-2-27b.json rename to data/models/google_gemma-2-27b.json diff --git a/data/google_gemma-2-2b-it.json b/data/models/google_gemma-2-2b-it.json similarity index 100% rename from data/google_gemma-2-2b-it.json rename to data/models/google_gemma-2-2b-it.json diff --git a/data/google_gemma-2-2b-jpn-it.json b/data/models/google_gemma-2-2b-jpn-it.json similarity index 99% rename from data/google_gemma-2-2b-jpn-it.json rename to data/models/google_gemma-2-2b-jpn-it.json index 4e7bbf20f77d6c000afe54e95fcab67db2bebd96..800d1732f590741446eb02b11239ee165c15b48f 100644 --- a/data/google_gemma-2-2b-jpn-it.json +++ b/data/models/google_gemma-2-2b-jpn-it.json @@ -5,7 +5,7 @@ "developer": "google", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "Gemma2ForCausalLM", "params_billions": "2.614" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5288 + "score": 0.5078 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4178 + "score": 0.4226 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0476 + "score": 0.0347 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2752 + "score": 0.2852 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3728 + "score": 0.3964 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2467 + "score": 0.2578 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5078 + "score": 0.5288 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4226 + "score": 0.4178 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0347 + "score": 0.0476 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2852 + "score": 0.2752 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3964 + "score": 0.3728 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2578 + "score": 0.2467 } } ], diff --git a/data/google_gemma-2-2b.json b/data/models/google_gemma-2-2b.json similarity index 99% rename from data/google_gemma-2-2b.json rename to data/models/google_gemma-2-2b.json index a26c6c01f74367d0b27fa559804c0864d4e70267..7ff61a3ccdeab708a1ad73cfdd534c8e9b4c6dc6 100644 --- a/data/google_gemma-2-2b.json +++ b/data/models/google_gemma-2-2b.json @@ -5,7 +5,7 @@ "developer": "google", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "InternLM2ForCausalLM", "params_billions": "2.614" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1993 + "score": 0.2018 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3656 + "score": 0.3709 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0287 + "score": 0.0302 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4232 + "score": 0.4219 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.218 + "score": 0.2217 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2018 + "score": 0.1993 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3709 + "score": 0.3656 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0302 + "score": 0.0287 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4219 + "score": 0.4232 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2217 + "score": 0.218 } } ], diff --git a/data/google_gemma-2-9b-it.json b/data/models/google_gemma-2-9b-it.json similarity index 100% rename from data/google_gemma-2-9b-it.json rename to data/models/google_gemma-2-9b-it.json diff --git a/data/google_gemma-2-9b.json b/data/models/google_gemma-2-9b.json similarity index 100% rename from data/google_gemma-2-9b.json rename to data/models/google_gemma-2-9b.json diff --git a/data/google_gemma-2b-it.json b/data/models/google_gemma-2b-it.json similarity index 100% rename from data/google_gemma-2b-it.json rename to data/models/google_gemma-2b-it.json diff --git a/data/google_gemma-2b.json b/data/models/google_gemma-2b.json similarity index 100% rename from data/google_gemma-2b.json rename to data/models/google_gemma-2b.json diff --git a/data/google_gemma-3-27b-it.json b/data/models/google_gemma-3-27b-it.json similarity index 100% rename from data/google_gemma-3-27b-it.json rename to data/models/google_gemma-3-27b-it.json index e9414befbe7691bfbd54ff0201280b82b0db7931..cfb14226879ac7a359e975d38df1cd2917c02743 100644 --- a/data/google_gemma-3-27b-it.json +++ b/data/models/google_gemma-3-27b-it.json @@ -10,8 +10,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/google_gemma-3-27b-it/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/google_gemma-3-27b-it/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -525,8 +525,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/google_gemma-3-27b-it/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/google_gemma-3-27b-it/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/google_gemma-3-4b-it.json b/data/models/google_gemma-3-4b-it.json similarity index 100% rename from data/google_gemma-3-4b-it.json rename to data/models/google_gemma-3-4b-it.json diff --git a/data/google_gemma-7b-it.json b/data/models/google_gemma-7b-it.json similarity index 100% rename from data/google_gemma-7b-it.json rename to data/models/google_gemma-7b-it.json diff --git a/data/google_gemma-7b.json b/data/models/google_gemma-7b.json similarity index 100% rename from data/google_gemma-7b.json rename to data/models/google_gemma-7b.json diff --git a/data/google_mt5-base.json b/data/models/google_mt5-base.json similarity index 100% rename from data/google_mt5-base.json rename to data/models/google_mt5-base.json diff --git a/data/google_mt5-small.json b/data/models/google_mt5-small.json similarity index 100% rename from data/google_mt5-small.json rename to data/models/google_mt5-small.json diff --git a/data/google_mt5-xl.json b/data/models/google_mt5-xl.json similarity index 100% rename from data/google_mt5-xl.json rename to data/models/google_mt5-xl.json diff --git a/data/google_mt5-xxl.json b/data/models/google_mt5-xxl.json similarity index 100% rename from data/google_mt5-xxl.json rename to data/models/google_mt5-xxl.json diff --git a/data/google_recurrentgemma-2b-it.json b/data/models/google_recurrentgemma-2b-it.json similarity index 100% rename from data/google_recurrentgemma-2b-it.json rename to data/models/google_recurrentgemma-2b-it.json diff --git a/data/google_recurrentgemma-2b.json b/data/models/google_recurrentgemma-2b.json similarity index 100% rename from data/google_recurrentgemma-2b.json rename to data/models/google_recurrentgemma-2b.json diff --git a/data/google_recurrentgemma-9b-it.json b/data/models/google_recurrentgemma-9b-it.json similarity index 100% rename from data/google_recurrentgemma-9b-it.json rename to data/models/google_recurrentgemma-9b-it.json diff --git a/data/google_recurrentgemma-9b.json b/data/models/google_recurrentgemma-9b.json similarity index 100% rename from data/google_recurrentgemma-9b.json rename to data/models/google_recurrentgemma-9b.json diff --git a/data/google_switch-base-8.json b/data/models/google_switch-base-8.json similarity index 100% rename from data/google_switch-base-8.json rename to data/models/google_switch-base-8.json diff --git a/data/google_text-bison_001.json b/data/models/google_text-bison_001.json similarity index 100% rename from data/google_text-bison_001.json rename to data/models/google_text-bison_001.json diff --git a/data/google_text-unicorn_001.json b/data/models/google_text-unicorn_001.json similarity index 100% rename from data/google_text-unicorn_001.json rename to data/models/google_text-unicorn_001.json diff --git a/data/google_umt5-base.json b/data/models/google_umt5-base.json similarity index 100% rename from data/google_umt5-base.json rename to data/models/google_umt5-base.json diff --git a/data/goulue5_merging_LLM.json b/data/models/goulue5_merging_LLM.json similarity index 100% rename from data/goulue5_merging_LLM.json rename to data/models/goulue5_merging_LLM.json diff --git a/data/gradientai_Llama-3-8B-Instruct-Gradient-1048k.json b/data/models/gradientai_Llama-3-8B-Instruct-Gradient-1048k.json similarity index 100% rename from data/gradientai_Llama-3-8B-Instruct-Gradient-1048k.json rename to data/models/gradientai_Llama-3-8B-Instruct-Gradient-1048k.json diff --git a/data/grimjim_DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B.json b/data/models/grimjim_DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B.json similarity index 100% rename from data/grimjim_DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B.json rename to data/models/grimjim_DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B.json diff --git a/data/grimjim_Gigantes-v1-gemma2-9b-it.json b/data/models/grimjim_Gigantes-v1-gemma2-9b-it.json similarity index 100% rename from data/grimjim_Gigantes-v1-gemma2-9b-it.json rename to data/models/grimjim_Gigantes-v1-gemma2-9b-it.json diff --git a/data/grimjim_Gigantes-v2-gemma2-9b-it.json b/data/models/grimjim_Gigantes-v2-gemma2-9b-it.json similarity index 100% rename from data/grimjim_Gigantes-v2-gemma2-9b-it.json rename to data/models/grimjim_Gigantes-v2-gemma2-9b-it.json diff --git a/data/grimjim_Gigantes-v3-gemma2-9b-it.json b/data/models/grimjim_Gigantes-v3-gemma2-9b-it.json similarity index 100% rename from data/grimjim_Gigantes-v3-gemma2-9b-it.json rename to data/models/grimjim_Gigantes-v3-gemma2-9b-it.json diff --git a/data/grimjim_HuatuoSkywork-o1-Llama-3.1-8B.json b/data/models/grimjim_HuatuoSkywork-o1-Llama-3.1-8B.json similarity index 100% rename from data/grimjim_HuatuoSkywork-o1-Llama-3.1-8B.json rename to data/models/grimjim_HuatuoSkywork-o1-Llama-3.1-8B.json diff --git a/data/grimjim_Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge.json b/data/models/grimjim_Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge.json similarity index 100% rename from data/grimjim_Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge.json rename to data/models/grimjim_Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge.json diff --git a/data/grimjim_Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge.json b/data/models/grimjim_Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge.json similarity index 100% rename from data/grimjim_Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge.json rename to data/models/grimjim_Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge.json diff --git a/data/grimjim_Llama-3.1-8B-Instruct-abliterated_via_adapter.json b/data/models/grimjim_Llama-3.1-8B-Instruct-abliterated_via_adapter.json similarity index 100% rename from data/grimjim_Llama-3.1-8B-Instruct-abliterated_via_adapter.json rename to data/models/grimjim_Llama-3.1-8B-Instruct-abliterated_via_adapter.json diff --git a/data/grimjim_Llama-3.1-Bonsaikraft-8B-Instruct.json b/data/models/grimjim_Llama-3.1-Bonsaikraft-8B-Instruct.json similarity index 100% rename from data/grimjim_Llama-3.1-Bonsaikraft-8B-Instruct.json rename to data/models/grimjim_Llama-3.1-Bonsaikraft-8B-Instruct.json diff --git a/data/grimjim_Llama-Nephilim-Metamorphosis-v2-8B.json b/data/models/grimjim_Llama-Nephilim-Metamorphosis-v2-8B.json similarity index 100% rename from data/grimjim_Llama-Nephilim-Metamorphosis-v2-8B.json rename to data/models/grimjim_Llama-Nephilim-Metamorphosis-v2-8B.json diff --git a/data/grimjim_Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B.json b/data/models/grimjim_Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B.json similarity index 100% rename from data/grimjim_Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B.json rename to data/models/grimjim_Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B.json diff --git a/data/grimjim_Magnolia-v1-Gemma2-8k-9B.json b/data/models/grimjim_Magnolia-v1-Gemma2-8k-9B.json similarity index 100% rename from data/grimjim_Magnolia-v1-Gemma2-8k-9B.json rename to data/models/grimjim_Magnolia-v1-Gemma2-8k-9B.json diff --git a/data/grimjim_Magnolia-v2-12B.json b/data/models/grimjim_Magnolia-v2-12B.json similarity index 100% rename from data/grimjim_Magnolia-v2-12B.json rename to data/models/grimjim_Magnolia-v2-12B.json diff --git a/data/grimjim_Magnolia-v2-Gemma2-8k-9B.json b/data/models/grimjim_Magnolia-v2-Gemma2-8k-9B.json similarity index 100% rename from data/grimjim_Magnolia-v2-Gemma2-8k-9B.json rename to data/models/grimjim_Magnolia-v2-Gemma2-8k-9B.json diff --git a/data/grimjim_Magnolia-v3-12B.json b/data/models/grimjim_Magnolia-v3-12B.json similarity index 100% rename from data/grimjim_Magnolia-v3-12B.json rename to data/models/grimjim_Magnolia-v3-12B.json diff --git a/data/grimjim_Magnolia-v3-Gemma2-8k-9B.json b/data/models/grimjim_Magnolia-v3-Gemma2-8k-9B.json similarity index 100% rename from data/grimjim_Magnolia-v3-Gemma2-8k-9B.json rename to data/models/grimjim_Magnolia-v3-Gemma2-8k-9B.json diff --git a/data/grimjim_Magnolia-v4-12B.json b/data/models/grimjim_Magnolia-v4-12B.json similarity index 100% rename from data/grimjim_Magnolia-v4-12B.json rename to data/models/grimjim_Magnolia-v4-12B.json diff --git a/data/grimjim_Magnolia-v5a-12B.json b/data/models/grimjim_Magnolia-v5a-12B.json similarity index 100% rename from data/grimjim_Magnolia-v5a-12B.json rename to data/models/grimjim_Magnolia-v5a-12B.json diff --git a/data/grimjim_Magot-v1-Gemma2-8k-9B.json b/data/models/grimjim_Magot-v1-Gemma2-8k-9B.json similarity index 100% rename from data/grimjim_Magot-v1-Gemma2-8k-9B.json rename to data/models/grimjim_Magot-v1-Gemma2-8k-9B.json diff --git a/data/grimjim_Magot-v2-Gemma2-8k-9B.json b/data/models/grimjim_Magot-v2-Gemma2-8k-9B.json similarity index 100% rename from data/grimjim_Magot-v2-Gemma2-8k-9B.json rename to data/models/grimjim_Magot-v2-Gemma2-8k-9B.json diff --git a/data/grimjim_SauerHuatuoSkywork-o1-Llama-3.1-8B.json b/data/models/grimjim_SauerHuatuoSkywork-o1-Llama-3.1-8B.json similarity index 100% rename from data/grimjim_SauerHuatuoSkywork-o1-Llama-3.1-8B.json rename to data/models/grimjim_SauerHuatuoSkywork-o1-Llama-3.1-8B.json diff --git a/data/grimjim_llama-3-Nephilim-v1-8B.json b/data/models/grimjim_llama-3-Nephilim-v1-8B.json similarity index 100% rename from data/grimjim_llama-3-Nephilim-v1-8B.json rename to data/models/grimjim_llama-3-Nephilim-v1-8B.json diff --git a/data/grimjim_llama-3-Nephilim-v2-8B.json b/data/models/grimjim_llama-3-Nephilim-v2-8B.json similarity index 100% rename from data/grimjim_llama-3-Nephilim-v2-8B.json rename to data/models/grimjim_llama-3-Nephilim-v2-8B.json diff --git a/data/grimjim_llama-3-Nephilim-v2.1-8B.json b/data/models/grimjim_llama-3-Nephilim-v2.1-8B.json similarity index 100% rename from data/grimjim_llama-3-Nephilim-v2.1-8B.json rename to data/models/grimjim_llama-3-Nephilim-v2.1-8B.json diff --git a/data/grimjim_llama-3-Nephilim-v3-8B.json b/data/models/grimjim_llama-3-Nephilim-v3-8B.json similarity index 100% rename from data/grimjim_llama-3-Nephilim-v3-8B.json rename to data/models/grimjim_llama-3-Nephilim-v3-8B.json diff --git a/data/gupta-tanish_llama-7b-dpo-baseline.json b/data/models/gupta-tanish_llama-7b-dpo-baseline.json similarity index 100% rename from data/gupta-tanish_llama-7b-dpo-baseline.json rename to data/models/gupta-tanish_llama-7b-dpo-baseline.json diff --git a/data/gz987_qwen2.5-7b-cabs-v0.1.json b/data/models/gz987_qwen2.5-7b-cabs-v0.1.json similarity index 100% rename from data/gz987_qwen2.5-7b-cabs-v0.1.json rename to data/models/gz987_qwen2.5-7b-cabs-v0.1.json diff --git a/data/gz987_qwen2.5-7b-cabs-v0.2.json b/data/models/gz987_qwen2.5-7b-cabs-v0.2.json similarity index 100% rename from data/gz987_qwen2.5-7b-cabs-v0.2.json rename to data/models/gz987_qwen2.5-7b-cabs-v0.2.json diff --git a/data/gz987_qwen2.5-7b-cabs-v0.3.json b/data/models/gz987_qwen2.5-7b-cabs-v0.3.json similarity index 100% rename from data/gz987_qwen2.5-7b-cabs-v0.3.json rename to data/models/gz987_qwen2.5-7b-cabs-v0.3.json diff --git a/data/gz987_qwen2.5-7b-cabs-v0.4.json b/data/models/gz987_qwen2.5-7b-cabs-v0.4.json similarity index 100% rename from data/gz987_qwen2.5-7b-cabs-v0.4.json rename to data/models/gz987_qwen2.5-7b-cabs-v0.4.json diff --git a/data/h2oai_h2o-danube-1.8b-chat.json b/data/models/h2oai_h2o-danube-1.8b-chat.json similarity index 100% rename from data/h2oai_h2o-danube-1.8b-chat.json rename to data/models/h2oai_h2o-danube-1.8b-chat.json diff --git a/data/h2oai_h2o-danube3-4b-base.json b/data/models/h2oai_h2o-danube3-4b-base.json similarity index 100% rename from data/h2oai_h2o-danube3-4b-base.json rename to data/models/h2oai_h2o-danube3-4b-base.json diff --git a/data/h2oai_h2o-danube3-4b-chat.json b/data/models/h2oai_h2o-danube3-4b-chat.json similarity index 100% rename from data/h2oai_h2o-danube3-4b-chat.json rename to data/models/h2oai_h2o-danube3-4b-chat.json diff --git a/data/h2oai_h2o-danube3-500m-chat.json b/data/models/h2oai_h2o-danube3-500m-chat.json similarity index 100% rename from data/h2oai_h2o-danube3-500m-chat.json rename to data/models/h2oai_h2o-danube3-500m-chat.json diff --git a/data/h2oai_h2o-danube3.1-4b-chat.json b/data/models/h2oai_h2o-danube3.1-4b-chat.json similarity index 100% rename from data/h2oai_h2o-danube3.1-4b-chat.json rename to data/models/h2oai_h2o-danube3.1-4b-chat.json diff --git a/data/haoranxu_ALMA-13B-R.json b/data/models/haoranxu_ALMA-13B-R.json similarity index 100% rename from data/haoranxu_ALMA-13B-R.json rename to data/models/haoranxu_ALMA-13B-R.json diff --git a/data/haoranxu_Llama-3-Instruct-8B-CPO-SimPO.json b/data/models/haoranxu_Llama-3-Instruct-8B-CPO-SimPO.json similarity index 100% rename from data/haoranxu_Llama-3-Instruct-8B-CPO-SimPO.json rename to data/models/haoranxu_Llama-3-Instruct-8B-CPO-SimPO.json diff --git a/data/haoranxu_Llama-3-Instruct-8B-SimPO.json b/data/models/haoranxu_Llama-3-Instruct-8B-SimPO.json similarity index 100% rename from data/haoranxu_Llama-3-Instruct-8B-SimPO.json rename to data/models/haoranxu_Llama-3-Instruct-8B-SimPO.json diff --git a/data/hatemmahmoud_qwen2.5-1.5b-sft-raft-grpo-hra-doc.json b/data/models/hatemmahmoud_qwen2.5-1.5b-sft-raft-grpo-hra-doc.json similarity index 100% rename from data/hatemmahmoud_qwen2.5-1.5b-sft-raft-grpo-hra-doc.json rename to data/models/hatemmahmoud_qwen2.5-1.5b-sft-raft-grpo-hra-doc.json diff --git a/data/hendrydong_Mistral-RM-for-RAFT-GSHF-v0.json b/data/models/hendrydong_Mistral-RM-for-RAFT-GSHF-v0.json similarity index 100% rename from data/hendrydong_Mistral-RM-for-RAFT-GSHF-v0.json rename to data/models/hendrydong_Mistral-RM-for-RAFT-GSHF-v0.json index 69bdc21b5b14ca68b6163472e8eef6fd89e268f5..357f438b07831c53712dd63f870b6b2401c4d681 100644 --- a/data/hendrydong_Mistral-RM-for-RAFT-GSHF-v0.json +++ b/data/models/hendrydong_Mistral-RM-for-RAFT-GSHF-v0.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/hendrydong_Mistral-RM-for-RAFT-GSHF-v0/1766412838.146816", + "evaluation_id": "reward-bench-2/hendrydong_Mistral-RM-for-RAFT-GSHF-v0/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7847 + "score": 0.5851 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9832 + "score": 0.5779 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5789 + "score": 0.3625 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6011 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.85 + "score": 0.6956 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7434 + "score": 0.6747 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7508 + "score": 0.5988 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/hendrydong_Mistral-RM-for-RAFT-GSHF-v0/1766412838.146816", + "evaluation_id": "reward-bench/hendrydong_Mistral-RM-for-RAFT-GSHF-v0/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5851 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5779 + "score": 0.7847 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3625 + "score": 0.9832 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6011 + "score": 0.5789 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6956 + "score": 0.85 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6747 + "score": 0.7434 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5988 + "score": 0.7508 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/hon9kon9ize_CantoneseLLMChat-v0.5.json b/data/models/hon9kon9ize_CantoneseLLMChat-v0.5.json similarity index 100% rename from data/hon9kon9ize_CantoneseLLMChat-v0.5.json rename to data/models/hon9kon9ize_CantoneseLLMChat-v0.5.json diff --git a/data/hon9kon9ize_CantoneseLLMChat-v1.0-7B.json b/data/models/hon9kon9ize_CantoneseLLMChat-v1.0-7B.json similarity index 100% rename from data/hon9kon9ize_CantoneseLLMChat-v1.0-7B.json rename to data/models/hon9kon9ize_CantoneseLLMChat-v1.0-7B.json diff --git a/data/hongbai12_li-0.4-pre.json b/data/models/hongbai12_li-0.4-pre.json similarity index 100% rename from data/hongbai12_li-0.4-pre.json rename to data/models/hongbai12_li-0.4-pre.json diff --git a/data/hotmailuser_Deepseek-qwen-modelstock-2B.json b/data/models/hotmailuser_Deepseek-qwen-modelstock-2B.json similarity index 100% rename from data/hotmailuser_Deepseek-qwen-modelstock-2B.json rename to data/models/hotmailuser_Deepseek-qwen-modelstock-2B.json diff --git a/data/hotmailuser_Falcon3Slerp1-10B.json b/data/models/hotmailuser_Falcon3Slerp1-10B.json similarity index 100% rename from data/hotmailuser_Falcon3Slerp1-10B.json rename to data/models/hotmailuser_Falcon3Slerp1-10B.json diff --git a/data/hotmailuser_Falcon3Slerp2-10B.json b/data/models/hotmailuser_Falcon3Slerp2-10B.json similarity index 100% rename from data/hotmailuser_Falcon3Slerp2-10B.json rename to data/models/hotmailuser_Falcon3Slerp2-10B.json diff --git a/data/hotmailuser_Falcon3Slerp4-10B.json b/data/models/hotmailuser_Falcon3Slerp4-10B.json similarity index 100% rename from data/hotmailuser_Falcon3Slerp4-10B.json rename to data/models/hotmailuser_Falcon3Slerp4-10B.json diff --git a/data/hotmailuser_FalconSlerp-3B.json b/data/models/hotmailuser_FalconSlerp-3B.json similarity index 100% rename from data/hotmailuser_FalconSlerp-3B.json rename to data/models/hotmailuser_FalconSlerp-3B.json diff --git a/data/hotmailuser_FalconSlerp1-7B.json b/data/models/hotmailuser_FalconSlerp1-7B.json similarity index 100% rename from data/hotmailuser_FalconSlerp1-7B.json rename to data/models/hotmailuser_FalconSlerp1-7B.json diff --git a/data/hotmailuser_FalconSlerp2-7B.json b/data/models/hotmailuser_FalconSlerp2-7B.json similarity index 100% rename from data/hotmailuser_FalconSlerp2-7B.json rename to data/models/hotmailuser_FalconSlerp2-7B.json diff --git a/data/hotmailuser_FalconSlerp3-10B.json b/data/models/hotmailuser_FalconSlerp3-10B.json similarity index 100% rename from data/hotmailuser_FalconSlerp3-10B.json rename to data/models/hotmailuser_FalconSlerp3-10B.json diff --git a/data/hotmailuser_FalconSlerp3-7B.json b/data/models/hotmailuser_FalconSlerp3-7B.json similarity index 100% rename from data/hotmailuser_FalconSlerp3-7B.json rename to data/models/hotmailuser_FalconSlerp3-7B.json diff --git a/data/hotmailuser_FalconSlerp4-7B.json b/data/models/hotmailuser_FalconSlerp4-7B.json similarity index 100% rename from data/hotmailuser_FalconSlerp4-7B.json rename to data/models/hotmailuser_FalconSlerp4-7B.json diff --git a/data/hotmailuser_FalconSlerp6-7B.json b/data/models/hotmailuser_FalconSlerp6-7B.json similarity index 100% rename from data/hotmailuser_FalconSlerp6-7B.json rename to data/models/hotmailuser_FalconSlerp6-7B.json diff --git a/data/hotmailuser_Gemma2Crono-27B.json b/data/models/hotmailuser_Gemma2Crono-27B.json similarity index 100% rename from data/hotmailuser_Gemma2Crono-27B.json rename to data/models/hotmailuser_Gemma2Crono-27B.json diff --git a/data/hotmailuser_Gemma2SimPO-27B.json b/data/models/hotmailuser_Gemma2SimPO-27B.json similarity index 100% rename from data/hotmailuser_Gemma2SimPO-27B.json rename to data/models/hotmailuser_Gemma2SimPO-27B.json diff --git a/data/hotmailuser_Gemma2atlas-27B.json b/data/models/hotmailuser_Gemma2atlas-27B.json similarity index 100% rename from data/hotmailuser_Gemma2atlas-27B.json rename to data/models/hotmailuser_Gemma2atlas-27B.json diff --git a/data/hotmailuser_Gemma2magnum-27b.json b/data/models/hotmailuser_Gemma2magnum-27b.json similarity index 100% rename from data/hotmailuser_Gemma2magnum-27b.json rename to data/models/hotmailuser_Gemma2magnum-27b.json diff --git a/data/hotmailuser_Llama-Hermes-slerp-8B.json b/data/models/hotmailuser_Llama-Hermes-slerp-8B.json similarity index 100% rename from data/hotmailuser_Llama-Hermes-slerp-8B.json rename to data/models/hotmailuser_Llama-Hermes-slerp-8B.json diff --git a/data/hotmailuser_Llama-Hermes-slerp2-8B.json b/data/models/hotmailuser_Llama-Hermes-slerp2-8B.json similarity index 100% rename from data/hotmailuser_Llama-Hermes-slerp2-8B.json rename to data/models/hotmailuser_Llama-Hermes-slerp2-8B.json diff --git a/data/hotmailuser_LlamaStock-8B.json b/data/models/hotmailuser_LlamaStock-8B.json similarity index 100% rename from data/hotmailuser_LlamaStock-8B.json rename to data/models/hotmailuser_LlamaStock-8B.json diff --git a/data/hotmailuser_Mistral-modelstock-24B.json b/data/models/hotmailuser_Mistral-modelstock-24B.json similarity index 100% rename from data/hotmailuser_Mistral-modelstock-24B.json rename to data/models/hotmailuser_Mistral-modelstock-24B.json diff --git a/data/hotmailuser_Mistral-modelstock2-24B.json b/data/models/hotmailuser_Mistral-modelstock2-24B.json similarity index 100% rename from data/hotmailuser_Mistral-modelstock2-24B.json rename to data/models/hotmailuser_Mistral-modelstock2-24B.json diff --git a/data/hotmailuser_Phi4-Slerp4-14B.json b/data/models/hotmailuser_Phi4-Slerp4-14B.json similarity index 100% rename from data/hotmailuser_Phi4-Slerp4-14B.json rename to data/models/hotmailuser_Phi4-Slerp4-14B.json diff --git a/data/hotmailuser_Qwen2.5-HomerSlerp-7B.json b/data/models/hotmailuser_Qwen2.5-HomerSlerp-7B.json similarity index 100% rename from data/hotmailuser_Qwen2.5-HomerSlerp-7B.json rename to data/models/hotmailuser_Qwen2.5-HomerSlerp-7B.json diff --git a/data/hotmailuser_QwenModelStock-1.8B.json b/data/models/hotmailuser_QwenModelStock-1.8B.json similarity index 100% rename from data/hotmailuser_QwenModelStock-1.8B.json rename to data/models/hotmailuser_QwenModelStock-1.8B.json diff --git a/data/hotmailuser_QwenSlerp-14B.json b/data/models/hotmailuser_QwenSlerp-14B.json similarity index 100% rename from data/hotmailuser_QwenSlerp-14B.json rename to data/models/hotmailuser_QwenSlerp-14B.json diff --git a/data/hotmailuser_QwenSlerp-3B.json b/data/models/hotmailuser_QwenSlerp-3B.json similarity index 100% rename from data/hotmailuser_QwenSlerp-3B.json rename to data/models/hotmailuser_QwenSlerp-3B.json diff --git a/data/hotmailuser_QwenSlerp-7B.json b/data/models/hotmailuser_QwenSlerp-7B.json similarity index 100% rename from data/hotmailuser_QwenSlerp-7B.json rename to data/models/hotmailuser_QwenSlerp-7B.json diff --git a/data/hotmailuser_QwenSlerp2-14B.json b/data/models/hotmailuser_QwenSlerp2-14B.json similarity index 100% rename from data/hotmailuser_QwenSlerp2-14B.json rename to data/models/hotmailuser_QwenSlerp2-14B.json diff --git a/data/hotmailuser_QwenSlerp2-3B.json b/data/models/hotmailuser_QwenSlerp2-3B.json similarity index 100% rename from data/hotmailuser_QwenSlerp2-3B.json rename to data/models/hotmailuser_QwenSlerp2-3B.json diff --git a/data/hotmailuser_QwenSlerp3-14B.json b/data/models/hotmailuser_QwenSlerp3-14B.json similarity index 100% rename from data/hotmailuser_QwenSlerp3-14B.json rename to data/models/hotmailuser_QwenSlerp3-14B.json diff --git a/data/hotmailuser_QwenSparse-7B.json b/data/models/hotmailuser_QwenSparse-7B.json similarity index 100% rename from data/hotmailuser_QwenSparse-7B.json rename to data/models/hotmailuser_QwenSparse-7B.json diff --git a/data/hotmailuser_QwenStock-0.5B.json b/data/models/hotmailuser_QwenStock-0.5B.json similarity index 100% rename from data/hotmailuser_QwenStock-0.5B.json rename to data/models/hotmailuser_QwenStock-0.5B.json diff --git a/data/hotmailuser_QwenStock-1.7B.json b/data/models/hotmailuser_QwenStock-1.7B.json similarity index 100% rename from data/hotmailuser_QwenStock-1.7B.json rename to data/models/hotmailuser_QwenStock-1.7B.json diff --git a/data/hotmailuser_QwenStock1-14B.json b/data/models/hotmailuser_QwenStock1-14B.json similarity index 100% rename from data/hotmailuser_QwenStock1-14B.json rename to data/models/hotmailuser_QwenStock1-14B.json diff --git a/data/hotmailuser_RombosBeagle-v2beta-MGS-32B.json b/data/models/hotmailuser_RombosBeagle-v2beta-MGS-32B.json similarity index 100% rename from data/hotmailuser_RombosBeagle-v2beta-MGS-32B.json rename to data/models/hotmailuser_RombosBeagle-v2beta-MGS-32B.json diff --git a/data/huggyllama_llama-13b.json b/data/models/huggyllama_llama-13b.json similarity index 100% rename from data/huggyllama_llama-13b.json rename to data/models/huggyllama_llama-13b.json diff --git a/data/huggyllama_llama-65b.json b/data/models/huggyllama_llama-65b.json similarity index 100% rename from data/huggyllama_llama-65b.json rename to data/models/huggyllama_llama-65b.json diff --git a/data/huggyllama_llama-7b.json b/data/models/huggyllama_llama-7b.json similarity index 100% rename from data/huggyllama_llama-7b.json rename to data/models/huggyllama_llama-7b.json diff --git a/data/huihui-ai_DeepSeek-R1-Distill-Qwen-14B-abliterated-v2.json b/data/models/huihui-ai_DeepSeek-R1-Distill-Qwen-14B-abliterated-v2.json similarity index 100% rename from data/huihui-ai_DeepSeek-R1-Distill-Qwen-14B-abliterated-v2.json rename to data/models/huihui-ai_DeepSeek-R1-Distill-Qwen-14B-abliterated-v2.json diff --git a/data/huihui-ai_QwQ-32B-Coder-Fusion-7030.json b/data/models/huihui-ai_QwQ-32B-Coder-Fusion-7030.json similarity index 100% rename from data/huihui-ai_QwQ-32B-Coder-Fusion-7030.json rename to data/models/huihui-ai_QwQ-32B-Coder-Fusion-7030.json diff --git a/data/huihui-ai_QwQ-32B-Coder-Fusion-8020.json b/data/models/huihui-ai_QwQ-32B-Coder-Fusion-8020.json similarity index 100% rename from data/huihui-ai_QwQ-32B-Coder-Fusion-8020.json rename to data/models/huihui-ai_QwQ-32B-Coder-Fusion-8020.json diff --git a/data/huihui-ai_QwQ-32B-Coder-Fusion-9010.json b/data/models/huihui-ai_QwQ-32B-Coder-Fusion-9010.json similarity index 100% rename from data/huihui-ai_QwQ-32B-Coder-Fusion-9010.json rename to data/models/huihui-ai_QwQ-32B-Coder-Fusion-9010.json diff --git a/data/huihui-ai_Qwen2.5-14B-Instruct-abliterated-v2.json b/data/models/huihui-ai_Qwen2.5-14B-Instruct-abliterated-v2.json similarity index 100% rename from data/huihui-ai_Qwen2.5-14B-Instruct-abliterated-v2.json rename to data/models/huihui-ai_Qwen2.5-14B-Instruct-abliterated-v2.json diff --git a/data/huihui-ai_Qwen2.5-72B-Instruct-abliterated.json b/data/models/huihui-ai_Qwen2.5-72B-Instruct-abliterated.json similarity index 100% rename from data/huihui-ai_Qwen2.5-72B-Instruct-abliterated.json rename to data/models/huihui-ai_Qwen2.5-72B-Instruct-abliterated.json diff --git a/data/huihui-ai_Qwen2.5-7B-Instruct-abliterated-v2.json b/data/models/huihui-ai_Qwen2.5-7B-Instruct-abliterated-v2.json similarity index 100% rename from data/huihui-ai_Qwen2.5-7B-Instruct-abliterated-v2.json rename to data/models/huihui-ai_Qwen2.5-7B-Instruct-abliterated-v2.json diff --git a/data/huihui-ai_Qwen2.5-7B-Instruct-abliterated.json b/data/models/huihui-ai_Qwen2.5-7B-Instruct-abliterated.json similarity index 100% rename from data/huihui-ai_Qwen2.5-7B-Instruct-abliterated.json rename to data/models/huihui-ai_Qwen2.5-7B-Instruct-abliterated.json diff --git a/data/huu-ontocord_wide_3b_orpo_stage1.1-ss1-orpo3.json b/data/models/huu-ontocord_wide_3b_orpo_stage1.1-ss1-orpo3.json similarity index 100% rename from data/huu-ontocord_wide_3b_orpo_stage1.1-ss1-orpo3.json rename to data/models/huu-ontocord_wide_3b_orpo_stage1.1-ss1-orpo3.json diff --git a/data/iFaz_llama31_8B_en_emo_v4.json b/data/models/iFaz_llama31_8B_en_emo_v4.json similarity index 100% rename from data/iFaz_llama31_8B_en_emo_v4.json rename to data/models/iFaz_llama31_8B_en_emo_v4.json diff --git a/data/iFaz_llama32_1B_en_emo_v1.json b/data/models/iFaz_llama32_1B_en_emo_v1.json similarity index 100% rename from data/iFaz_llama32_1B_en_emo_v1.json rename to data/models/iFaz_llama32_1B_en_emo_v1.json diff --git a/data/iFaz_llama32_3B_en_emo_1000_stp.json b/data/models/iFaz_llama32_3B_en_emo_1000_stp.json similarity index 100% rename from data/iFaz_llama32_3B_en_emo_1000_stp.json rename to data/models/iFaz_llama32_3B_en_emo_1000_stp.json diff --git a/data/iFaz_llama32_3B_en_emo_2000_stp.json b/data/models/iFaz_llama32_3B_en_emo_2000_stp.json similarity index 100% rename from data/iFaz_llama32_3B_en_emo_2000_stp.json rename to data/models/iFaz_llama32_3B_en_emo_2000_stp.json diff --git a/data/iFaz_llama32_3B_en_emo_300_stp.json b/data/models/iFaz_llama32_3B_en_emo_300_stp.json similarity index 100% rename from data/iFaz_llama32_3B_en_emo_300_stp.json rename to data/models/iFaz_llama32_3B_en_emo_300_stp.json diff --git a/data/iFaz_llama32_3B_en_emo_5000_stp.json b/data/models/iFaz_llama32_3B_en_emo_5000_stp.json similarity index 100% rename from data/iFaz_llama32_3B_en_emo_5000_stp.json rename to data/models/iFaz_llama32_3B_en_emo_5000_stp.json diff --git a/data/iFaz_llama32_3B_en_emo_v2.json b/data/models/iFaz_llama32_3B_en_emo_v2.json similarity index 100% rename from data/iFaz_llama32_3B_en_emo_v2.json rename to data/models/iFaz_llama32_3B_en_emo_v2.json diff --git a/data/iFaz_llama32_3B_en_emo_v3.json b/data/models/iFaz_llama32_3B_en_emo_v3.json similarity index 100% rename from data/iFaz_llama32_3B_en_emo_v3.json rename to data/models/iFaz_llama32_3B_en_emo_v3.json diff --git a/data/iRyanBell_ARC1-II.json b/data/models/iRyanBell_ARC1-II.json similarity index 100% rename from data/iRyanBell_ARC1-II.json rename to data/models/iRyanBell_ARC1-II.json diff --git a/data/iRyanBell_ARC1.json b/data/models/iRyanBell_ARC1.json similarity index 100% rename from data/iRyanBell_ARC1.json rename to data/models/iRyanBell_ARC1.json diff --git a/data/ibivibiv_colossus_120b.json b/data/models/ibivibiv_colossus_120b.json similarity index 100% rename from data/ibivibiv_colossus_120b.json rename to data/models/ibivibiv_colossus_120b.json diff --git a/data/ibivibiv_multimaster-7b-v6.json b/data/models/ibivibiv_multimaster-7b-v6.json similarity index 100% rename from data/ibivibiv_multimaster-7b-v6.json rename to data/models/ibivibiv_multimaster-7b-v6.json diff --git a/data/ibm-granite_granite-3.0-1b-a400m-base.json b/data/models/ibm-granite_granite-3.0-1b-a400m-base.json similarity index 100% rename from data/ibm-granite_granite-3.0-1b-a400m-base.json rename to data/models/ibm-granite_granite-3.0-1b-a400m-base.json diff --git a/data/ibm-granite_granite-3.0-1b-a400m-instruct.json b/data/models/ibm-granite_granite-3.0-1b-a400m-instruct.json similarity index 100% rename from data/ibm-granite_granite-3.0-1b-a400m-instruct.json rename to data/models/ibm-granite_granite-3.0-1b-a400m-instruct.json diff --git a/data/ibm-granite_granite-3.0-2b-base.json b/data/models/ibm-granite_granite-3.0-2b-base.json similarity index 100% rename from data/ibm-granite_granite-3.0-2b-base.json rename to data/models/ibm-granite_granite-3.0-2b-base.json diff --git a/data/ibm-granite_granite-3.0-2b-instruct.json b/data/models/ibm-granite_granite-3.0-2b-instruct.json similarity index 100% rename from data/ibm-granite_granite-3.0-2b-instruct.json rename to data/models/ibm-granite_granite-3.0-2b-instruct.json diff --git a/data/ibm-granite_granite-3.0-3b-a800m-base.json b/data/models/ibm-granite_granite-3.0-3b-a800m-base.json similarity index 100% rename from data/ibm-granite_granite-3.0-3b-a800m-base.json rename to data/models/ibm-granite_granite-3.0-3b-a800m-base.json diff --git a/data/ibm-granite_granite-3.0-3b-a800m-instruct.json b/data/models/ibm-granite_granite-3.0-3b-a800m-instruct.json similarity index 100% rename from data/ibm-granite_granite-3.0-3b-a800m-instruct.json rename to data/models/ibm-granite_granite-3.0-3b-a800m-instruct.json diff --git a/data/ibm-granite_granite-3.0-8b-base.json b/data/models/ibm-granite_granite-3.0-8b-base.json similarity index 100% rename from data/ibm-granite_granite-3.0-8b-base.json rename to data/models/ibm-granite_granite-3.0-8b-base.json diff --git a/data/ibm-granite_granite-3.0-8b-instruct.json b/data/models/ibm-granite_granite-3.0-8b-instruct.json similarity index 100% rename from data/ibm-granite_granite-3.0-8b-instruct.json rename to data/models/ibm-granite_granite-3.0-8b-instruct.json diff --git a/data/ibm-granite_granite-3.1-1b-a400m-base.json b/data/models/ibm-granite_granite-3.1-1b-a400m-base.json similarity index 100% rename from data/ibm-granite_granite-3.1-1b-a400m-base.json rename to data/models/ibm-granite_granite-3.1-1b-a400m-base.json diff --git a/data/ibm-granite_granite-3.1-1b-a400m-instruct.json b/data/models/ibm-granite_granite-3.1-1b-a400m-instruct.json similarity index 100% rename from data/ibm-granite_granite-3.1-1b-a400m-instruct.json rename to data/models/ibm-granite_granite-3.1-1b-a400m-instruct.json diff --git a/data/ibm-granite_granite-3.1-2b-base.json b/data/models/ibm-granite_granite-3.1-2b-base.json similarity index 100% rename from data/ibm-granite_granite-3.1-2b-base.json rename to data/models/ibm-granite_granite-3.1-2b-base.json diff --git a/data/ibm-granite_granite-3.1-2b-instruct.json b/data/models/ibm-granite_granite-3.1-2b-instruct.json similarity index 100% rename from data/ibm-granite_granite-3.1-2b-instruct.json rename to data/models/ibm-granite_granite-3.1-2b-instruct.json diff --git a/data/ibm-granite_granite-3.1-3b-a800m-base.json b/data/models/ibm-granite_granite-3.1-3b-a800m-base.json similarity index 100% rename from data/ibm-granite_granite-3.1-3b-a800m-base.json rename to data/models/ibm-granite_granite-3.1-3b-a800m-base.json diff --git a/data/ibm-granite_granite-3.1-3b-a800m-instruct.json b/data/models/ibm-granite_granite-3.1-3b-a800m-instruct.json similarity index 100% rename from data/ibm-granite_granite-3.1-3b-a800m-instruct.json rename to data/models/ibm-granite_granite-3.1-3b-a800m-instruct.json diff --git a/data/ibm-granite_granite-3.1-8b-base.json b/data/models/ibm-granite_granite-3.1-8b-base.json similarity index 100% rename from data/ibm-granite_granite-3.1-8b-base.json rename to data/models/ibm-granite_granite-3.1-8b-base.json diff --git a/data/ibm-granite_granite-3.1-8b-instruct.json b/data/models/ibm-granite_granite-3.1-8b-instruct.json similarity index 100% rename from data/ibm-granite_granite-3.1-8b-instruct.json rename to data/models/ibm-granite_granite-3.1-8b-instruct.json diff --git a/data/ibm-granite_granite-3.2-2b-instruct.json b/data/models/ibm-granite_granite-3.2-2b-instruct.json similarity index 100% rename from data/ibm-granite_granite-3.2-2b-instruct.json rename to data/models/ibm-granite_granite-3.2-2b-instruct.json diff --git a/data/ibm-granite_granite-3.2-8b-instruct.json b/data/models/ibm-granite_granite-3.2-8b-instruct.json similarity index 100% rename from data/ibm-granite_granite-3.2-8b-instruct.json rename to data/models/ibm-granite_granite-3.2-8b-instruct.json diff --git a/data/ibm-granite_granite-7b-base.json b/data/models/ibm-granite_granite-7b-base.json similarity index 100% rename from data/ibm-granite_granite-7b-base.json rename to data/models/ibm-granite_granite-7b-base.json diff --git a/data/ibm-granite_granite-7b-instruct.json b/data/models/ibm-granite_granite-7b-instruct.json similarity index 100% rename from data/ibm-granite_granite-7b-instruct.json rename to data/models/ibm-granite_granite-7b-instruct.json diff --git a/data/ibm_PowerLM-3b.json b/data/models/ibm_PowerLM-3b.json similarity index 100% rename from data/ibm_PowerLM-3b.json rename to data/models/ibm_PowerLM-3b.json diff --git a/data/ibm_granite-3.3-8b-instruct.json b/data/models/ibm_granite-3.3-8b-instruct.json similarity index 100% rename from data/ibm_granite-3.3-8b-instruct.json rename to data/models/ibm_granite-3.3-8b-instruct.json diff --git a/data/ibm_granite-4.0-h-small.json b/data/models/ibm_granite-4.0-h-small.json similarity index 100% rename from data/ibm_granite-4.0-h-small.json rename to data/models/ibm_granite-4.0-h-small.json diff --git a/data/ibm_merlinite-7b.json b/data/models/ibm_merlinite-7b.json similarity index 100% rename from data/ibm_merlinite-7b.json rename to data/models/ibm_merlinite-7b.json diff --git a/data/icefog72_Ice0.15-02.10-RP.json b/data/models/icefog72_Ice0.15-02.10-RP.json similarity index 100% rename from data/icefog72_Ice0.15-02.10-RP.json rename to data/models/icefog72_Ice0.15-02.10-RP.json diff --git a/data/icefog72_Ice0.16-02.10-RP.json b/data/models/icefog72_Ice0.16-02.10-RP.json similarity index 100% rename from data/icefog72_Ice0.16-02.10-RP.json rename to data/models/icefog72_Ice0.16-02.10-RP.json diff --git a/data/icefog72_Ice0.17-03.10-RP.json b/data/models/icefog72_Ice0.17-03.10-RP.json similarity index 100% rename from data/icefog72_Ice0.17-03.10-RP.json rename to data/models/icefog72_Ice0.17-03.10-RP.json diff --git a/data/icefog72_Ice0.27-06.11-RP.json b/data/models/icefog72_Ice0.27-06.11-RP.json similarity index 100% rename from data/icefog72_Ice0.27-06.11-RP.json rename to data/models/icefog72_Ice0.27-06.11-RP.json diff --git a/data/icefog72_Ice0.29-06.11-RP.json b/data/models/icefog72_Ice0.29-06.11-RP.json similarity index 100% rename from data/icefog72_Ice0.29-06.11-RP.json rename to data/models/icefog72_Ice0.29-06.11-RP.json diff --git a/data/icefog72_Ice0.31-08.11-RP.json b/data/models/icefog72_Ice0.31-08.11-RP.json similarity index 100% rename from data/icefog72_Ice0.31-08.11-RP.json rename to data/models/icefog72_Ice0.31-08.11-RP.json diff --git a/data/icefog72_Ice0.32-10.11-RP.json b/data/models/icefog72_Ice0.32-10.11-RP.json similarity index 100% rename from data/icefog72_Ice0.32-10.11-RP.json rename to data/models/icefog72_Ice0.32-10.11-RP.json diff --git a/data/icefog72_Ice0.34b-14.11-RP.json b/data/models/icefog72_Ice0.34b-14.11-RP.json similarity index 100% rename from data/icefog72_Ice0.34b-14.11-RP.json rename to data/models/icefog72_Ice0.34b-14.11-RP.json diff --git a/data/icefog72_Ice0.34n-14.11-RP.json b/data/models/icefog72_Ice0.34n-14.11-RP.json similarity index 100% rename from data/icefog72_Ice0.34n-14.11-RP.json rename to data/models/icefog72_Ice0.34n-14.11-RP.json diff --git a/data/icefog72_Ice0.37-18.11-RP.json b/data/models/icefog72_Ice0.37-18.11-RP.json similarity index 100% rename from data/icefog72_Ice0.37-18.11-RP.json rename to data/models/icefog72_Ice0.37-18.11-RP.json diff --git a/data/icefog72_Ice0.38-19.11-RP.json b/data/models/icefog72_Ice0.38-19.11-RP.json similarity index 100% rename from data/icefog72_Ice0.38-19.11-RP.json rename to data/models/icefog72_Ice0.38-19.11-RP.json diff --git a/data/icefog72_Ice0.39-19.11-RP.json b/data/models/icefog72_Ice0.39-19.11-RP.json similarity index 100% rename from data/icefog72_Ice0.39-19.11-RP.json rename to data/models/icefog72_Ice0.39-19.11-RP.json diff --git a/data/icefog72_Ice0.40-20.11-RP.json b/data/models/icefog72_Ice0.40-20.11-RP.json similarity index 100% rename from data/icefog72_Ice0.40-20.11-RP.json rename to data/models/icefog72_Ice0.40-20.11-RP.json diff --git a/data/icefog72_Ice0.41-22.11-RP.json b/data/models/icefog72_Ice0.41-22.11-RP.json similarity index 100% rename from data/icefog72_Ice0.41-22.11-RP.json rename to data/models/icefog72_Ice0.41-22.11-RP.json diff --git a/data/icefog72_Ice0.50-16.01-RP.json b/data/models/icefog72_Ice0.50-16.01-RP.json similarity index 100% rename from data/icefog72_Ice0.50-16.01-RP.json rename to data/models/icefog72_Ice0.50-16.01-RP.json diff --git a/data/icefog72_Ice0.50.1-16.01-RP.json b/data/models/icefog72_Ice0.50.1-16.01-RP.json similarity index 100% rename from data/icefog72_Ice0.50.1-16.01-RP.json rename to data/models/icefog72_Ice0.50.1-16.01-RP.json diff --git a/data/icefog72_Ice0.51-16.01-RP.json b/data/models/icefog72_Ice0.51-16.01-RP.json similarity index 100% rename from data/icefog72_Ice0.51-16.01-RP.json rename to data/models/icefog72_Ice0.51-16.01-RP.json diff --git a/data/icefog72_Ice0.51.1-16.01-RP.json b/data/models/icefog72_Ice0.51.1-16.01-RP.json similarity index 100% rename from data/icefog72_Ice0.51.1-16.01-RP.json rename to data/models/icefog72_Ice0.51.1-16.01-RP.json diff --git a/data/icefog72_Ice0.52-16.01-RP.json b/data/models/icefog72_Ice0.52-16.01-RP.json similarity index 100% rename from data/icefog72_Ice0.52-16.01-RP.json rename to data/models/icefog72_Ice0.52-16.01-RP.json diff --git a/data/icefog72_Ice0.52.1-16.01-RP.json b/data/models/icefog72_Ice0.52.1-16.01-RP.json similarity index 100% rename from data/icefog72_Ice0.52.1-16.01-RP.json rename to data/models/icefog72_Ice0.52.1-16.01-RP.json diff --git a/data/icefog72_Ice0.53-16.01-RP.json b/data/models/icefog72_Ice0.53-16.01-RP.json similarity index 100% rename from data/icefog72_Ice0.53-16.01-RP.json rename to data/models/icefog72_Ice0.53-16.01-RP.json diff --git a/data/icefog72_Ice0.54-17.01-RP.json b/data/models/icefog72_Ice0.54-17.01-RP.json similarity index 100% rename from data/icefog72_Ice0.54-17.01-RP.json rename to data/models/icefog72_Ice0.54-17.01-RP.json diff --git a/data/icefog72_Ice0.55-17.01-RP.json b/data/models/icefog72_Ice0.55-17.01-RP.json similarity index 100% rename from data/icefog72_Ice0.55-17.01-RP.json rename to data/models/icefog72_Ice0.55-17.01-RP.json diff --git a/data/icefog72_Ice0.57-17.01-RP.json b/data/models/icefog72_Ice0.57-17.01-RP.json similarity index 100% rename from data/icefog72_Ice0.57-17.01-RP.json rename to data/models/icefog72_Ice0.57-17.01-RP.json diff --git a/data/icefog72_Ice0.60-18.01-RP.json b/data/models/icefog72_Ice0.60-18.01-RP.json similarity index 100% rename from data/icefog72_Ice0.60-18.01-RP.json rename to data/models/icefog72_Ice0.60-18.01-RP.json diff --git a/data/icefog72_Ice0.60.1-18.01-RP.json b/data/models/icefog72_Ice0.60.1-18.01-RP.json similarity index 100% rename from data/icefog72_Ice0.60.1-18.01-RP.json rename to data/models/icefog72_Ice0.60.1-18.01-RP.json diff --git a/data/icefog72_Ice0.61-18.01-RP.json b/data/models/icefog72_Ice0.61-18.01-RP.json similarity index 100% rename from data/icefog72_Ice0.61-18.01-RP.json rename to data/models/icefog72_Ice0.61-18.01-RP.json diff --git a/data/icefog72_Ice0.62-18.01-RP.json b/data/models/icefog72_Ice0.62-18.01-RP.json similarity index 100% rename from data/icefog72_Ice0.62-18.01-RP.json rename to data/models/icefog72_Ice0.62-18.01-RP.json diff --git a/data/icefog72_Ice0.62.1-24.01-RP.json b/data/models/icefog72_Ice0.62.1-24.01-RP.json similarity index 100% rename from data/icefog72_Ice0.62.1-24.01-RP.json rename to data/models/icefog72_Ice0.62.1-24.01-RP.json diff --git a/data/icefog72_Ice0.64-24.01-RP.json b/data/models/icefog72_Ice0.64-24.01-RP.json similarity index 100% rename from data/icefog72_Ice0.64-24.01-RP.json rename to data/models/icefog72_Ice0.64-24.01-RP.json diff --git a/data/icefog72_Ice0.64.1-24.01-RP.json b/data/models/icefog72_Ice0.64.1-24.01-RP.json similarity index 100% rename from data/icefog72_Ice0.64.1-24.01-RP.json rename to data/models/icefog72_Ice0.64.1-24.01-RP.json diff --git a/data/icefog72_Ice0.65-25.01-RP.json b/data/models/icefog72_Ice0.65-25.01-RP.json similarity index 100% rename from data/icefog72_Ice0.65-25.01-RP.json rename to data/models/icefog72_Ice0.65-25.01-RP.json diff --git a/data/icefog72_Ice0.66-25.01-RP.json b/data/models/icefog72_Ice0.66-25.01-RP.json similarity index 100% rename from data/icefog72_Ice0.66-25.01-RP.json rename to data/models/icefog72_Ice0.66-25.01-RP.json diff --git a/data/icefog72_Ice0.67-25.01-RP.json b/data/models/icefog72_Ice0.67-25.01-RP.json similarity index 100% rename from data/icefog72_Ice0.67-25.01-RP.json rename to data/models/icefog72_Ice0.67-25.01-RP.json diff --git a/data/icefog72_Ice0.68-25.01-RP.json b/data/models/icefog72_Ice0.68-25.01-RP.json similarity index 100% rename from data/icefog72_Ice0.68-25.01-RP.json rename to data/models/icefog72_Ice0.68-25.01-RP.json diff --git a/data/icefog72_Ice0.69-25.01-RP.json b/data/models/icefog72_Ice0.69-25.01-RP.json similarity index 100% rename from data/icefog72_Ice0.69-25.01-RP.json rename to data/models/icefog72_Ice0.69-25.01-RP.json diff --git a/data/icefog72_Ice0.7-29.09-RP.json b/data/models/icefog72_Ice0.7-29.09-RP.json similarity index 100% rename from data/icefog72_Ice0.7-29.09-RP.json rename to data/models/icefog72_Ice0.7-29.09-RP.json diff --git a/data/icefog72_Ice0.70-25.01-RP.json b/data/models/icefog72_Ice0.70-25.01-RP.json similarity index 100% rename from data/icefog72_Ice0.70-25.01-RP.json rename to data/models/icefog72_Ice0.70-25.01-RP.json diff --git a/data/icefog72_Ice0.70.1-01.02-RP.json b/data/models/icefog72_Ice0.70.1-01.02-RP.json similarity index 100% rename from data/icefog72_Ice0.70.1-01.02-RP.json rename to data/models/icefog72_Ice0.70.1-01.02-RP.json diff --git a/data/icefog72_Ice0.73-01.02-RP.json b/data/models/icefog72_Ice0.73-01.02-RP.json similarity index 100% rename from data/icefog72_Ice0.73-01.02-RP.json rename to data/models/icefog72_Ice0.73-01.02-RP.json diff --git a/data/icefog72_Ice0.74-02.02-RP.json b/data/models/icefog72_Ice0.74-02.02-RP.json similarity index 100% rename from data/icefog72_Ice0.74-02.02-RP.json rename to data/models/icefog72_Ice0.74-02.02-RP.json diff --git a/data/icefog72_Ice0.76-02.02-RP.json b/data/models/icefog72_Ice0.76-02.02-RP.json similarity index 100% rename from data/icefog72_Ice0.76-02.02-RP.json rename to data/models/icefog72_Ice0.76-02.02-RP.json diff --git a/data/icefog72_Ice0.77-02.02-RP.json b/data/models/icefog72_Ice0.77-02.02-RP.json similarity index 100% rename from data/icefog72_Ice0.77-02.02-RP.json rename to data/models/icefog72_Ice0.77-02.02-RP.json diff --git a/data/icefog72_Ice0.78-02.02-RP.json b/data/models/icefog72_Ice0.78-02.02-RP.json similarity index 100% rename from data/icefog72_Ice0.78-02.02-RP.json rename to data/models/icefog72_Ice0.78-02.02-RP.json diff --git a/data/icefog72_Ice0.80-03.02-RP.json b/data/models/icefog72_Ice0.80-03.02-RP.json similarity index 100% rename from data/icefog72_Ice0.80-03.02-RP.json rename to data/models/icefog72_Ice0.80-03.02-RP.json diff --git a/data/icefog72_IceCocoaRP-7b.json b/data/models/icefog72_IceCocoaRP-7b.json similarity index 100% rename from data/icefog72_IceCocoaRP-7b.json rename to data/models/icefog72_IceCocoaRP-7b.json diff --git a/data/icefog72_IceCoffeeRP-7b.json b/data/models/icefog72_IceCoffeeRP-7b.json similarity index 100% rename from data/icefog72_IceCoffeeRP-7b.json rename to data/models/icefog72_IceCoffeeRP-7b.json diff --git a/data/icefog72_IceDrinkByFrankensteinV3RP.json b/data/models/icefog72_IceDrinkByFrankensteinV3RP.json similarity index 100% rename from data/icefog72_IceDrinkByFrankensteinV3RP.json rename to data/models/icefog72_IceDrinkByFrankensteinV3RP.json diff --git a/data/icefog72_IceDrinkNameGoesHereRP-7b-Model_Stock.json b/data/models/icefog72_IceDrinkNameGoesHereRP-7b-Model_Stock.json similarity index 100% rename from data/icefog72_IceDrinkNameGoesHereRP-7b-Model_Stock.json rename to data/models/icefog72_IceDrinkNameGoesHereRP-7b-Model_Stock.json diff --git a/data/icefog72_IceDrinkNameNotFoundRP-7b-Model_Stock.json b/data/models/icefog72_IceDrinkNameNotFoundRP-7b-Model_Stock.json similarity index 100% rename from data/icefog72_IceDrinkNameNotFoundRP-7b-Model_Stock.json rename to data/models/icefog72_IceDrinkNameNotFoundRP-7b-Model_Stock.json diff --git a/data/icefog72_IceDrunkCherryRP-7b.json b/data/models/icefog72_IceDrunkCherryRP-7b.json similarity index 100% rename from data/icefog72_IceDrunkCherryRP-7b.json rename to data/models/icefog72_IceDrunkCherryRP-7b.json diff --git a/data/icefog72_IceDrunkenCherryRP-7b.json b/data/models/icefog72_IceDrunkenCherryRP-7b.json similarity index 100% rename from data/icefog72_IceDrunkenCherryRP-7b.json rename to data/models/icefog72_IceDrunkenCherryRP-7b.json diff --git a/data/icefog72_IceEspressoRPv2-7b.json b/data/models/icefog72_IceEspressoRPv2-7b.json similarity index 100% rename from data/icefog72_IceEspressoRPv2-7b.json rename to data/models/icefog72_IceEspressoRPv2-7b.json diff --git a/data/icefog72_IceLemonTeaRP-32k-7b.json b/data/models/icefog72_IceLemonTeaRP-32k-7b.json similarity index 100% rename from data/icefog72_IceLemonTeaRP-32k-7b.json rename to data/models/icefog72_IceLemonTeaRP-32k-7b.json diff --git a/data/icefog72_IceMartiniRP-7b.json b/data/models/icefog72_IceMartiniRP-7b.json similarity index 100% rename from data/icefog72_IceMartiniRP-7b.json rename to data/models/icefog72_IceMartiniRP-7b.json diff --git a/data/icefog72_IceNalyvkaRP-7b.json b/data/models/icefog72_IceNalyvkaRP-7b.json similarity index 100% rename from data/icefog72_IceNalyvkaRP-7b.json rename to data/models/icefog72_IceNalyvkaRP-7b.json diff --git a/data/icefog72_IceSakeRP-7b.json b/data/models/icefog72_IceSakeRP-7b.json similarity index 100% rename from data/icefog72_IceSakeRP-7b.json rename to data/models/icefog72_IceSakeRP-7b.json diff --git a/data/icefog72_IceSakeV4RP-7b.json b/data/models/icefog72_IceSakeV4RP-7b.json similarity index 100% rename from data/icefog72_IceSakeV4RP-7b.json rename to data/models/icefog72_IceSakeV4RP-7b.json diff --git a/data/icefog72_IceSakeV6RP-7b.json b/data/models/icefog72_IceSakeV6RP-7b.json similarity index 100% rename from data/icefog72_IceSakeV6RP-7b.json rename to data/models/icefog72_IceSakeV6RP-7b.json diff --git a/data/icefog72_IceSakeV8RP-7b.json b/data/models/icefog72_IceSakeV8RP-7b.json similarity index 100% rename from data/icefog72_IceSakeV8RP-7b.json rename to data/models/icefog72_IceSakeV8RP-7b.json diff --git a/data/icefog72_IceTea21EnergyDrinkRPV13-DPOv3.5.json b/data/models/icefog72_IceTea21EnergyDrinkRPV13-DPOv3.5.json similarity index 100% rename from data/icefog72_IceTea21EnergyDrinkRPV13-DPOv3.5.json rename to data/models/icefog72_IceTea21EnergyDrinkRPV13-DPOv3.5.json diff --git a/data/icefog72_IceTea21EnergyDrinkRPV13-DPOv3.json b/data/models/icefog72_IceTea21EnergyDrinkRPV13-DPOv3.json similarity index 100% rename from data/icefog72_IceTea21EnergyDrinkRPV13-DPOv3.json rename to data/models/icefog72_IceTea21EnergyDrinkRPV13-DPOv3.json diff --git a/data/ifable_gemma-2-Ifable-9B.json b/data/models/ifable_gemma-2-Ifable-9B.json similarity index 100% rename from data/ifable_gemma-2-Ifable-9B.json rename to data/models/ifable_gemma-2-Ifable-9B.json diff --git a/data/ilsp_Llama-Krikri-8B-Instruct.json b/data/models/ilsp_Llama-Krikri-8B-Instruct.json similarity index 100% rename from data/ilsp_Llama-Krikri-8B-Instruct.json rename to data/models/ilsp_Llama-Krikri-8B-Instruct.json diff --git a/data/inflatebot_MN-12B-Mag-Mell-R1.json b/data/models/inflatebot_MN-12B-Mag-Mell-R1.json similarity index 100% rename from data/inflatebot_MN-12B-Mag-Mell-R1.json rename to data/models/inflatebot_MN-12B-Mag-Mell-R1.json diff --git a/data/infly_INF-ORM-Llama3.1-70B.json b/data/models/infly_INF-ORM-Llama3.1-70B.json similarity index 100% rename from data/infly_INF-ORM-Llama3.1-70B.json rename to data/models/infly_INF-ORM-Llama3.1-70B.json index 82e76ad6cd43b3a105b801967d5f616a3924844a..e7947ee940015eb0652da9a52891a9ab47739595 100644 --- a/data/infly_INF-ORM-Llama3.1-70B.json +++ b/data/models/infly_INF-ORM-Llama3.1-70B.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/infly_INF-ORM-Llama3.1-70B/1766412838.146816", + "evaluation_id": "reward-bench-2/infly_INF-ORM-Llama3.1-70B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,128 +31,104 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9511 + "score": 0.7648 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9665 + "score": 0.7411 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9101 + "score": 0.4188 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9365 + "score": 0.6995 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9912 + "score": 0.9644 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench-2/infly_INF-ORM-Llama3.1-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7648 + "score": 0.903 }, "source_data": { "dataset_name": "RewardBench 2", @@ -161,111 +137,135 @@ } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7411 + "score": 0.8622 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench/infly_INF-ORM-Llama3.1-70B/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Precise IF", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4188 + "score": 0.9511 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6995 + "score": 0.9665 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9644 + "score": 0.9101 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.903 + "score": 0.9365 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8622 + "score": 0.9912 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/informatiker_Qwen2-7B-Instruct-abliterated.json b/data/models/informatiker_Qwen2-7B-Instruct-abliterated.json similarity index 100% rename from data/informatiker_Qwen2-7B-Instruct-abliterated.json rename to data/models/informatiker_Qwen2-7B-Instruct-abliterated.json diff --git a/data/insightfactory_Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model.json b/data/models/insightfactory_Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model.json similarity index 100% rename from data/insightfactory_Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model.json rename to data/models/insightfactory_Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model.json diff --git a/data/instruction-pretrain_InstructLM-500M.json b/data/models/instruction-pretrain_InstructLM-500M.json similarity index 100% rename from data/instruction-pretrain_InstructLM-500M.json rename to data/models/instruction-pretrain_InstructLM-500M.json diff --git a/data/internlm_internlm2-1_8b-reward.json b/data/models/internlm_internlm2-1_8b-reward.json similarity index 100% rename from data/internlm_internlm2-1_8b-reward.json rename to data/models/internlm_internlm2-1_8b-reward.json diff --git a/data/internlm_internlm2-1_8b.json b/data/models/internlm_internlm2-1_8b.json similarity index 100% rename from data/internlm_internlm2-1_8b.json rename to data/models/internlm_internlm2-1_8b.json diff --git a/data/internlm_internlm2-20b-reward.json b/data/models/internlm_internlm2-20b-reward.json similarity index 100% rename from data/internlm_internlm2-20b-reward.json rename to data/models/internlm_internlm2-20b-reward.json diff --git a/data/internlm_internlm2-7b-reward.json b/data/models/internlm_internlm2-7b-reward.json similarity index 100% rename from data/internlm_internlm2-7b-reward.json rename to data/models/internlm_internlm2-7b-reward.json diff --git a/data/internlm_internlm2-7b.json b/data/models/internlm_internlm2-7b.json similarity index 100% rename from data/internlm_internlm2-7b.json rename to data/models/internlm_internlm2-7b.json diff --git a/data/internlm_internlm2-chat-1_8b.json b/data/models/internlm_internlm2-chat-1_8b.json similarity index 100% rename from data/internlm_internlm2-chat-1_8b.json rename to data/models/internlm_internlm2-chat-1_8b.json diff --git a/data/internlm_internlm2_5-1_8b-chat.json b/data/models/internlm_internlm2_5-1_8b-chat.json similarity index 100% rename from data/internlm_internlm2_5-1_8b-chat.json rename to data/models/internlm_internlm2_5-1_8b-chat.json diff --git a/data/internlm_internlm2_5-20b-chat.json b/data/models/internlm_internlm2_5-20b-chat.json similarity index 100% rename from data/internlm_internlm2_5-20b-chat.json rename to data/models/internlm_internlm2_5-20b-chat.json diff --git a/data/internlm_internlm2_5-7b-chat.json b/data/models/internlm_internlm2_5-7b-chat.json similarity index 100% rename from data/internlm_internlm2_5-7b-chat.json rename to data/models/internlm_internlm2_5-7b-chat.json diff --git a/data/intervitens_mini-magnum-12b-v1.1.json b/data/models/intervitens_mini-magnum-12b-v1.1.json similarity index 100% rename from data/intervitens_mini-magnum-12b-v1.1.json rename to data/models/intervitens_mini-magnum-12b-v1.1.json diff --git a/data/inumulaisk_eval_model.json b/data/models/inumulaisk_eval_model.json similarity index 100% rename from data/inumulaisk_eval_model.json rename to data/models/inumulaisk_eval_model.json diff --git a/data/invalid-coder_Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp.json b/data/models/invalid-coder_Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp.json similarity index 100% rename from data/invalid-coder_Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp.json rename to data/models/invalid-coder_Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp.json diff --git a/data/invisietch_EtherealRainbow-v0.2-8B.json b/data/models/invisietch_EtherealRainbow-v0.2-8B.json similarity index 100% rename from data/invisietch_EtherealRainbow-v0.2-8B.json rename to data/models/invisietch_EtherealRainbow-v0.2-8B.json diff --git a/data/invisietch_EtherealRainbow-v0.3-8B.json b/data/models/invisietch_EtherealRainbow-v0.3-8B.json similarity index 100% rename from data/invisietch_EtherealRainbow-v0.3-8B.json rename to data/models/invisietch_EtherealRainbow-v0.3-8B.json diff --git a/data/invisietch_MiS-Firefly-v0.2-22B.json b/data/models/invisietch_MiS-Firefly-v0.2-22B.json similarity index 100% rename from data/invisietch_MiS-Firefly-v0.2-22B.json rename to data/models/invisietch_MiS-Firefly-v0.2-22B.json diff --git a/data/invisietch_Nimbus-Miqu-v0.1-70B.json b/data/models/invisietch_Nimbus-Miqu-v0.1-70B.json similarity index 100% rename from data/invisietch_Nimbus-Miqu-v0.1-70B.json rename to data/models/invisietch_Nimbus-Miqu-v0.1-70B.json diff --git a/data/irahulpandey_mistralai-7B-slerp-v0.1.json b/data/models/irahulpandey_mistralai-7B-slerp-v0.1.json similarity index 100% rename from data/irahulpandey_mistralai-7B-slerp-v0.1.json rename to data/models/irahulpandey_mistralai-7B-slerp-v0.1.json diff --git a/data/jaredjoss_pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model.json b/data/models/jaredjoss_pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model.json similarity index 100% rename from data/jaredjoss_pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model.json rename to data/models/jaredjoss_pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model.json diff --git a/data/jaspionjader_Auro-Kosmos-EVAA-v2-8B.json b/data/models/jaspionjader_Auro-Kosmos-EVAA-v2-8B.json similarity index 100% rename from data/jaspionjader_Auro-Kosmos-EVAA-v2-8B.json rename to data/models/jaspionjader_Auro-Kosmos-EVAA-v2-8B.json diff --git a/data/jaspionjader_Auro-Kosmos-EVAA-v2.1-8B.json b/data/models/jaspionjader_Auro-Kosmos-EVAA-v2.1-8B.json similarity index 100% rename from data/jaspionjader_Auro-Kosmos-EVAA-v2.1-8B.json rename to data/models/jaspionjader_Auro-Kosmos-EVAA-v2.1-8B.json diff --git a/data/jaspionjader_Auro-Kosmos-EVAA-v2.2-8B.json b/data/models/jaspionjader_Auro-Kosmos-EVAA-v2.2-8B.json similarity index 100% rename from data/jaspionjader_Auro-Kosmos-EVAA-v2.2-8B.json rename to data/models/jaspionjader_Auro-Kosmos-EVAA-v2.2-8B.json diff --git a/data/jaspionjader_Auro-Kosmos-EVAA-v2.3-8B.json b/data/models/jaspionjader_Auro-Kosmos-EVAA-v2.3-8B.json similarity index 100% rename from data/jaspionjader_Auro-Kosmos-EVAA-v2.3-8B.json rename to data/models/jaspionjader_Auro-Kosmos-EVAA-v2.3-8B.json diff --git a/data/jaspionjader_Kosmos-Aurora_faustus-8B.json b/data/models/jaspionjader_Kosmos-Aurora_faustus-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-Aurora_faustus-8B.json rename to data/models/jaspionjader_Kosmos-Aurora_faustus-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-8B.json b/data/models/jaspionjader_Kosmos-EVAA-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-Franken-Immersive-v39-8B.json b/data/models/jaspionjader_Kosmos-EVAA-Franken-Immersive-v39-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-Franken-Immersive-v39-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-Franken-Immersive-v39-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-Franken-v38-8B.json b/data/models/jaspionjader_Kosmos-EVAA-Franken-v38-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-Franken-v38-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-Franken-v38-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-Fusion-8B.json b/data/models/jaspionjader_Kosmos-EVAA-Fusion-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-Fusion-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-Fusion-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-PRP-8B.json b/data/models/jaspionjader_Kosmos-EVAA-PRP-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-PRP-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-PRP-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-PRP-light-8B.json b/data/models/jaspionjader_Kosmos-EVAA-PRP-light-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-PRP-light-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-PRP-light-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-PRP-v23-8B.json b/data/models/jaspionjader_Kosmos-EVAA-PRP-v23-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-PRP-v23-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-PRP-v23-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-PRP-v24-8B.json b/data/models/jaspionjader_Kosmos-EVAA-PRP-v24-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-PRP-v24-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-PRP-v24-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-PRP-v25-8B.json b/data/models/jaspionjader_Kosmos-EVAA-PRP-v25-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-PRP-v25-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-PRP-v25-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-PRP-v26-8B.json b/data/models/jaspionjader_Kosmos-EVAA-PRP-v26-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-PRP-v26-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-PRP-v26-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-PRP-v27-8B.json b/data/models/jaspionjader_Kosmos-EVAA-PRP-v27-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-PRP-v27-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-PRP-v27-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-PRP-v28-8B.json b/data/models/jaspionjader_Kosmos-EVAA-PRP-v28-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-PRP-v28-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-PRP-v28-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-PRP-v29-8B.json b/data/models/jaspionjader_Kosmos-EVAA-PRP-v29-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-PRP-v29-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-PRP-v29-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-PRP-v30-8B.json b/data/models/jaspionjader_Kosmos-EVAA-PRP-v30-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-PRP-v30-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-PRP-v30-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-PRP-v31-8B.json b/data/models/jaspionjader_Kosmos-EVAA-PRP-v31-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-PRP-v31-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-PRP-v31-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-PRP-v32-8B.json b/data/models/jaspionjader_Kosmos-EVAA-PRP-v32-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-PRP-v32-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-PRP-v32-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-PRP-v33-8B.json b/data/models/jaspionjader_Kosmos-EVAA-PRP-v33-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-PRP-v33-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-PRP-v33-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-PRP-v34-8B.json b/data/models/jaspionjader_Kosmos-EVAA-PRP-v34-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-PRP-v34-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-PRP-v34-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-TSN-8B.json b/data/models/jaspionjader_Kosmos-EVAA-TSN-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-TSN-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-TSN-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-TSN-light-8B.json b/data/models/jaspionjader_Kosmos-EVAA-TSN-light-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-TSN-light-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-TSN-light-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-TSN-v19-8B.json b/data/models/jaspionjader_Kosmos-EVAA-TSN-v19-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-TSN-v19-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-TSN-v19-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-TSN-v20-8B.json b/data/models/jaspionjader_Kosmos-EVAA-TSN-v20-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-TSN-v20-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-TSN-v20-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-TSN-v21-8B.json b/data/models/jaspionjader_Kosmos-EVAA-TSN-v21-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-TSN-v21-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-TSN-v21-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-TSN-v22-8B.json b/data/models/jaspionjader_Kosmos-EVAA-TSN-v22-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-TSN-v22-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-TSN-v22-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-gamma-8B.json b/data/models/jaspionjader_Kosmos-EVAA-gamma-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-gamma-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-gamma-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-gamma-alt-8B.json b/data/models/jaspionjader_Kosmos-EVAA-gamma-alt-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-gamma-alt-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-gamma-alt-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-gamma-light-8B.json b/data/models/jaspionjader_Kosmos-EVAA-gamma-light-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-gamma-light-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-gamma-light-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-gamma-light-alt-8B.json b/data/models/jaspionjader_Kosmos-EVAA-gamma-light-alt-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-gamma-light-alt-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-gamma-light-alt-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-gamma-ultra-light-8B.json b/data/models/jaspionjader_Kosmos-EVAA-gamma-ultra-light-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-gamma-ultra-light-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-gamma-ultra-light-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-gamma-v13-8B.json b/data/models/jaspionjader_Kosmos-EVAA-gamma-v13-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-gamma-v13-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-gamma-v13-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-gamma-v14-8B.json b/data/models/jaspionjader_Kosmos-EVAA-gamma-v14-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-gamma-v14-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-gamma-v14-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-gamma-v15-8B.json b/data/models/jaspionjader_Kosmos-EVAA-gamma-v15-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-gamma-v15-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-gamma-v15-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-gamma-v16-8B.json b/data/models/jaspionjader_Kosmos-EVAA-gamma-v16-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-gamma-v16-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-gamma-v16-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-gamma-v17-8B.json b/data/models/jaspionjader_Kosmos-EVAA-gamma-v17-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-gamma-v17-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-gamma-v17-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-gamma-v18-8B.json b/data/models/jaspionjader_Kosmos-EVAA-gamma-v18-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-gamma-v18-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-gamma-v18-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-immersive-sof-v44-8B.json b/data/models/jaspionjader_Kosmos-EVAA-immersive-sof-v44-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-immersive-sof-v44-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-immersive-sof-v44-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-v10-8B.json b/data/models/jaspionjader_Kosmos-EVAA-v10-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-v10-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-v10-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-v11-8B.json b/data/models/jaspionjader_Kosmos-EVAA-v11-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-v11-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-v11-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-v12-8B.json b/data/models/jaspionjader_Kosmos-EVAA-v12-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-v12-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-v12-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-v2-8B.json b/data/models/jaspionjader_Kosmos-EVAA-v2-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-v2-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-v2-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-v3-8B.json b/data/models/jaspionjader_Kosmos-EVAA-v3-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-v3-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-v3-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-v4-8B.json b/data/models/jaspionjader_Kosmos-EVAA-v4-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-v4-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-v4-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-v5-8B.json b/data/models/jaspionjader_Kosmos-EVAA-v5-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-v5-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-v5-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-v6-8B.json b/data/models/jaspionjader_Kosmos-EVAA-v6-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-v6-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-v6-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-v7-8B.json b/data/models/jaspionjader_Kosmos-EVAA-v7-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-v7-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-v7-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-v8-8B.json b/data/models/jaspionjader_Kosmos-EVAA-v8-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-v8-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-v8-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-v9-8B.json b/data/models/jaspionjader_Kosmos-EVAA-v9-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-v9-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-v9-8B.json diff --git a/data/jaspionjader_Kosmos-EVAA-v9-TitanFusion-Mix-8B.json b/data/models/jaspionjader_Kosmos-EVAA-v9-TitanFusion-Mix-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-EVAA-v9-TitanFusion-Mix-8B.json rename to data/models/jaspionjader_Kosmos-EVAA-v9-TitanFusion-Mix-8B.json diff --git a/data/jaspionjader_Kosmos-Elusive-8b.json b/data/models/jaspionjader_Kosmos-Elusive-8b.json similarity index 100% rename from data/jaspionjader_Kosmos-Elusive-8b.json rename to data/models/jaspionjader_Kosmos-Elusive-8b.json diff --git a/data/jaspionjader_Kosmos-Elusive-VENN-8B.json b/data/models/jaspionjader_Kosmos-Elusive-VENN-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-Elusive-VENN-8B.json rename to data/models/jaspionjader_Kosmos-Elusive-VENN-8B.json diff --git a/data/jaspionjader_Kosmos-Elusive-VENN-Asymmetric-8B.json b/data/models/jaspionjader_Kosmos-Elusive-VENN-Asymmetric-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-Elusive-VENN-Asymmetric-8B.json rename to data/models/jaspionjader_Kosmos-Elusive-VENN-Asymmetric-8B.json diff --git a/data/jaspionjader_Kosmos-Elusive-VENN-Aurora_faustus-8B.json b/data/models/jaspionjader_Kosmos-Elusive-VENN-Aurora_faustus-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-Elusive-VENN-Aurora_faustus-8B.json rename to data/models/jaspionjader_Kosmos-Elusive-VENN-Aurora_faustus-8B.json diff --git a/data/jaspionjader_Kosmos-VENN-8B.json b/data/models/jaspionjader_Kosmos-VENN-8B.json similarity index 100% rename from data/jaspionjader_Kosmos-VENN-8B.json rename to data/models/jaspionjader_Kosmos-VENN-8B.json diff --git a/data/jaspionjader_PRP-Kosmos-EVAA-8B.json b/data/models/jaspionjader_PRP-Kosmos-EVAA-8B.json similarity index 100% rename from data/jaspionjader_PRP-Kosmos-EVAA-8B.json rename to data/models/jaspionjader_PRP-Kosmos-EVAA-8B.json diff --git a/data/jaspionjader_PRP-Kosmos-EVAA-light-8B.json b/data/models/jaspionjader_PRP-Kosmos-EVAA-light-8B.json similarity index 100% rename from data/jaspionjader_PRP-Kosmos-EVAA-light-8B.json rename to data/models/jaspionjader_PRP-Kosmos-EVAA-light-8B.json diff --git a/data/jaspionjader_TSN-Kosmos-EVAA-8B.json b/data/models/jaspionjader_TSN-Kosmos-EVAA-8B.json similarity index 100% rename from data/jaspionjader_TSN-Kosmos-EVAA-8B.json rename to data/models/jaspionjader_TSN-Kosmos-EVAA-8B.json diff --git a/data/jaspionjader_TSN-Kosmos-EVAA-v2-8B.json b/data/models/jaspionjader_TSN-Kosmos-EVAA-v2-8B.json similarity index 100% rename from data/jaspionjader_TSN-Kosmos-EVAA-v2-8B.json rename to data/models/jaspionjader_TSN-Kosmos-EVAA-v2-8B.json diff --git a/data/jaspionjader_bbb-1.json b/data/models/jaspionjader_bbb-1.json similarity index 100% rename from data/jaspionjader_bbb-1.json rename to data/models/jaspionjader_bbb-1.json diff --git a/data/jaspionjader_bbb-2.json b/data/models/jaspionjader_bbb-2.json similarity index 100% rename from data/jaspionjader_bbb-2.json rename to data/models/jaspionjader_bbb-2.json diff --git a/data/jaspionjader_bbb-3.json b/data/models/jaspionjader_bbb-3.json similarity index 100% rename from data/jaspionjader_bbb-3.json rename to data/models/jaspionjader_bbb-3.json diff --git a/data/jaspionjader_bbb-4.json b/data/models/jaspionjader_bbb-4.json similarity index 100% rename from data/jaspionjader_bbb-4.json rename to data/models/jaspionjader_bbb-4.json diff --git a/data/jaspionjader_bbb-5.json b/data/models/jaspionjader_bbb-5.json similarity index 100% rename from data/jaspionjader_bbb-5.json rename to data/models/jaspionjader_bbb-5.json diff --git a/data/jaspionjader_bbb-6.json b/data/models/jaspionjader_bbb-6.json similarity index 100% rename from data/jaspionjader_bbb-6.json rename to data/models/jaspionjader_bbb-6.json diff --git a/data/jaspionjader_bbb-7.json b/data/models/jaspionjader_bbb-7.json similarity index 100% rename from data/jaspionjader_bbb-7.json rename to data/models/jaspionjader_bbb-7.json diff --git a/data/jaspionjader_bh-1.json b/data/models/jaspionjader_bh-1.json similarity index 100% rename from data/jaspionjader_bh-1.json rename to data/models/jaspionjader_bh-1.json diff --git a/data/jaspionjader_bh-10.json b/data/models/jaspionjader_bh-10.json similarity index 100% rename from data/jaspionjader_bh-10.json rename to data/models/jaspionjader_bh-10.json diff --git a/data/jaspionjader_bh-11.json b/data/models/jaspionjader_bh-11.json similarity index 100% rename from data/jaspionjader_bh-11.json rename to data/models/jaspionjader_bh-11.json diff --git a/data/jaspionjader_bh-12.json b/data/models/jaspionjader_bh-12.json similarity index 100% rename from data/jaspionjader_bh-12.json rename to data/models/jaspionjader_bh-12.json diff --git a/data/jaspionjader_bh-13.json b/data/models/jaspionjader_bh-13.json similarity index 100% rename from data/jaspionjader_bh-13.json rename to data/models/jaspionjader_bh-13.json diff --git a/data/jaspionjader_bh-15.json b/data/models/jaspionjader_bh-15.json similarity index 100% rename from data/jaspionjader_bh-15.json rename to data/models/jaspionjader_bh-15.json diff --git a/data/jaspionjader_bh-16.json b/data/models/jaspionjader_bh-16.json similarity index 100% rename from data/jaspionjader_bh-16.json rename to data/models/jaspionjader_bh-16.json diff --git a/data/jaspionjader_bh-17.json b/data/models/jaspionjader_bh-17.json similarity index 100% rename from data/jaspionjader_bh-17.json rename to data/models/jaspionjader_bh-17.json diff --git a/data/jaspionjader_bh-18.json b/data/models/jaspionjader_bh-18.json similarity index 100% rename from data/jaspionjader_bh-18.json rename to data/models/jaspionjader_bh-18.json diff --git a/data/jaspionjader_bh-19.json b/data/models/jaspionjader_bh-19.json similarity index 100% rename from data/jaspionjader_bh-19.json rename to data/models/jaspionjader_bh-19.json diff --git a/data/jaspionjader_bh-2.json b/data/models/jaspionjader_bh-2.json similarity index 100% rename from data/jaspionjader_bh-2.json rename to data/models/jaspionjader_bh-2.json diff --git a/data/jaspionjader_bh-20.json b/data/models/jaspionjader_bh-20.json similarity index 100% rename from data/jaspionjader_bh-20.json rename to data/models/jaspionjader_bh-20.json diff --git a/data/jaspionjader_bh-21.json b/data/models/jaspionjader_bh-21.json similarity index 100% rename from data/jaspionjader_bh-21.json rename to data/models/jaspionjader_bh-21.json diff --git a/data/jaspionjader_bh-22.json b/data/models/jaspionjader_bh-22.json similarity index 100% rename from data/jaspionjader_bh-22.json rename to data/models/jaspionjader_bh-22.json diff --git a/data/jaspionjader_bh-23.json b/data/models/jaspionjader_bh-23.json similarity index 100% rename from data/jaspionjader_bh-23.json rename to data/models/jaspionjader_bh-23.json diff --git a/data/jaspionjader_bh-24.json b/data/models/jaspionjader_bh-24.json similarity index 100% rename from data/jaspionjader_bh-24.json rename to data/models/jaspionjader_bh-24.json diff --git a/data/jaspionjader_bh-25.json b/data/models/jaspionjader_bh-25.json similarity index 100% rename from data/jaspionjader_bh-25.json rename to data/models/jaspionjader_bh-25.json diff --git a/data/jaspionjader_bh-26.json b/data/models/jaspionjader_bh-26.json similarity index 100% rename from data/jaspionjader_bh-26.json rename to data/models/jaspionjader_bh-26.json diff --git a/data/jaspionjader_bh-27.json b/data/models/jaspionjader_bh-27.json similarity index 100% rename from data/jaspionjader_bh-27.json rename to data/models/jaspionjader_bh-27.json diff --git a/data/jaspionjader_bh-28.json b/data/models/jaspionjader_bh-28.json similarity index 100% rename from data/jaspionjader_bh-28.json rename to data/models/jaspionjader_bh-28.json diff --git a/data/jaspionjader_bh-29.json b/data/models/jaspionjader_bh-29.json similarity index 100% rename from data/jaspionjader_bh-29.json rename to data/models/jaspionjader_bh-29.json diff --git a/data/jaspionjader_bh-3.json b/data/models/jaspionjader_bh-3.json similarity index 100% rename from data/jaspionjader_bh-3.json rename to data/models/jaspionjader_bh-3.json diff --git a/data/jaspionjader_bh-30.json b/data/models/jaspionjader_bh-30.json similarity index 100% rename from data/jaspionjader_bh-30.json rename to data/models/jaspionjader_bh-30.json diff --git a/data/jaspionjader_bh-31.json b/data/models/jaspionjader_bh-31.json similarity index 100% rename from data/jaspionjader_bh-31.json rename to data/models/jaspionjader_bh-31.json diff --git a/data/jaspionjader_bh-32.json b/data/models/jaspionjader_bh-32.json similarity index 100% rename from data/jaspionjader_bh-32.json rename to data/models/jaspionjader_bh-32.json diff --git a/data/jaspionjader_bh-33.json b/data/models/jaspionjader_bh-33.json similarity index 100% rename from data/jaspionjader_bh-33.json rename to data/models/jaspionjader_bh-33.json diff --git a/data/jaspionjader_bh-34.json b/data/models/jaspionjader_bh-34.json similarity index 100% rename from data/jaspionjader_bh-34.json rename to data/models/jaspionjader_bh-34.json diff --git a/data/jaspionjader_bh-35.json b/data/models/jaspionjader_bh-35.json similarity index 100% rename from data/jaspionjader_bh-35.json rename to data/models/jaspionjader_bh-35.json diff --git a/data/jaspionjader_bh-36.json b/data/models/jaspionjader_bh-36.json similarity index 100% rename from data/jaspionjader_bh-36.json rename to data/models/jaspionjader_bh-36.json diff --git a/data/jaspionjader_bh-37.json b/data/models/jaspionjader_bh-37.json similarity index 100% rename from data/jaspionjader_bh-37.json rename to data/models/jaspionjader_bh-37.json diff --git a/data/jaspionjader_bh-38.json b/data/models/jaspionjader_bh-38.json similarity index 100% rename from data/jaspionjader_bh-38.json rename to data/models/jaspionjader_bh-38.json diff --git a/data/jaspionjader_bh-39.json b/data/models/jaspionjader_bh-39.json similarity index 100% rename from data/jaspionjader_bh-39.json rename to data/models/jaspionjader_bh-39.json diff --git a/data/jaspionjader_bh-4.json b/data/models/jaspionjader_bh-4.json similarity index 100% rename from data/jaspionjader_bh-4.json rename to data/models/jaspionjader_bh-4.json diff --git a/data/jaspionjader_bh-40.json b/data/models/jaspionjader_bh-40.json similarity index 100% rename from data/jaspionjader_bh-40.json rename to data/models/jaspionjader_bh-40.json diff --git a/data/jaspionjader_bh-41.json b/data/models/jaspionjader_bh-41.json similarity index 100% rename from data/jaspionjader_bh-41.json rename to data/models/jaspionjader_bh-41.json diff --git a/data/jaspionjader_bh-42.json b/data/models/jaspionjader_bh-42.json similarity index 100% rename from data/jaspionjader_bh-42.json rename to data/models/jaspionjader_bh-42.json diff --git a/data/jaspionjader_bh-43.json b/data/models/jaspionjader_bh-43.json similarity index 100% rename from data/jaspionjader_bh-43.json rename to data/models/jaspionjader_bh-43.json diff --git a/data/jaspionjader_bh-44.json b/data/models/jaspionjader_bh-44.json similarity index 100% rename from data/jaspionjader_bh-44.json rename to data/models/jaspionjader_bh-44.json diff --git a/data/jaspionjader_bh-46.json b/data/models/jaspionjader_bh-46.json similarity index 100% rename from data/jaspionjader_bh-46.json rename to data/models/jaspionjader_bh-46.json diff --git a/data/jaspionjader_bh-47.json b/data/models/jaspionjader_bh-47.json similarity index 100% rename from data/jaspionjader_bh-47.json rename to data/models/jaspionjader_bh-47.json diff --git a/data/jaspionjader_bh-48.json b/data/models/jaspionjader_bh-48.json similarity index 100% rename from data/jaspionjader_bh-48.json rename to data/models/jaspionjader_bh-48.json diff --git a/data/jaspionjader_bh-49.json b/data/models/jaspionjader_bh-49.json similarity index 100% rename from data/jaspionjader_bh-49.json rename to data/models/jaspionjader_bh-49.json diff --git a/data/jaspionjader_bh-5.json b/data/models/jaspionjader_bh-5.json similarity index 100% rename from data/jaspionjader_bh-5.json rename to data/models/jaspionjader_bh-5.json diff --git a/data/jaspionjader_bh-50.json b/data/models/jaspionjader_bh-50.json similarity index 100% rename from data/jaspionjader_bh-50.json rename to data/models/jaspionjader_bh-50.json diff --git a/data/jaspionjader_bh-51.json b/data/models/jaspionjader_bh-51.json similarity index 100% rename from data/jaspionjader_bh-51.json rename to data/models/jaspionjader_bh-51.json diff --git a/data/jaspionjader_bh-52.json b/data/models/jaspionjader_bh-52.json similarity index 100% rename from data/jaspionjader_bh-52.json rename to data/models/jaspionjader_bh-52.json diff --git a/data/jaspionjader_bh-53.json b/data/models/jaspionjader_bh-53.json similarity index 100% rename from data/jaspionjader_bh-53.json rename to data/models/jaspionjader_bh-53.json diff --git a/data/jaspionjader_bh-54.json b/data/models/jaspionjader_bh-54.json similarity index 100% rename from data/jaspionjader_bh-54.json rename to data/models/jaspionjader_bh-54.json diff --git a/data/jaspionjader_bh-55.json b/data/models/jaspionjader_bh-55.json similarity index 100% rename from data/jaspionjader_bh-55.json rename to data/models/jaspionjader_bh-55.json diff --git a/data/jaspionjader_bh-56.json b/data/models/jaspionjader_bh-56.json similarity index 100% rename from data/jaspionjader_bh-56.json rename to data/models/jaspionjader_bh-56.json diff --git a/data/jaspionjader_bh-57.json b/data/models/jaspionjader_bh-57.json similarity index 100% rename from data/jaspionjader_bh-57.json rename to data/models/jaspionjader_bh-57.json diff --git a/data/jaspionjader_bh-58.json b/data/models/jaspionjader_bh-58.json similarity index 100% rename from data/jaspionjader_bh-58.json rename to data/models/jaspionjader_bh-58.json diff --git a/data/jaspionjader_bh-59.json b/data/models/jaspionjader_bh-59.json similarity index 100% rename from data/jaspionjader_bh-59.json rename to data/models/jaspionjader_bh-59.json diff --git a/data/jaspionjader_bh-6.json b/data/models/jaspionjader_bh-6.json similarity index 100% rename from data/jaspionjader_bh-6.json rename to data/models/jaspionjader_bh-6.json diff --git a/data/jaspionjader_bh-60.json b/data/models/jaspionjader_bh-60.json similarity index 100% rename from data/jaspionjader_bh-60.json rename to data/models/jaspionjader_bh-60.json diff --git a/data/jaspionjader_bh-61.json b/data/models/jaspionjader_bh-61.json similarity index 100% rename from data/jaspionjader_bh-61.json rename to data/models/jaspionjader_bh-61.json diff --git a/data/jaspionjader_bh-62.json b/data/models/jaspionjader_bh-62.json similarity index 100% rename from data/jaspionjader_bh-62.json rename to data/models/jaspionjader_bh-62.json diff --git a/data/jaspionjader_bh-63.json b/data/models/jaspionjader_bh-63.json similarity index 100% rename from data/jaspionjader_bh-63.json rename to data/models/jaspionjader_bh-63.json diff --git a/data/jaspionjader_bh-64.json b/data/models/jaspionjader_bh-64.json similarity index 100% rename from data/jaspionjader_bh-64.json rename to data/models/jaspionjader_bh-64.json diff --git a/data/jaspionjader_bh-7.json b/data/models/jaspionjader_bh-7.json similarity index 100% rename from data/jaspionjader_bh-7.json rename to data/models/jaspionjader_bh-7.json diff --git a/data/jaspionjader_bh-8.json b/data/models/jaspionjader_bh-8.json similarity index 100% rename from data/jaspionjader_bh-8.json rename to data/models/jaspionjader_bh-8.json diff --git a/data/jaspionjader_bh-9.json b/data/models/jaspionjader_bh-9.json similarity index 100% rename from data/jaspionjader_bh-9.json rename to data/models/jaspionjader_bh-9.json diff --git a/data/jaspionjader_dp-6-8b.json b/data/models/jaspionjader_dp-6-8b.json similarity index 100% rename from data/jaspionjader_dp-6-8b.json rename to data/models/jaspionjader_dp-6-8b.json diff --git a/data/jaspionjader_dp-7-8b.json b/data/models/jaspionjader_dp-7-8b.json similarity index 100% rename from data/jaspionjader_dp-7-8b.json rename to data/models/jaspionjader_dp-7-8b.json diff --git a/data/jaspionjader_ek-6.json b/data/models/jaspionjader_ek-6.json similarity index 100% rename from data/jaspionjader_ek-6.json rename to data/models/jaspionjader_ek-6.json diff --git a/data/jaspionjader_ek-7.json b/data/models/jaspionjader_ek-7.json similarity index 100% rename from data/jaspionjader_ek-7.json rename to data/models/jaspionjader_ek-7.json diff --git a/data/jaspionjader_f-1-8b.json b/data/models/jaspionjader_f-1-8b.json similarity index 100% rename from data/jaspionjader_f-1-8b.json rename to data/models/jaspionjader_f-1-8b.json diff --git a/data/jaspionjader_f-2-8b.json b/data/models/jaspionjader_f-2-8b.json similarity index 100% rename from data/jaspionjader_f-2-8b.json rename to data/models/jaspionjader_f-2-8b.json diff --git a/data/jaspionjader_f-3-8b.json b/data/models/jaspionjader_f-3-8b.json similarity index 100% rename from data/jaspionjader_f-3-8b.json rename to data/models/jaspionjader_f-3-8b.json diff --git a/data/jaspionjader_f-4-8b.json b/data/models/jaspionjader_f-4-8b.json similarity index 100% rename from data/jaspionjader_f-4-8b.json rename to data/models/jaspionjader_f-4-8b.json diff --git a/data/jaspionjader_f-5-8b.json b/data/models/jaspionjader_f-5-8b.json similarity index 100% rename from data/jaspionjader_f-5-8b.json rename to data/models/jaspionjader_f-5-8b.json diff --git a/data/jaspionjader_f-6-8b.json b/data/models/jaspionjader_f-6-8b.json similarity index 100% rename from data/jaspionjader_f-6-8b.json rename to data/models/jaspionjader_f-6-8b.json diff --git a/data/jaspionjader_f-7-8b.json b/data/models/jaspionjader_f-7-8b.json similarity index 100% rename from data/jaspionjader_f-7-8b.json rename to data/models/jaspionjader_f-7-8b.json diff --git a/data/jaspionjader_f-8-8b.json b/data/models/jaspionjader_f-8-8b.json similarity index 100% rename from data/jaspionjader_f-8-8b.json rename to data/models/jaspionjader_f-8-8b.json diff --git a/data/jaspionjader_f-9-8b.json b/data/models/jaspionjader_f-9-8b.json similarity index 100% rename from data/jaspionjader_f-9-8b.json rename to data/models/jaspionjader_f-9-8b.json diff --git a/data/jaspionjader_fct-14-8b.json b/data/models/jaspionjader_fct-14-8b.json similarity index 100% rename from data/jaspionjader_fct-14-8b.json rename to data/models/jaspionjader_fct-14-8b.json diff --git a/data/jaspionjader_fct-9-8b.json b/data/models/jaspionjader_fct-9-8b.json similarity index 100% rename from data/jaspionjader_fct-9-8b.json rename to data/models/jaspionjader_fct-9-8b.json diff --git a/data/jaspionjader_fr-1-8b.json b/data/models/jaspionjader_fr-1-8b.json similarity index 100% rename from data/jaspionjader_fr-1-8b.json rename to data/models/jaspionjader_fr-1-8b.json diff --git a/data/jaspionjader_fr-10-8b.json b/data/models/jaspionjader_fr-10-8b.json similarity index 100% rename from data/jaspionjader_fr-10-8b.json rename to data/models/jaspionjader_fr-10-8b.json diff --git a/data/jaspionjader_fr-3-8b.json b/data/models/jaspionjader_fr-3-8b.json similarity index 100% rename from data/jaspionjader_fr-3-8b.json rename to data/models/jaspionjader_fr-3-8b.json diff --git a/data/jaspionjader_gamma-Kosmos-EVAA-8B.json b/data/models/jaspionjader_gamma-Kosmos-EVAA-8B.json similarity index 100% rename from data/jaspionjader_gamma-Kosmos-EVAA-8B.json rename to data/models/jaspionjader_gamma-Kosmos-EVAA-8B.json diff --git a/data/jaspionjader_gamma-Kosmos-EVAA-v2-8B.json b/data/models/jaspionjader_gamma-Kosmos-EVAA-v2-8B.json similarity index 100% rename from data/jaspionjader_gamma-Kosmos-EVAA-v2-8B.json rename to data/models/jaspionjader_gamma-Kosmos-EVAA-v2-8B.json diff --git a/data/jaspionjader_gamma-Kosmos-EVAA-v3-8B.json b/data/models/jaspionjader_gamma-Kosmos-EVAA-v3-8B.json similarity index 100% rename from data/jaspionjader_gamma-Kosmos-EVAA-v3-8B.json rename to data/models/jaspionjader_gamma-Kosmos-EVAA-v3-8B.json diff --git a/data/jaspionjader_knf-2-8b.json b/data/models/jaspionjader_knf-2-8b.json similarity index 100% rename from data/jaspionjader_knf-2-8b.json rename to data/models/jaspionjader_knf-2-8b.json diff --git a/data/jaspionjader_knfp-2-8b.json b/data/models/jaspionjader_knfp-2-8b.json similarity index 100% rename from data/jaspionjader_knfp-2-8b.json rename to data/models/jaspionjader_knfp-2-8b.json diff --git a/data/jaspionjader_knfp-3-8b.json b/data/models/jaspionjader_knfp-3-8b.json similarity index 100% rename from data/jaspionjader_knfp-3-8b.json rename to data/models/jaspionjader_knfp-3-8b.json diff --git a/data/jaspionjader_kstc-1-8b.json b/data/models/jaspionjader_kstc-1-8b.json similarity index 100% rename from data/jaspionjader_kstc-1-8b.json rename to data/models/jaspionjader_kstc-1-8b.json diff --git a/data/jaspionjader_kstc-11-8b.json b/data/models/jaspionjader_kstc-11-8b.json similarity index 100% rename from data/jaspionjader_kstc-11-8b.json rename to data/models/jaspionjader_kstc-11-8b.json diff --git a/data/jaspionjader_kstc-4-8b.json b/data/models/jaspionjader_kstc-4-8b.json similarity index 100% rename from data/jaspionjader_kstc-4-8b.json rename to data/models/jaspionjader_kstc-4-8b.json diff --git a/data/jaspionjader_kstc-5-8b.json b/data/models/jaspionjader_kstc-5-8b.json similarity index 100% rename from data/jaspionjader_kstc-5-8b.json rename to data/models/jaspionjader_kstc-5-8b.json diff --git a/data/jaspionjader_kstc-6-8b.json b/data/models/jaspionjader_kstc-6-8b.json similarity index 100% rename from data/jaspionjader_kstc-6-8b.json rename to data/models/jaspionjader_kstc-6-8b.json diff --git a/data/jaspionjader_kstc-8-8b.json b/data/models/jaspionjader_kstc-8-8b.json similarity index 100% rename from data/jaspionjader_kstc-8-8b.json rename to data/models/jaspionjader_kstc-8-8b.json diff --git a/data/jaspionjader_kstc-9-8b.json b/data/models/jaspionjader_kstc-9-8b.json similarity index 100% rename from data/jaspionjader_kstc-9-8b.json rename to data/models/jaspionjader_kstc-9-8b.json diff --git a/data/jaspionjader_slu-10.json b/data/models/jaspionjader_slu-10.json similarity index 100% rename from data/jaspionjader_slu-10.json rename to data/models/jaspionjader_slu-10.json diff --git a/data/jaspionjader_slu-11.json b/data/models/jaspionjader_slu-11.json similarity index 100% rename from data/jaspionjader_slu-11.json rename to data/models/jaspionjader_slu-11.json diff --git a/data/jaspionjader_slu-13.json b/data/models/jaspionjader_slu-13.json similarity index 100% rename from data/jaspionjader_slu-13.json rename to data/models/jaspionjader_slu-13.json diff --git a/data/jaspionjader_slu-14.json b/data/models/jaspionjader_slu-14.json similarity index 100% rename from data/jaspionjader_slu-14.json rename to data/models/jaspionjader_slu-14.json diff --git a/data/jaspionjader_slu-17.json b/data/models/jaspionjader_slu-17.json similarity index 100% rename from data/jaspionjader_slu-17.json rename to data/models/jaspionjader_slu-17.json diff --git a/data/jaspionjader_slu-2.json b/data/models/jaspionjader_slu-2.json similarity index 100% rename from data/jaspionjader_slu-2.json rename to data/models/jaspionjader_slu-2.json diff --git a/data/jaspionjader_slu-20.json b/data/models/jaspionjader_slu-20.json similarity index 100% rename from data/jaspionjader_slu-20.json rename to data/models/jaspionjader_slu-20.json diff --git a/data/jaspionjader_slu-22.json b/data/models/jaspionjader_slu-22.json similarity index 100% rename from data/jaspionjader_slu-22.json rename to data/models/jaspionjader_slu-22.json diff --git a/data/jaspionjader_slu-23.json b/data/models/jaspionjader_slu-23.json similarity index 100% rename from data/jaspionjader_slu-23.json rename to data/models/jaspionjader_slu-23.json diff --git a/data/jaspionjader_slu-25.json b/data/models/jaspionjader_slu-25.json similarity index 100% rename from data/jaspionjader_slu-25.json rename to data/models/jaspionjader_slu-25.json diff --git a/data/jaspionjader_slu-29.json b/data/models/jaspionjader_slu-29.json similarity index 100% rename from data/jaspionjader_slu-29.json rename to data/models/jaspionjader_slu-29.json diff --git a/data/jaspionjader_slu-32.json b/data/models/jaspionjader_slu-32.json similarity index 100% rename from data/jaspionjader_slu-32.json rename to data/models/jaspionjader_slu-32.json diff --git a/data/jaspionjader_slu-33.json b/data/models/jaspionjader_slu-33.json similarity index 100% rename from data/jaspionjader_slu-33.json rename to data/models/jaspionjader_slu-33.json diff --git a/data/jaspionjader_slu-34.json b/data/models/jaspionjader_slu-34.json similarity index 100% rename from data/jaspionjader_slu-34.json rename to data/models/jaspionjader_slu-34.json diff --git a/data/jaspionjader_slu-35.json b/data/models/jaspionjader_slu-35.json similarity index 100% rename from data/jaspionjader_slu-35.json rename to data/models/jaspionjader_slu-35.json diff --git a/data/jaspionjader_slu-36.json b/data/models/jaspionjader_slu-36.json similarity index 100% rename from data/jaspionjader_slu-36.json rename to data/models/jaspionjader_slu-36.json diff --git a/data/jaspionjader_slu-37.json b/data/models/jaspionjader_slu-37.json similarity index 100% rename from data/jaspionjader_slu-37.json rename to data/models/jaspionjader_slu-37.json diff --git a/data/jaspionjader_slu-6.json b/data/models/jaspionjader_slu-6.json similarity index 100% rename from data/jaspionjader_slu-6.json rename to data/models/jaspionjader_slu-6.json diff --git a/data/jaspionjader_slu-mix-1.json b/data/models/jaspionjader_slu-mix-1.json similarity index 100% rename from data/jaspionjader_slu-mix-1.json rename to data/models/jaspionjader_slu-mix-1.json diff --git a/data/jaspionjader_sof-1.json b/data/models/jaspionjader_sof-1.json similarity index 100% rename from data/jaspionjader_sof-1.json rename to data/models/jaspionjader_sof-1.json diff --git a/data/jaspionjader_sof-10.json b/data/models/jaspionjader_sof-10.json similarity index 100% rename from data/jaspionjader_sof-10.json rename to data/models/jaspionjader_sof-10.json diff --git a/data/jaspionjader_sof-3.json b/data/models/jaspionjader_sof-3.json similarity index 100% rename from data/jaspionjader_sof-3.json rename to data/models/jaspionjader_sof-3.json diff --git a/data/jaspionjader_sof-6.json b/data/models/jaspionjader_sof-6.json similarity index 100% rename from data/jaspionjader_sof-6.json rename to data/models/jaspionjader_sof-6.json diff --git a/data/jaspionjader_test-10.json b/data/models/jaspionjader_test-10.json similarity index 100% rename from data/jaspionjader_test-10.json rename to data/models/jaspionjader_test-10.json diff --git a/data/jaspionjader_test-11.json b/data/models/jaspionjader_test-11.json similarity index 100% rename from data/jaspionjader_test-11.json rename to data/models/jaspionjader_test-11.json diff --git a/data/jaspionjader_test-12.json b/data/models/jaspionjader_test-12.json similarity index 100% rename from data/jaspionjader_test-12.json rename to data/models/jaspionjader_test-12.json diff --git a/data/jaspionjader_test-13.json b/data/models/jaspionjader_test-13.json similarity index 100% rename from data/jaspionjader_test-13.json rename to data/models/jaspionjader_test-13.json diff --git a/data/jaspionjader_test-14.json b/data/models/jaspionjader_test-14.json similarity index 100% rename from data/jaspionjader_test-14.json rename to data/models/jaspionjader_test-14.json diff --git a/data/jaspionjader_test-15.json b/data/models/jaspionjader_test-15.json similarity index 100% rename from data/jaspionjader_test-15.json rename to data/models/jaspionjader_test-15.json diff --git a/data/jaspionjader_test-16.json b/data/models/jaspionjader_test-16.json similarity index 100% rename from data/jaspionjader_test-16.json rename to data/models/jaspionjader_test-16.json diff --git a/data/jaspionjader_test-17.json b/data/models/jaspionjader_test-17.json similarity index 100% rename from data/jaspionjader_test-17.json rename to data/models/jaspionjader_test-17.json diff --git a/data/jaspionjader_test-18.json b/data/models/jaspionjader_test-18.json similarity index 100% rename from data/jaspionjader_test-18.json rename to data/models/jaspionjader_test-18.json diff --git a/data/jaspionjader_test-19.json b/data/models/jaspionjader_test-19.json similarity index 100% rename from data/jaspionjader_test-19.json rename to data/models/jaspionjader_test-19.json diff --git a/data/jaspionjader_test-20.json b/data/models/jaspionjader_test-20.json similarity index 100% rename from data/jaspionjader_test-20.json rename to data/models/jaspionjader_test-20.json diff --git a/data/jayasuryajsk_Qwen2.5-3B-reasoner.json b/data/models/jayasuryajsk_Qwen2.5-3B-reasoner.json similarity index 100% rename from data/jayasuryajsk_Qwen2.5-3B-reasoner.json rename to data/models/jayasuryajsk_Qwen2.5-3B-reasoner.json diff --git a/data/jeanmichela_o-distil-qwen.json b/data/models/jeanmichela_o-distil-qwen.json similarity index 100% rename from data/jeanmichela_o-distil-qwen.json rename to data/models/jeanmichela_o-distil-qwen.json diff --git a/data/jebcarter_psyonic-cetacean-20B.json b/data/models/jebcarter_psyonic-cetacean-20B.json similarity index 100% rename from data/jebcarter_psyonic-cetacean-20B.json rename to data/models/jebcarter_psyonic-cetacean-20B.json diff --git a/data/jebish7_Llama-3-Nanda-10B-Chat.json b/data/models/jebish7_Llama-3-Nanda-10B-Chat.json similarity index 100% rename from data/jebish7_Llama-3-Nanda-10B-Chat.json rename to data/models/jebish7_Llama-3-Nanda-10B-Chat.json diff --git a/data/jebish7_Llama-3.1-8B-Instruct.json b/data/models/jebish7_Llama-3.1-8B-Instruct.json similarity index 100% rename from data/jebish7_Llama-3.1-8B-Instruct.json rename to data/models/jebish7_Llama-3.1-8B-Instruct.json diff --git a/data/jebish7_Nemotron-4-Mini-Hindi-4B-Base.json b/data/models/jebish7_Nemotron-4-Mini-Hindi-4B-Base.json similarity index 100% rename from data/jebish7_Nemotron-4-Mini-Hindi-4B-Base.json rename to data/models/jebish7_Nemotron-4-Mini-Hindi-4B-Base.json diff --git a/data/jebish7_Nemotron-4-Mini-Hindi-4B-Instruct.json b/data/models/jebish7_Nemotron-4-Mini-Hindi-4B-Instruct.json similarity index 100% rename from data/jebish7_Nemotron-4-Mini-Hindi-4B-Instruct.json rename to data/models/jebish7_Nemotron-4-Mini-Hindi-4B-Instruct.json diff --git a/data/jebish7_Nemotron-Mini-4B-Instruct.json b/data/models/jebish7_Nemotron-Mini-4B-Instruct.json similarity index 100% rename from data/jebish7_Nemotron-Mini-4B-Instruct.json rename to data/models/jebish7_Nemotron-Mini-4B-Instruct.json diff --git a/data/jebish7_aya-expanse-8b.json b/data/models/jebish7_aya-expanse-8b.json similarity index 100% rename from data/jebish7_aya-expanse-8b.json rename to data/models/jebish7_aya-expanse-8b.json diff --git a/data/jebish7_gemma-2-2b-it.json b/data/models/jebish7_gemma-2-2b-it.json similarity index 100% rename from data/jebish7_gemma-2-2b-it.json rename to data/models/jebish7_gemma-2-2b-it.json diff --git a/data/jebish7_gemma-2-9b-it.json b/data/models/jebish7_gemma-2-9b-it.json similarity index 100% rename from data/jebish7_gemma-2-9b-it.json rename to data/models/jebish7_gemma-2-9b-it.json diff --git a/data/jebish7_qwen2.5-0.5B-IHA-Hin.json b/data/models/jebish7_qwen2.5-0.5B-IHA-Hin.json similarity index 100% rename from data/jebish7_qwen2.5-0.5B-IHA-Hin.json rename to data/models/jebish7_qwen2.5-0.5B-IHA-Hin.json diff --git a/data/jeffmeloy_Qwen-7B-nerd-uncensored-v1.0.json b/data/models/jeffmeloy_Qwen-7B-nerd-uncensored-v1.0.json similarity index 100% rename from data/jeffmeloy_Qwen-7B-nerd-uncensored-v1.0.json rename to data/models/jeffmeloy_Qwen-7B-nerd-uncensored-v1.0.json diff --git a/data/jeffmeloy_Qwen2.5-7B-minperplexity-2.json b/data/models/jeffmeloy_Qwen2.5-7B-minperplexity-2.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-minperplexity-2.json rename to data/models/jeffmeloy_Qwen2.5-7B-minperplexity-2.json diff --git a/data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v0.9.json b/data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v0.9.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v0.9.json rename to data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v0.9.json diff --git a/data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.0.json b/data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.0.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.0.json rename to data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.0.json diff --git a/data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.1.json b/data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.1.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.1.json rename to data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.1.json diff --git a/data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.2.json b/data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.2.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.2.json rename to data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.2.json diff --git a/data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.3.json b/data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.3.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.3.json rename to data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.3.json diff --git a/data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.4.json b/data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.4.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.4.json rename to data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.4.json diff --git a/data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.5.json b/data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.5.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.5.json rename to data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.5.json diff --git a/data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.7.json b/data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.7.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.7.json rename to data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.7.json diff --git a/data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.8.json b/data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.8.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.8.json rename to data/models/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.8.json diff --git a/data/jeffmeloy_Qwen2.5-7B-olm-v1.0.json b/data/models/jeffmeloy_Qwen2.5-7B-olm-v1.0.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-olm-v1.0.json rename to data/models/jeffmeloy_Qwen2.5-7B-olm-v1.0.json diff --git a/data/jeffmeloy_Qwen2.5-7B-olm-v1.1.json b/data/models/jeffmeloy_Qwen2.5-7B-olm-v1.1.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-olm-v1.1.json rename to data/models/jeffmeloy_Qwen2.5-7B-olm-v1.1.json diff --git a/data/jeffmeloy_Qwen2.5-7B-olm-v1.2.json b/data/models/jeffmeloy_Qwen2.5-7B-olm-v1.2.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-olm-v1.2.json rename to data/models/jeffmeloy_Qwen2.5-7B-olm-v1.2.json diff --git a/data/jeffmeloy_Qwen2.5-7B-olm-v1.3.json b/data/models/jeffmeloy_Qwen2.5-7B-olm-v1.3.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-olm-v1.3.json rename to data/models/jeffmeloy_Qwen2.5-7B-olm-v1.3.json diff --git a/data/jeffmeloy_Qwen2.5-7B-olm-v1.4.json b/data/models/jeffmeloy_Qwen2.5-7B-olm-v1.4.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-olm-v1.4.json rename to data/models/jeffmeloy_Qwen2.5-7B-olm-v1.4.json diff --git a/data/jeffmeloy_Qwen2.5-7B-olm-v1.5.json b/data/models/jeffmeloy_Qwen2.5-7B-olm-v1.5.json similarity index 100% rename from data/jeffmeloy_Qwen2.5-7B-olm-v1.5.json rename to data/models/jeffmeloy_Qwen2.5-7B-olm-v1.5.json diff --git a/data/jeffmeloy_jeffmeloy_Qwen2.5-7B-minperplexity-1.json b/data/models/jeffmeloy_jeffmeloy_Qwen2.5-7B-minperplexity-1.json similarity index 100% rename from data/jeffmeloy_jeffmeloy_Qwen2.5-7B-minperplexity-1.json rename to data/models/jeffmeloy_jeffmeloy_Qwen2.5-7B-minperplexity-1.json diff --git a/data/jeonsworld_CarbonVillain-en-10.7B-v4.json b/data/models/jeonsworld_CarbonVillain-en-10.7B-v4.json similarity index 100% rename from data/jeonsworld_CarbonVillain-en-10.7B-v4.json rename to data/models/jeonsworld_CarbonVillain-en-10.7B-v4.json diff --git a/data/jiangxinyang-shanda_Homer-LLama3-8B.json b/data/models/jiangxinyang-shanda_Homer-LLama3-8B.json similarity index 100% rename from data/jiangxinyang-shanda_Homer-LLama3-8B.json rename to data/models/jiangxinyang-shanda_Homer-LLama3-8B.json diff --git a/data/jieliu_Storm-7B.json b/data/models/jieliu_Storm-7B.json similarity index 100% rename from data/jieliu_Storm-7B.json rename to data/models/jieliu_Storm-7B.json diff --git a/data/jiviai_medX_v2.json b/data/models/jiviai_medX_v2.json similarity index 100% rename from data/jiviai_medX_v2.json rename to data/models/jiviai_medX_v2.json diff --git a/data/jlzhou_Qwen2.5-3B-Infinity-Instruct-0625.json b/data/models/jlzhou_Qwen2.5-3B-Infinity-Instruct-0625.json similarity index 100% rename from data/jlzhou_Qwen2.5-3B-Infinity-Instruct-0625.json rename to data/models/jlzhou_Qwen2.5-3B-Infinity-Instruct-0625.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1.json b/data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1.json rename to data/models/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_dare_linear.json b/data/models/johnsutor_Llama-3-8B-Instruct_dare_linear.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_dare_linear.json rename to data/models/johnsutor_Llama-3-8B-Instruct_dare_linear.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.1.json b/data/models/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.1.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.1.json rename to data/models/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.1.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.3.json b/data/models/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.3.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.3.json rename to data/models/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.3.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.7.json b/data/models/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.7.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.7.json rename to data/models/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.7.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.9.json b/data/models/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.9.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.9.json rename to data/models/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.9.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_linear.json b/data/models/johnsutor_Llama-3-8B-Instruct_linear.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_linear.json rename to data/models/johnsutor_Llama-3-8B-Instruct_linear.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_ties-density-0.1.json b/data/models/johnsutor_Llama-3-8B-Instruct_ties-density-0.1.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_ties-density-0.1.json rename to data/models/johnsutor_Llama-3-8B-Instruct_ties-density-0.1.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_ties-density-0.3.json b/data/models/johnsutor_Llama-3-8B-Instruct_ties-density-0.3.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_ties-density-0.3.json rename to data/models/johnsutor_Llama-3-8B-Instruct_ties-density-0.3.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_ties-density-0.5.json b/data/models/johnsutor_Llama-3-8B-Instruct_ties-density-0.5.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_ties-density-0.5.json rename to data/models/johnsutor_Llama-3-8B-Instruct_ties-density-0.5.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_ties-density-0.7.json b/data/models/johnsutor_Llama-3-8B-Instruct_ties-density-0.7.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_ties-density-0.7.json rename to data/models/johnsutor_Llama-3-8B-Instruct_ties-density-0.7.json diff --git a/data/johnsutor_Llama-3-8B-Instruct_ties-density-0.9.json b/data/models/johnsutor_Llama-3-8B-Instruct_ties-density-0.9.json similarity index 100% rename from data/johnsutor_Llama-3-8B-Instruct_ties-density-0.9.json rename to data/models/johnsutor_Llama-3-8B-Instruct_ties-density-0.9.json diff --git a/data/jondurbin_bagel-dpo-34b-v0.5.json b/data/models/jondurbin_bagel-dpo-34b-v0.5.json similarity index 100% rename from data/jondurbin_bagel-dpo-34b-v0.5.json rename to data/models/jondurbin_bagel-dpo-34b-v0.5.json diff --git a/data/jpacifico_Chocolatine-14B-Instruct-4k-DPO.json b/data/models/jpacifico_Chocolatine-14B-Instruct-4k-DPO.json similarity index 100% rename from data/jpacifico_Chocolatine-14B-Instruct-4k-DPO.json rename to data/models/jpacifico_Chocolatine-14B-Instruct-4k-DPO.json diff --git a/data/jpacifico_Chocolatine-14B-Instruct-DPO-v1.2.json b/data/models/jpacifico_Chocolatine-14B-Instruct-DPO-v1.2.json similarity index 100% rename from data/jpacifico_Chocolatine-14B-Instruct-DPO-v1.2.json rename to data/models/jpacifico_Chocolatine-14B-Instruct-DPO-v1.2.json diff --git a/data/jpacifico_Chocolatine-14B-Instruct-DPO-v1.3.json b/data/models/jpacifico_Chocolatine-14B-Instruct-DPO-v1.3.json similarity index 100% rename from data/jpacifico_Chocolatine-14B-Instruct-DPO-v1.3.json rename to data/models/jpacifico_Chocolatine-14B-Instruct-DPO-v1.3.json diff --git a/data/jpacifico_Chocolatine-2-14B-Instruct-DPO-v2.0b1.json b/data/models/jpacifico_Chocolatine-2-14B-Instruct-DPO-v2.0b1.json similarity index 100% rename from data/jpacifico_Chocolatine-2-14B-Instruct-DPO-v2.0b1.json rename to data/models/jpacifico_Chocolatine-2-14B-Instruct-DPO-v2.0b1.json diff --git a/data/jpacifico_Chocolatine-2-14B-Instruct-v2.0.1.json b/data/models/jpacifico_Chocolatine-2-14B-Instruct-v2.0.1.json similarity index 100% rename from data/jpacifico_Chocolatine-2-14B-Instruct-v2.0.1.json rename to data/models/jpacifico_Chocolatine-2-14B-Instruct-v2.0.1.json diff --git a/data/jpacifico_Chocolatine-2-14B-Instruct-v2.0.3.json b/data/models/jpacifico_Chocolatine-2-14B-Instruct-v2.0.3.json similarity index 100% rename from data/jpacifico_Chocolatine-2-14B-Instruct-v2.0.3.json rename to data/models/jpacifico_Chocolatine-2-14B-Instruct-v2.0.3.json diff --git a/data/jpacifico_Chocolatine-2-14B-Instruct-v2.0.json b/data/models/jpacifico_Chocolatine-2-14B-Instruct-v2.0.json similarity index 100% rename from data/jpacifico_Chocolatine-2-14B-Instruct-v2.0.json rename to data/models/jpacifico_Chocolatine-2-14B-Instruct-v2.0.json diff --git a/data/jpacifico_Chocolatine-2-14B-Instruct-v2.0b2.json b/data/models/jpacifico_Chocolatine-2-14B-Instruct-v2.0b2.json similarity index 100% rename from data/jpacifico_Chocolatine-2-14B-Instruct-v2.0b2.json rename to data/models/jpacifico_Chocolatine-2-14B-Instruct-v2.0b2.json diff --git a/data/jpacifico_Chocolatine-2-14B-Instruct-v2.0b3.json b/data/models/jpacifico_Chocolatine-2-14B-Instruct-v2.0b3.json similarity index 100% rename from data/jpacifico_Chocolatine-2-14B-Instruct-v2.0b3.json rename to data/models/jpacifico_Chocolatine-2-14B-Instruct-v2.0b3.json diff --git a/data/jpacifico_Chocolatine-3B-Instruct-DPO-Revised.json b/data/models/jpacifico_Chocolatine-3B-Instruct-DPO-Revised.json similarity index 100% rename from data/jpacifico_Chocolatine-3B-Instruct-DPO-Revised.json rename to data/models/jpacifico_Chocolatine-3B-Instruct-DPO-Revised.json diff --git a/data/jpacifico_Chocolatine-3B-Instruct-DPO-v1.0.json b/data/models/jpacifico_Chocolatine-3B-Instruct-DPO-v1.0.json similarity index 100% rename from data/jpacifico_Chocolatine-3B-Instruct-DPO-v1.0.json rename to data/models/jpacifico_Chocolatine-3B-Instruct-DPO-v1.0.json diff --git a/data/jpacifico_Chocolatine-3B-Instruct-DPO-v1.2.json b/data/models/jpacifico_Chocolatine-3B-Instruct-DPO-v1.2.json similarity index 100% rename from data/jpacifico_Chocolatine-3B-Instruct-DPO-v1.2.json rename to data/models/jpacifico_Chocolatine-3B-Instruct-DPO-v1.2.json diff --git a/data/jpacifico_Distilucie-7B-Math-Instruct-DPO-v0.1.json b/data/models/jpacifico_Distilucie-7B-Math-Instruct-DPO-v0.1.json similarity index 100% rename from data/jpacifico_Distilucie-7B-Math-Instruct-DPO-v0.1.json rename to data/models/jpacifico_Distilucie-7B-Math-Instruct-DPO-v0.1.json diff --git a/data/jpacifico_Lucie-7B-Instruct-DPO-v1.1.3.json b/data/models/jpacifico_Lucie-7B-Instruct-DPO-v1.1.3.json similarity index 100% rename from data/jpacifico_Lucie-7B-Instruct-DPO-v1.1.3.json rename to data/models/jpacifico_Lucie-7B-Instruct-DPO-v1.1.3.json diff --git a/data/jpacifico_Lucie-7B-Instruct-DPO-v1.1.json b/data/models/jpacifico_Lucie-7B-Instruct-DPO-v1.1.json similarity index 100% rename from data/jpacifico_Lucie-7B-Instruct-DPO-v1.1.json rename to data/models/jpacifico_Lucie-7B-Instruct-DPO-v1.1.json diff --git a/data/jpacifico_Lucie-7B-Instruct-Merged-Model_Stock-v1.0.json b/data/models/jpacifico_Lucie-7B-Instruct-Merged-Model_Stock-v1.0.json similarity index 100% rename from data/jpacifico_Lucie-7B-Instruct-Merged-Model_Stock-v1.0.json rename to data/models/jpacifico_Lucie-7B-Instruct-Merged-Model_Stock-v1.0.json diff --git a/data/jpacifico_Lucie-7B-Instruct-Merged-Model_Stock-v1.1.json b/data/models/jpacifico_Lucie-7B-Instruct-Merged-Model_Stock-v1.1.json similarity index 100% rename from data/jpacifico_Lucie-7B-Instruct-Merged-Model_Stock-v1.1.json rename to data/models/jpacifico_Lucie-7B-Instruct-Merged-Model_Stock-v1.1.json diff --git a/data/jpacifico_Lucie-Boosted-7B-Instruct.json b/data/models/jpacifico_Lucie-Boosted-7B-Instruct.json similarity index 100% rename from data/jpacifico_Lucie-Boosted-7B-Instruct.json rename to data/models/jpacifico_Lucie-Boosted-7B-Instruct.json diff --git a/data/jsfs11_L3-8B-Stheno-slerp.json b/data/models/jsfs11_L3-8B-Stheno-slerp.json similarity index 100% rename from data/jsfs11_L3-8B-Stheno-slerp.json rename to data/models/jsfs11_L3-8B-Stheno-slerp.json diff --git a/data/jsfs11_MixtureofMerges-MoE-4x7b-v4.json b/data/models/jsfs11_MixtureofMerges-MoE-4x7b-v4.json similarity index 100% rename from data/jsfs11_MixtureofMerges-MoE-4x7b-v4.json rename to data/models/jsfs11_MixtureofMerges-MoE-4x7b-v4.json diff --git a/data/jsfs11_MixtureofMerges-MoE-4x7b-v5.json b/data/models/jsfs11_MixtureofMerges-MoE-4x7b-v5.json similarity index 100% rename from data/jsfs11_MixtureofMerges-MoE-4x7b-v5.json rename to data/models/jsfs11_MixtureofMerges-MoE-4x7b-v5.json diff --git a/data/kaist-ai_janus-7b.json b/data/models/kaist-ai_janus-7b.json similarity index 100% rename from data/kaist-ai_janus-7b.json rename to data/models/kaist-ai_janus-7b.json diff --git a/data/kaist-ai_janus-dpo-7b.json b/data/models/kaist-ai_janus-dpo-7b.json similarity index 100% rename from data/kaist-ai_janus-dpo-7b.json rename to data/models/kaist-ai_janus-dpo-7b.json diff --git a/data/kaist-ai_janus-rm-7b.json b/data/models/kaist-ai_janus-rm-7b.json similarity index 100% rename from data/kaist-ai_janus-rm-7b.json rename to data/models/kaist-ai_janus-rm-7b.json diff --git a/data/kaist-ai_mistral-orpo-capybara-7k.json b/data/models/kaist-ai_mistral-orpo-capybara-7k.json similarity index 100% rename from data/kaist-ai_mistral-orpo-capybara-7k.json rename to data/models/kaist-ai_mistral-orpo-capybara-7k.json diff --git a/data/kavonalds_BunderMaxx-0710.json b/data/models/kavonalds_BunderMaxx-0710.json similarity index 100% rename from data/kavonalds_BunderMaxx-0710.json rename to data/models/kavonalds_BunderMaxx-0710.json diff --git a/data/kavonalds_BunderMaxx-1010.json b/data/models/kavonalds_BunderMaxx-1010.json similarity index 100% rename from data/kavonalds_BunderMaxx-1010.json rename to data/models/kavonalds_BunderMaxx-1010.json diff --git a/data/kavonalds_Lancer-1-1b-Instruct.json b/data/models/kavonalds_Lancer-1-1b-Instruct.json similarity index 100% rename from data/kavonalds_Lancer-1-1b-Instruct.json rename to data/models/kavonalds_Lancer-1-1b-Instruct.json diff --git a/data/kayfour_T3Q-Qwen2.5-7B-it-KOR-Safe.json b/data/models/kayfour_T3Q-Qwen2.5-7B-it-KOR-Safe.json similarity index 100% rename from data/kayfour_T3Q-Qwen2.5-7B-it-KOR-Safe.json rename to data/models/kayfour_T3Q-Qwen2.5-7B-it-KOR-Safe.json diff --git a/data/keeeeenw_MicroLlama.json b/data/models/keeeeenw_MicroLlama.json similarity index 100% rename from data/keeeeenw_MicroLlama.json rename to data/models/keeeeenw_MicroLlama.json diff --git a/data/kekmodel_StopCarbon-10.7B-v5.json b/data/models/kekmodel_StopCarbon-10.7B-v5.json similarity index 100% rename from data/kekmodel_StopCarbon-10.7B-v5.json rename to data/models/kekmodel_StopCarbon-10.7B-v5.json diff --git a/data/kevin009_llamaRAGdrama.json b/data/models/kevin009_llamaRAGdrama.json similarity index 100% rename from data/kevin009_llamaRAGdrama.json rename to data/models/kevin009_llamaRAGdrama.json diff --git a/data/khoantap_cheap-moe-merge.json b/data/models/khoantap_cheap-moe-merge.json similarity index 100% rename from data/khoantap_cheap-moe-merge.json rename to data/models/khoantap_cheap-moe-merge.json diff --git a/data/khoantap_llama-3-8b-stock-merge.json b/data/models/khoantap_llama-3-8b-stock-merge.json similarity index 100% rename from data/khoantap_llama-3-8b-stock-merge.json rename to data/models/khoantap_llama-3-8b-stock-merge.json diff --git a/data/khoantap_llama-breadcrumbs-ties-merge.json b/data/models/khoantap_llama-breadcrumbs-ties-merge.json similarity index 100% rename from data/khoantap_llama-breadcrumbs-ties-merge.json rename to data/models/khoantap_llama-breadcrumbs-ties-merge.json diff --git a/data/khoantap_llama-evolve-ties-best-merge.json b/data/models/khoantap_llama-evolve-ties-best-merge.json similarity index 100% rename from data/khoantap_llama-evolve-ties-best-merge.json rename to data/models/khoantap_llama-evolve-ties-best-merge.json diff --git a/data/khoantap_llama-linear-0.5-0.5-1-merge.json b/data/models/khoantap_llama-linear-0.5-0.5-1-merge.json similarity index 100% rename from data/khoantap_llama-linear-0.5-0.5-1-merge.json rename to data/models/khoantap_llama-linear-0.5-0.5-1-merge.json diff --git a/data/khoantap_llama-linear-0.5-1-0.5-merge.json b/data/models/khoantap_llama-linear-0.5-1-0.5-merge.json similarity index 100% rename from data/khoantap_llama-linear-0.5-1-0.5-merge.json rename to data/models/khoantap_llama-linear-0.5-1-0.5-merge.json diff --git a/data/khoantap_llama-linear-1-0.5-0.5-merge.json b/data/models/khoantap_llama-linear-1-0.5-0.5-merge.json similarity index 100% rename from data/khoantap_llama-linear-1-0.5-0.5-merge.json rename to data/models/khoantap_llama-linear-1-0.5-0.5-merge.json diff --git a/data/khoantap_llama-slerp-merge.json b/data/models/khoantap_llama-slerp-merge.json similarity index 100% rename from data/khoantap_llama-slerp-merge.json rename to data/models/khoantap_llama-slerp-merge.json diff --git a/data/khoantap_moe-out-merge.json b/data/models/khoantap_moe-out-merge.json similarity index 100% rename from data/khoantap_moe-out-merge.json rename to data/models/khoantap_moe-out-merge.json diff --git a/data/khulaifi95_Llama-3.1-8B-Reason-Blend-888k.json b/data/models/khulaifi95_Llama-3.1-8B-Reason-Blend-888k.json similarity index 100% rename from data/khulaifi95_Llama-3.1-8B-Reason-Blend-888k.json rename to data/models/khulaifi95_Llama-3.1-8B-Reason-Blend-888k.json diff --git a/data/kms7530_chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1.json b/data/models/kms7530_chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1.json similarity index 100% rename from data/kms7530_chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1.json rename to data/models/kms7530_chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1.json diff --git a/data/kms7530_chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath.json b/data/models/kms7530_chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath.json similarity index 100% rename from data/kms7530_chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath.json rename to data/models/kms7530_chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath.json diff --git a/data/kms7530_chemeng_qwen-math-7b_24_1_100_1.json b/data/models/kms7530_chemeng_qwen-math-7b_24_1_100_1.json similarity index 100% rename from data/kms7530_chemeng_qwen-math-7b_24_1_100_1.json rename to data/models/kms7530_chemeng_qwen-math-7b_24_1_100_1.json diff --git a/data/kms7530_chemeng_qwen-math-7b_24_1_100_1_nonmath.json b/data/models/kms7530_chemeng_qwen-math-7b_24_1_100_1_nonmath.json similarity index 100% rename from data/kms7530_chemeng_qwen-math-7b_24_1_100_1_nonmath.json rename to data/models/kms7530_chemeng_qwen-math-7b_24_1_100_1_nonmath.json diff --git a/data/kno10_ende-chat-0.0.5.json b/data/models/kno10_ende-chat-0.0.5.json similarity index 100% rename from data/kno10_ende-chat-0.0.5.json rename to data/models/kno10_ende-chat-0.0.5.json diff --git a/data/kno10_ende-chat-0.0.7.json b/data/models/kno10_ende-chat-0.0.7.json similarity index 100% rename from data/kno10_ende-chat-0.0.7.json rename to data/models/kno10_ende-chat-0.0.7.json diff --git a/data/kuaishou_kwaipilot-40b-0604.json b/data/models/kuaishou_kwaipilot-40b-0604.json similarity index 100% rename from data/kuaishou_kwaipilot-40b-0604.json rename to data/models/kuaishou_kwaipilot-40b-0604.json diff --git a/data/kyutai_helium-1-preview-2b.json b/data/models/kyutai_helium-1-preview-2b.json similarity index 100% rename from data/kyutai_helium-1-preview-2b.json rename to data/models/kyutai_helium-1-preview-2b.json diff --git a/data/kz919_QwQ-0.5B-Distilled-SFT.json b/data/models/kz919_QwQ-0.5B-Distilled-SFT.json similarity index 100% rename from data/kz919_QwQ-0.5B-Distilled-SFT.json rename to data/models/kz919_QwQ-0.5B-Distilled-SFT.json diff --git a/data/ladydaina_ECE-FDF.json b/data/models/ladydaina_ECE-FDF.json similarity index 100% rename from data/ladydaina_ECE-FDF.json rename to data/models/ladydaina_ECE-FDF.json diff --git a/data/laislemke_LLaMA-2-vicuna-7b-slerp.json b/data/models/laislemke_LLaMA-2-vicuna-7b-slerp.json similarity index 100% rename from data/laislemke_LLaMA-2-vicuna-7b-slerp.json rename to data/models/laislemke_LLaMA-2-vicuna-7b-slerp.json diff --git a/data/lalainy_ECE-PRYMMAL-0.5B-FT-V5-MUSR.json b/data/models/lalainy_ECE-PRYMMAL-0.5B-FT-V5-MUSR.json similarity index 100% rename from data/lalainy_ECE-PRYMMAL-0.5B-FT-V5-MUSR.json rename to data/models/lalainy_ECE-PRYMMAL-0.5B-FT-V5-MUSR.json diff --git a/data/lalainy_ECE-PRYMMAL-0.5B-SLERP-V4.json b/data/models/lalainy_ECE-PRYMMAL-0.5B-SLERP-V4.json similarity index 100% rename from data/lalainy_ECE-PRYMMAL-0.5B-SLERP-V4.json rename to data/models/lalainy_ECE-PRYMMAL-0.5B-SLERP-V4.json diff --git a/data/lalainy_ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1.json b/data/models/lalainy_ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1.json similarity index 100% rename from data/lalainy_ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1.json rename to data/models/lalainy_ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1.json diff --git a/data/lalainy_ECE-PRYMMAL-YL-1B-SLERP-V3.json b/data/models/lalainy_ECE-PRYMMAL-YL-1B-SLERP-V3.json similarity index 100% rename from data/lalainy_ECE-PRYMMAL-YL-1B-SLERP-V3.json rename to data/models/lalainy_ECE-PRYMMAL-YL-1B-SLERP-V3.json diff --git a/data/lalainy_ECE-PRYMMAL-YL-1B-SLERP-V4.json b/data/models/lalainy_ECE-PRYMMAL-YL-1B-SLERP-V4.json similarity index 100% rename from data/lalainy_ECE-PRYMMAL-YL-1B-SLERP-V4.json rename to data/models/lalainy_ECE-PRYMMAL-YL-1B-SLERP-V4.json diff --git a/data/lalainy_ECE-PRYMMAL-YL-6B-SLERP-V1.json b/data/models/lalainy_ECE-PRYMMAL-YL-6B-SLERP-V1.json similarity index 100% rename from data/lalainy_ECE-PRYMMAL-YL-6B-SLERP-V1.json rename to data/models/lalainy_ECE-PRYMMAL-YL-6B-SLERP-V1.json diff --git a/data/lalainy_ECE-PRYMMAL-YL-6B-SLERP-V2.json b/data/models/lalainy_ECE-PRYMMAL-YL-6B-SLERP-V2.json similarity index 100% rename from data/lalainy_ECE-PRYMMAL-YL-6B-SLERP-V2.json rename to data/models/lalainy_ECE-PRYMMAL-YL-6B-SLERP-V2.json diff --git a/data/langgptai_Qwen-las-v0.1.json b/data/models/langgptai_Qwen-las-v0.1.json similarity index 100% rename from data/langgptai_Qwen-las-v0.1.json rename to data/models/langgptai_Qwen-las-v0.1.json diff --git a/data/langgptai_qwen1.5-7b-chat-sa-v0.1.json b/data/models/langgptai_qwen1.5-7b-chat-sa-v0.1.json similarity index 100% rename from data/langgptai_qwen1.5-7b-chat-sa-v0.1.json rename to data/models/langgptai_qwen1.5-7b-chat-sa-v0.1.json diff --git a/data/lars1234_Mistral-Small-24B-Instruct-2501-writer.json b/data/models/lars1234_Mistral-Small-24B-Instruct-2501-writer.json similarity index 100% rename from data/lars1234_Mistral-Small-24B-Instruct-2501-writer.json rename to data/models/lars1234_Mistral-Small-24B-Instruct-2501-writer.json diff --git a/data/leafspark_Llama-3.1-8B-MultiReflection-Instruct.json b/data/models/leafspark_Llama-3.1-8B-MultiReflection-Instruct.json similarity index 100% rename from data/leafspark_Llama-3.1-8B-MultiReflection-Instruct.json rename to data/models/leafspark_Llama-3.1-8B-MultiReflection-Instruct.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-9B.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-Advanced-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-Advanced-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-Advanced-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-Advanced-9B.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-Remix-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-Remix-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-Remix-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-Remix-9B.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-v2-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-v2-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-v2-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-v2-9B.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-v2a-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-v2a-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-v2a-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-v2a-9B.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-v2f-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-v2f-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-v2f-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-v2f-9B.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-v3-Advanced-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-v3-Advanced-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-v3-Advanced-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-v3-Advanced-9B.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-v3b-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-v3b-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-v3b-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-v3b-9B.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-v3i-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-v3i-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-v3i-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-v3i-9B.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-v3j-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-v3j-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-v3j-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-v3j-9B.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-v4-Advanced-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-v4-Advanced-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-v4-Advanced-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-v4-Advanced-9B.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-v4a-Advanced-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-v4a-Advanced-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-v4a-Advanced-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-v4a-Advanced-9B.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-v4b-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-v4b-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-v4b-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-v4b-9B.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-v4c-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-v4c-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-v4c-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-v4c-9B.json diff --git a/data/lemon07r_Gemma-2-Ataraxy-v4d-9B.json b/data/models/lemon07r_Gemma-2-Ataraxy-v4d-9B.json similarity index 100% rename from data/lemon07r_Gemma-2-Ataraxy-v4d-9B.json rename to data/models/lemon07r_Gemma-2-Ataraxy-v4d-9B.json diff --git a/data/lemon07r_Llama-3-RedMagic4-8B.json b/data/models/lemon07r_Llama-3-RedMagic4-8B.json similarity index 100% rename from data/lemon07r_Llama-3-RedMagic4-8B.json rename to data/models/lemon07r_Llama-3-RedMagic4-8B.json diff --git a/data/lemon07r_llama-3-NeuralMahou-8b.json b/data/models/lemon07r_llama-3-NeuralMahou-8b.json similarity index 100% rename from data/lemon07r_llama-3-NeuralMahou-8b.json rename to data/models/lemon07r_llama-3-NeuralMahou-8b.json diff --git a/data/lesubra_ECE-EIFFEL-3B.json b/data/models/lesubra_ECE-EIFFEL-3B.json similarity index 100% rename from data/lesubra_ECE-EIFFEL-3B.json rename to data/models/lesubra_ECE-EIFFEL-3B.json diff --git a/data/lesubra_ECE-EIFFEL-3Bv2.json b/data/models/lesubra_ECE-EIFFEL-3Bv2.json similarity index 100% rename from data/lesubra_ECE-EIFFEL-3Bv2.json rename to data/models/lesubra_ECE-EIFFEL-3Bv2.json diff --git a/data/lesubra_ECE-EIFFEL-3Bv3.json b/data/models/lesubra_ECE-EIFFEL-3Bv3.json similarity index 100% rename from data/lesubra_ECE-EIFFEL-3Bv3.json rename to data/models/lesubra_ECE-EIFFEL-3Bv3.json diff --git a/data/lesubra_ECE-PRYMMAL-3B-SLERP-V1.json b/data/models/lesubra_ECE-PRYMMAL-3B-SLERP-V1.json similarity index 100% rename from data/lesubra_ECE-PRYMMAL-3B-SLERP-V1.json rename to data/models/lesubra_ECE-PRYMMAL-3B-SLERP-V1.json diff --git a/data/lesubra_ECE-PRYMMAL-3B-SLERP-V2.json b/data/models/lesubra_ECE-PRYMMAL-3B-SLERP-V2.json similarity index 100% rename from data/lesubra_ECE-PRYMMAL-3B-SLERP-V2.json rename to data/models/lesubra_ECE-PRYMMAL-3B-SLERP-V2.json diff --git a/data/lesubra_ECE-PRYMMAL-3B-SLERP_2-V1.json b/data/models/lesubra_ECE-PRYMMAL-3B-SLERP_2-V1.json similarity index 100% rename from data/lesubra_ECE-PRYMMAL-3B-SLERP_2-V1.json rename to data/models/lesubra_ECE-PRYMMAL-3B-SLERP_2-V1.json diff --git a/data/lesubra_ECE-PRYMMAL-3B-SLERP_2-V2.json b/data/models/lesubra_ECE-PRYMMAL-3B-SLERP_2-V2.json similarity index 100% rename from data/lesubra_ECE-PRYMMAL-3B-SLERP_2-V2.json rename to data/models/lesubra_ECE-PRYMMAL-3B-SLERP_2-V2.json diff --git a/data/lesubra_merge-test.json b/data/models/lesubra_merge-test.json similarity index 100% rename from data/lesubra_merge-test.json rename to data/models/lesubra_merge-test.json diff --git a/data/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-full.json b/data/models/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-full.json similarity index 100% rename from data/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-full.json rename to data/models/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-full.json diff --git a/data/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-half.json b/data/models/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-half.json similarity index 100% rename from data/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-half.json rename to data/models/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-half.json diff --git a/data/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-top25.json b/data/models/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-top25.json similarity index 100% rename from data/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-top25.json rename to data/models/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-top25.json diff --git a/data/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-top75.json b/data/models/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-top75.json similarity index 100% rename from data/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-top75.json rename to data/models/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-top75.json diff --git a/data/lightblue_suzume-llama-3-8B-multilingual.json b/data/models/lightblue_suzume-llama-3-8B-multilingual.json similarity index 100% rename from data/lightblue_suzume-llama-3-8B-multilingual.json rename to data/models/lightblue_suzume-llama-3-8B-multilingual.json diff --git a/data/lkoenig_BBAI_145.json b/data/models/lkoenig_BBAI_145.json similarity index 100% rename from data/lkoenig_BBAI_145.json rename to data/models/lkoenig_BBAI_145.json diff --git a/data/lkoenig_BBAI_200_Gemma.json b/data/models/lkoenig_BBAI_200_Gemma.json similarity index 100% rename from data/lkoenig_BBAI_200_Gemma.json rename to data/models/lkoenig_BBAI_200_Gemma.json diff --git a/data/lkoenig_BBAI_212_QwenLawLo.json b/data/models/lkoenig_BBAI_212_QwenLawLo.json similarity index 100% rename from data/lkoenig_BBAI_212_QwenLawLo.json rename to data/models/lkoenig_BBAI_212_QwenLawLo.json diff --git a/data/lkoenig_BBAI_212_Qwencore.json b/data/models/lkoenig_BBAI_212_Qwencore.json similarity index 100% rename from data/lkoenig_BBAI_212_Qwencore.json rename to data/models/lkoenig_BBAI_212_Qwencore.json diff --git a/data/lkoenig_BBAI_230_Xiaqwen.json b/data/models/lkoenig_BBAI_230_Xiaqwen.json similarity index 100% rename from data/lkoenig_BBAI_230_Xiaqwen.json rename to data/models/lkoenig_BBAI_230_Xiaqwen.json diff --git a/data/lkoenig_BBAI_375_QwenDyancabs.json b/data/models/lkoenig_BBAI_375_QwenDyancabs.json similarity index 100% rename from data/lkoenig_BBAI_375_QwenDyancabs.json rename to data/models/lkoenig_BBAI_375_QwenDyancabs.json diff --git a/data/lkoenig_BBAI_456_QwenKoen.json b/data/models/lkoenig_BBAI_456_QwenKoen.json similarity index 100% rename from data/lkoenig_BBAI_456_QwenKoen.json rename to data/models/lkoenig_BBAI_456_QwenKoen.json diff --git a/data/lkoenig_BBAI_7B_KoenQwenDyan.json b/data/models/lkoenig_BBAI_7B_KoenQwenDyan.json similarity index 100% rename from data/lkoenig_BBAI_7B_KoenQwenDyan.json rename to data/models/lkoenig_BBAI_7B_KoenQwenDyan.json diff --git a/data/lkoenig_BBAI_7B_Qwen2.5koen.json b/data/models/lkoenig_BBAI_7B_Qwen2.5koen.json similarity index 100% rename from data/lkoenig_BBAI_7B_Qwen2.5koen.json rename to data/models/lkoenig_BBAI_7B_Qwen2.5koen.json diff --git a/data/lkoenig_BBAI_7B_QwenDyanKoenLo.json b/data/models/lkoenig_BBAI_7B_QwenDyanKoenLo.json similarity index 100% rename from data/lkoenig_BBAI_7B_QwenDyanKoenLo.json rename to data/models/lkoenig_BBAI_7B_QwenDyanKoenLo.json diff --git a/data/lkoenig_BBAI_7B_QwenDyancabsLAW.json b/data/models/lkoenig_BBAI_7B_QwenDyancabsLAW.json similarity index 100% rename from data/lkoenig_BBAI_7B_QwenDyancabsLAW.json rename to data/models/lkoenig_BBAI_7B_QwenDyancabsLAW.json diff --git a/data/llm-blender_PairRM-hf.json b/data/models/llm-blender_PairRM-hf.json similarity index 100% rename from data/llm-blender_PairRM-hf.json rename to data/models/llm-blender_PairRM-hf.json diff --git a/data/llmat_Mistral-v0.3-7B-ORPO.json b/data/models/llmat_Mistral-v0.3-7B-ORPO.json similarity index 100% rename from data/llmat_Mistral-v0.3-7B-ORPO.json rename to data/models/llmat_Mistral-v0.3-7B-ORPO.json diff --git a/data/llnYou_ECE-PRYMMAL-YL-1B-SLERP-V5.json b/data/models/llnYou_ECE-PRYMMAL-YL-1B-SLERP-V5.json similarity index 100% rename from data/llnYou_ECE-PRYMMAL-YL-1B-SLERP-V5.json rename to data/models/llnYou_ECE-PRYMMAL-YL-1B-SLERP-V5.json diff --git a/data/llnYou_ECE-PRYMMAL-YL-1B-SLERP-V6.json b/data/models/llnYou_ECE-PRYMMAL-YL-1B-SLERP-V6.json similarity index 100% rename from data/llnYou_ECE-PRYMMAL-YL-1B-SLERP-V6.json rename to data/models/llnYou_ECE-PRYMMAL-YL-1B-SLERP-V6.json diff --git a/data/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V1.json b/data/models/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V1.json similarity index 100% rename from data/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V1.json rename to data/models/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V1.json diff --git a/data/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V2.json b/data/models/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V2.json similarity index 100% rename from data/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V2.json rename to data/models/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V2.json diff --git a/data/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V3.json b/data/models/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V3.json similarity index 100% rename from data/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V3.json rename to data/models/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V3.json diff --git a/data/lmsys_Vicuna-v1.3-13B.json b/data/models/lmsys_Vicuna-v1.3-13B.json similarity index 100% rename from data/lmsys_Vicuna-v1.3-13B.json rename to data/models/lmsys_Vicuna-v1.3-13B.json diff --git a/data/lmsys_Vicuna-v1.3-7B.json b/data/models/lmsys_Vicuna-v1.3-7B.json similarity index 100% rename from data/lmsys_Vicuna-v1.3-7B.json rename to data/models/lmsys_Vicuna-v1.3-7B.json diff --git a/data/lmsys_vicuna-13b-v1.3.json b/data/models/lmsys_vicuna-13b-v1.3.json similarity index 100% rename from data/lmsys_vicuna-13b-v1.3.json rename to data/models/lmsys_vicuna-13b-v1.3.json diff --git a/data/lmsys_vicuna-7b-v1.3.json b/data/models/lmsys_vicuna-7b-v1.3.json similarity index 100% rename from data/lmsys_vicuna-7b-v1.3.json rename to data/models/lmsys_vicuna-7b-v1.3.json diff --git a/data/lmsys_vicuna-7b-v1.5.json b/data/models/lmsys_vicuna-7b-v1.5.json similarity index 100% rename from data/lmsys_vicuna-7b-v1.5.json rename to data/models/lmsys_vicuna-7b-v1.5.json diff --git a/data/lodrick-the-lafted_llama-3.1-8b-instruct-ortho-v7.json b/data/models/lodrick-the-lafted_llama-3.1-8b-instruct-ortho-v7.json similarity index 100% rename from data/lodrick-the-lafted_llama-3.1-8b-instruct-ortho-v7.json rename to data/models/lodrick-the-lafted_llama-3.1-8b-instruct-ortho-v7.json diff --git a/data/lordjia_Llama-3-Cantonese-8B-Instruct.json b/data/models/lordjia_Llama-3-Cantonese-8B-Instruct.json similarity index 100% rename from data/lordjia_Llama-3-Cantonese-8B-Instruct.json rename to data/models/lordjia_Llama-3-Cantonese-8B-Instruct.json diff --git a/data/lordjia_Qwen2-Cantonese-7B-Instruct.json b/data/models/lordjia_Qwen2-Cantonese-7B-Instruct.json similarity index 100% rename from data/lordjia_Qwen2-Cantonese-7B-Instruct.json rename to data/models/lordjia_Qwen2-Cantonese-7B-Instruct.json diff --git a/data/lt-asset_nova-1.3b.json b/data/models/lt-asset_nova-1.3b.json similarity index 100% rename from data/lt-asset_nova-1.3b.json rename to data/models/lt-asset_nova-1.3b.json diff --git a/data/lunahr_thea-3b-50r-u1.json b/data/models/lunahr_thea-3b-50r-u1.json similarity index 100% rename from data/lunahr_thea-3b-50r-u1.json rename to data/models/lunahr_thea-3b-50r-u1.json diff --git a/data/lunahr_thea-v2-3b-50r.json b/data/models/lunahr_thea-v2-3b-50r.json similarity index 100% rename from data/lunahr_thea-v2-3b-50r.json rename to data/models/lunahr_thea-v2-3b-50r.json diff --git a/data/m42-health_Llama3-Med42-70B.json b/data/models/m42-health_Llama3-Med42-70B.json similarity index 100% rename from data/m42-health_Llama3-Med42-70B.json rename to data/models/m42-health_Llama3-Med42-70B.json diff --git a/data/macadeliccc_Samantha-Qwen-2-7B.json b/data/models/macadeliccc_Samantha-Qwen-2-7B.json similarity index 100% rename from data/macadeliccc_Samantha-Qwen-2-7B.json rename to data/models/macadeliccc_Samantha-Qwen-2-7B.json diff --git a/data/macadeliccc_magistrate-3.2-3b-base.json b/data/models/macadeliccc_magistrate-3.2-3b-base.json similarity index 100% rename from data/macadeliccc_magistrate-3.2-3b-base.json rename to data/models/macadeliccc_magistrate-3.2-3b-base.json diff --git a/data/macadeliccc_magistrate-3.2-3b-it.json b/data/models/macadeliccc_magistrate-3.2-3b-it.json similarity index 100% rename from data/macadeliccc_magistrate-3.2-3b-it.json rename to data/models/macadeliccc_magistrate-3.2-3b-it.json diff --git a/data/magnifi_Phi3_intent_v56_3_w_unknown_5_lr_0.002.json b/data/models/magnifi_Phi3_intent_v56_3_w_unknown_5_lr_0.002.json similarity index 100% rename from data/magnifi_Phi3_intent_v56_3_w_unknown_5_lr_0.002.json rename to data/models/magnifi_Phi3_intent_v56_3_w_unknown_5_lr_0.002.json diff --git a/data/maldv_Awqward2.5-32B-Instruct.json b/data/models/maldv_Awqward2.5-32B-Instruct.json similarity index 100% rename from data/maldv_Awqward2.5-32B-Instruct.json rename to data/models/maldv_Awqward2.5-32B-Instruct.json diff --git a/data/maldv_Lytta2.5-32B-Instruct.json b/data/models/maldv_Lytta2.5-32B-Instruct.json similarity index 100% rename from data/maldv_Lytta2.5-32B-Instruct.json rename to data/models/maldv_Lytta2.5-32B-Instruct.json diff --git a/data/maldv_Qwentile2.5-32B-Instruct.json b/data/models/maldv_Qwentile2.5-32B-Instruct.json similarity index 100% rename from data/maldv_Qwentile2.5-32B-Instruct.json rename to data/models/maldv_Qwentile2.5-32B-Instruct.json diff --git a/data/maldv_badger-kappa-llama-3-8b.json b/data/models/maldv_badger-kappa-llama-3-8b.json similarity index 100% rename from data/maldv_badger-kappa-llama-3-8b.json rename to data/models/maldv_badger-kappa-llama-3-8b.json diff --git a/data/maldv_badger-lambda-llama-3-8b.json b/data/models/maldv_badger-lambda-llama-3-8b.json similarity index 100% rename from data/maldv_badger-lambda-llama-3-8b.json rename to data/models/maldv_badger-lambda-llama-3-8b.json diff --git a/data/maldv_badger-mu-llama-3-8b.json b/data/models/maldv_badger-mu-llama-3-8b.json similarity index 100% rename from data/maldv_badger-mu-llama-3-8b.json rename to data/models/maldv_badger-mu-llama-3-8b.json diff --git a/data/maldv_badger-writer-llama-3-8b.json b/data/models/maldv_badger-writer-llama-3-8b.json similarity index 100% rename from data/maldv_badger-writer-llama-3-8b.json rename to data/models/maldv_badger-writer-llama-3-8b.json diff --git a/data/marcuscedricridia_Cheng-1.json b/data/models/marcuscedricridia_Cheng-1.json similarity index 100% rename from data/marcuscedricridia_Cheng-1.json rename to data/models/marcuscedricridia_Cheng-1.json diff --git a/data/marcuscedricridia_Cheng-2-v1.1.json b/data/models/marcuscedricridia_Cheng-2-v1.1.json similarity index 100% rename from data/marcuscedricridia_Cheng-2-v1.1.json rename to data/models/marcuscedricridia_Cheng-2-v1.1.json diff --git a/data/marcuscedricridia_Cheng-2.json b/data/models/marcuscedricridia_Cheng-2.json similarity index 100% rename from data/marcuscedricridia_Cheng-2.json rename to data/models/marcuscedricridia_Cheng-2.json diff --git a/data/marcuscedricridia_Hush-Qwen2.5-7B-MST-v1.1.json b/data/models/marcuscedricridia_Hush-Qwen2.5-7B-MST-v1.1.json similarity index 100% rename from data/marcuscedricridia_Hush-Qwen2.5-7B-MST-v1.1.json rename to data/models/marcuscedricridia_Hush-Qwen2.5-7B-MST-v1.1.json diff --git a/data/marcuscedricridia_Hush-Qwen2.5-7B-MST-v1.3.json b/data/models/marcuscedricridia_Hush-Qwen2.5-7B-MST-v1.3.json similarity index 100% rename from data/marcuscedricridia_Hush-Qwen2.5-7B-MST-v1.3.json rename to data/models/marcuscedricridia_Hush-Qwen2.5-7B-MST-v1.3.json diff --git a/data/marcuscedricridia_Hush-Qwen2.5-7B-MST.json b/data/models/marcuscedricridia_Hush-Qwen2.5-7B-MST.json similarity index 100% rename from data/marcuscedricridia_Hush-Qwen2.5-7B-MST.json rename to data/models/marcuscedricridia_Hush-Qwen2.5-7B-MST.json diff --git a/data/marcuscedricridia_Hush-Qwen2.5-7B-Preview.json b/data/models/marcuscedricridia_Hush-Qwen2.5-7B-Preview.json similarity index 100% rename from data/marcuscedricridia_Hush-Qwen2.5-7B-Preview.json rename to data/models/marcuscedricridia_Hush-Qwen2.5-7B-Preview.json diff --git a/data/marcuscedricridia_Hush-Qwen2.5-7B-RP-v1.4-1M.json b/data/models/marcuscedricridia_Hush-Qwen2.5-7B-RP-v1.4-1M.json similarity index 100% rename from data/marcuscedricridia_Hush-Qwen2.5-7B-RP-v1.4-1M.json rename to data/models/marcuscedricridia_Hush-Qwen2.5-7B-RP-v1.4-1M.json diff --git a/data/marcuscedricridia_Hush-Qwen2.5-7B-v1.1.json b/data/models/marcuscedricridia_Hush-Qwen2.5-7B-v1.1.json similarity index 100% rename from data/marcuscedricridia_Hush-Qwen2.5-7B-v1.1.json rename to data/models/marcuscedricridia_Hush-Qwen2.5-7B-v1.1.json diff --git a/data/marcuscedricridia_Hush-Qwen2.5-7B-v1.2.json b/data/models/marcuscedricridia_Hush-Qwen2.5-7B-v1.2.json similarity index 100% rename from data/marcuscedricridia_Hush-Qwen2.5-7B-v1.2.json rename to data/models/marcuscedricridia_Hush-Qwen2.5-7B-v1.2.json diff --git a/data/marcuscedricridia_Hush-Qwen2.5-7B-v1.3.json b/data/models/marcuscedricridia_Hush-Qwen2.5-7B-v1.3.json similarity index 100% rename from data/marcuscedricridia_Hush-Qwen2.5-7B-v1.3.json rename to data/models/marcuscedricridia_Hush-Qwen2.5-7B-v1.3.json diff --git a/data/marcuscedricridia_Hush-Qwen2.5-7B-v1.4.json b/data/models/marcuscedricridia_Hush-Qwen2.5-7B-v1.4.json similarity index 100% rename from data/marcuscedricridia_Hush-Qwen2.5-7B-v1.4.json rename to data/models/marcuscedricridia_Hush-Qwen2.5-7B-v1.4.json diff --git a/data/marcuscedricridia_Qwen2.5-7B-Preview.json b/data/models/marcuscedricridia_Qwen2.5-7B-Preview.json similarity index 100% rename from data/marcuscedricridia_Qwen2.5-7B-Preview.json rename to data/models/marcuscedricridia_Qwen2.5-7B-Preview.json diff --git a/data/marcuscedricridia_Yell-Qwen2.5-7B-Preview-v1.1.json b/data/models/marcuscedricridia_Yell-Qwen2.5-7B-Preview-v1.1.json similarity index 100% rename from data/marcuscedricridia_Yell-Qwen2.5-7B-Preview-v1.1.json rename to data/models/marcuscedricridia_Yell-Qwen2.5-7B-Preview-v1.1.json diff --git a/data/marcuscedricridia_Yell-Qwen2.5-7B-Preview.json b/data/models/marcuscedricridia_Yell-Qwen2.5-7B-Preview.json similarity index 100% rename from data/marcuscedricridia_Yell-Qwen2.5-7B-Preview.json rename to data/models/marcuscedricridia_Yell-Qwen2.5-7B-Preview.json diff --git a/data/marcuscedricridia_absolute-o1-7b.json b/data/models/marcuscedricridia_absolute-o1-7b.json similarity index 100% rename from data/marcuscedricridia_absolute-o1-7b.json rename to data/models/marcuscedricridia_absolute-o1-7b.json diff --git a/data/marcuscedricridia_cursa-o1-7b-2-28-2025.json b/data/models/marcuscedricridia_cursa-o1-7b-2-28-2025.json similarity index 100% rename from data/marcuscedricridia_cursa-o1-7b-2-28-2025.json rename to data/models/marcuscedricridia_cursa-o1-7b-2-28-2025.json diff --git a/data/marcuscedricridia_cursa-o1-7b-v1.1.json b/data/models/marcuscedricridia_cursa-o1-7b-v1.1.json similarity index 100% rename from data/marcuscedricridia_cursa-o1-7b-v1.1.json rename to data/models/marcuscedricridia_cursa-o1-7b-v1.1.json diff --git a/data/marcuscedricridia_cursa-o1-7b-v1.2-normalize-false.json b/data/models/marcuscedricridia_cursa-o1-7b-v1.2-normalize-false.json similarity index 100% rename from data/marcuscedricridia_cursa-o1-7b-v1.2-normalize-false.json rename to data/models/marcuscedricridia_cursa-o1-7b-v1.2-normalize-false.json diff --git a/data/marcuscedricridia_cursa-o1-7b.json b/data/models/marcuscedricridia_cursa-o1-7b.json similarity index 100% rename from data/marcuscedricridia_cursa-o1-7b.json rename to data/models/marcuscedricridia_cursa-o1-7b.json diff --git a/data/marcuscedricridia_cursor-o1-7b.json b/data/models/marcuscedricridia_cursor-o1-7b.json similarity index 100% rename from data/marcuscedricridia_cursor-o1-7b.json rename to data/models/marcuscedricridia_cursor-o1-7b.json diff --git a/data/marcuscedricridia_cursorr-o1.2-7b.json b/data/models/marcuscedricridia_cursorr-o1.2-7b.json similarity index 100% rename from data/marcuscedricridia_cursorr-o1.2-7b.json rename to data/models/marcuscedricridia_cursorr-o1.2-7b.json diff --git a/data/marcuscedricridia_etr1o-explicit-v1.1.json b/data/models/marcuscedricridia_etr1o-explicit-v1.1.json similarity index 100% rename from data/marcuscedricridia_etr1o-explicit-v1.1.json rename to data/models/marcuscedricridia_etr1o-explicit-v1.1.json diff --git a/data/marcuscedricridia_etr1o-explicit-v1.2.json b/data/models/marcuscedricridia_etr1o-explicit-v1.2.json similarity index 100% rename from data/marcuscedricridia_etr1o-explicit-v1.2.json rename to data/models/marcuscedricridia_etr1o-explicit-v1.2.json diff --git a/data/marcuscedricridia_etr1o-v1.1.json b/data/models/marcuscedricridia_etr1o-v1.1.json similarity index 100% rename from data/marcuscedricridia_etr1o-v1.1.json rename to data/models/marcuscedricridia_etr1o-v1.1.json diff --git a/data/marcuscedricridia_etr1o-v1.2.json b/data/models/marcuscedricridia_etr1o-v1.2.json similarity index 100% rename from data/marcuscedricridia_etr1o-v1.2.json rename to data/models/marcuscedricridia_etr1o-v1.2.json diff --git a/data/marcuscedricridia_fan-o1-7b.json b/data/models/marcuscedricridia_fan-o1-7b.json similarity index 100% rename from data/marcuscedricridia_fan-o1-7b.json rename to data/models/marcuscedricridia_fan-o1-7b.json diff --git a/data/marcuscedricridia_olmner-7b.json b/data/models/marcuscedricridia_olmner-7b.json similarity index 100% rename from data/marcuscedricridia_olmner-7b.json rename to data/models/marcuscedricridia_olmner-7b.json diff --git a/data/marcuscedricridia_olmner-della-7b.json b/data/models/marcuscedricridia_olmner-della-7b.json similarity index 100% rename from data/marcuscedricridia_olmner-della-7b.json rename to data/models/marcuscedricridia_olmner-della-7b.json diff --git a/data/marcuscedricridia_olmner-o1-7b.json b/data/models/marcuscedricridia_olmner-o1-7b.json similarity index 100% rename from data/marcuscedricridia_olmner-o1-7b.json rename to data/models/marcuscedricridia_olmner-o1-7b.json diff --git a/data/marcuscedricridia_olmner-sbr-7b.json b/data/models/marcuscedricridia_olmner-sbr-7b.json similarity index 100% rename from data/marcuscedricridia_olmner-sbr-7b.json rename to data/models/marcuscedricridia_olmner-sbr-7b.json diff --git a/data/marcuscedricridia_post-cursa-o1.json b/data/models/marcuscedricridia_post-cursa-o1.json similarity index 100% rename from data/marcuscedricridia_post-cursa-o1.json rename to data/models/marcuscedricridia_post-cursa-o1.json diff --git a/data/marcuscedricridia_pre-cursa-o1-v1.2.json b/data/models/marcuscedricridia_pre-cursa-o1-v1.2.json similarity index 100% rename from data/marcuscedricridia_pre-cursa-o1-v1.2.json rename to data/models/marcuscedricridia_pre-cursa-o1-v1.2.json diff --git a/data/marcuscedricridia_pre-cursa-o1-v1.3.json b/data/models/marcuscedricridia_pre-cursa-o1-v1.3.json similarity index 100% rename from data/marcuscedricridia_pre-cursa-o1-v1.3.json rename to data/models/marcuscedricridia_pre-cursa-o1-v1.3.json diff --git a/data/marcuscedricridia_pre-cursa-o1-v1.4.json b/data/models/marcuscedricridia_pre-cursa-o1-v1.4.json similarity index 100% rename from data/marcuscedricridia_pre-cursa-o1-v1.4.json rename to data/models/marcuscedricridia_pre-cursa-o1-v1.4.json diff --git a/data/marcuscedricridia_pre-cursa-o1-v1.6.json b/data/models/marcuscedricridia_pre-cursa-o1-v1.6.json similarity index 100% rename from data/marcuscedricridia_pre-cursa-o1-v1.6.json rename to data/models/marcuscedricridia_pre-cursa-o1-v1.6.json diff --git a/data/marcuscedricridia_pre-cursa-o1.json b/data/models/marcuscedricridia_pre-cursa-o1.json similarity index 100% rename from data/marcuscedricridia_pre-cursa-o1.json rename to data/models/marcuscedricridia_pre-cursa-o1.json diff --git a/data/marcuscedricridia_r1o-et.json b/data/models/marcuscedricridia_r1o-et.json similarity index 100% rename from data/marcuscedricridia_r1o-et.json rename to data/models/marcuscedricridia_r1o-et.json diff --git a/data/marcuscedricridia_sbr-o1-7b.json b/data/models/marcuscedricridia_sbr-o1-7b.json similarity index 100% rename from data/marcuscedricridia_sbr-o1-7b.json rename to data/models/marcuscedricridia_sbr-o1-7b.json diff --git a/data/marcuscedricridia_stray-r1o-et.json b/data/models/marcuscedricridia_stray-r1o-et.json similarity index 100% rename from data/marcuscedricridia_stray-r1o-et.json rename to data/models/marcuscedricridia_stray-r1o-et.json diff --git a/data/marin-community_marin-8b-instruct.json b/data/models/marin-community_marin-8b-instruct.json similarity index 100% rename from data/marin-community_marin-8b-instruct.json rename to data/models/marin-community_marin-8b-instruct.json diff --git a/data/matouLeLoup_ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3.json b/data/models/matouLeLoup_ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3.json similarity index 100% rename from data/matouLeLoup_ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3.json rename to data/models/matouLeLoup_ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3.json diff --git a/data/matouLeLoup_ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis.json b/data/models/matouLeLoup_ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis.json similarity index 100% rename from data/matouLeLoup_ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis.json rename to data/models/matouLeLoup_ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis.json diff --git a/data/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis.json b/data/models/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis.json similarity index 100% rename from data/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis.json rename to data/models/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis.json diff --git a/data/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis.json b/data/models/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis.json similarity index 100% rename from data/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis.json rename to data/models/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis.json diff --git a/data/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis.json b/data/models/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis.json similarity index 100% rename from data/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis.json rename to data/models/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis.json diff --git a/data/mattshumer_Reflection-70B.json b/data/models/mattshumer_Reflection-70B.json similarity index 100% rename from data/mattshumer_Reflection-70B.json rename to data/models/mattshumer_Reflection-70B.json diff --git a/data/mattshumer_Reflection-Llama-3.1-70B.json b/data/models/mattshumer_Reflection-Llama-3.1-70B.json similarity index 100% rename from data/mattshumer_Reflection-Llama-3.1-70B.json rename to data/models/mattshumer_Reflection-Llama-3.1-70B.json diff --git a/data/mattshumer_ref_70_e3.json b/data/models/mattshumer_ref_70_e3.json similarity index 100% rename from data/mattshumer_ref_70_e3.json rename to data/models/mattshumer_ref_70_e3.json diff --git a/data/maywell_Qwen2-7B-Multilingual-RP.json b/data/models/maywell_Qwen2-7B-Multilingual-RP.json similarity index 100% rename from data/maywell_Qwen2-7B-Multilingual-RP.json rename to data/models/maywell_Qwen2-7B-Multilingual-RP.json diff --git a/data/meditsolutions_Llama-3.1-MedIT-SUN-8B.json b/data/models/meditsolutions_Llama-3.1-MedIT-SUN-8B.json similarity index 100% rename from data/meditsolutions_Llama-3.1-MedIT-SUN-8B.json rename to data/models/meditsolutions_Llama-3.1-MedIT-SUN-8B.json diff --git a/data/meditsolutions_Llama-3.2-SUN-1B-Instruct.json b/data/models/meditsolutions_Llama-3.2-SUN-1B-Instruct.json similarity index 100% rename from data/meditsolutions_Llama-3.2-SUN-1B-Instruct.json rename to data/models/meditsolutions_Llama-3.2-SUN-1B-Instruct.json diff --git a/data/meditsolutions_Llama-3.2-SUN-1B-chat.json b/data/models/meditsolutions_Llama-3.2-SUN-1B-chat.json similarity index 100% rename from data/meditsolutions_Llama-3.2-SUN-1B-chat.json rename to data/models/meditsolutions_Llama-3.2-SUN-1B-chat.json diff --git a/data/meditsolutions_Llama-3.2-SUN-2.4B-checkpoint-26000.json b/data/models/meditsolutions_Llama-3.2-SUN-2.4B-checkpoint-26000.json similarity index 100% rename from data/meditsolutions_Llama-3.2-SUN-2.4B-checkpoint-26000.json rename to data/models/meditsolutions_Llama-3.2-SUN-2.4B-checkpoint-26000.json diff --git a/data/meditsolutions_Llama-3.2-SUN-2.4B-checkpoint-34800.json b/data/models/meditsolutions_Llama-3.2-SUN-2.4B-checkpoint-34800.json similarity index 100% rename from data/meditsolutions_Llama-3.2-SUN-2.4B-checkpoint-34800.json rename to data/models/meditsolutions_Llama-3.2-SUN-2.4B-checkpoint-34800.json diff --git a/data/meditsolutions_Llama-3.2-SUN-2.4B-v1.0.0.json b/data/models/meditsolutions_Llama-3.2-SUN-2.4B-v1.0.0.json similarity index 100% rename from data/meditsolutions_Llama-3.2-SUN-2.4B-v1.0.0.json rename to data/models/meditsolutions_Llama-3.2-SUN-2.4B-v1.0.0.json diff --git a/data/meditsolutions_Llama-3.2-SUN-2.5B-chat.json b/data/models/meditsolutions_Llama-3.2-SUN-2.5B-chat.json similarity index 100% rename from data/meditsolutions_Llama-3.2-SUN-2.5B-chat.json rename to data/models/meditsolutions_Llama-3.2-SUN-2.5B-chat.json diff --git a/data/meditsolutions_Llama-3.2-SUN-HDIC-1B-Instruct.json b/data/models/meditsolutions_Llama-3.2-SUN-HDIC-1B-Instruct.json similarity index 100% rename from data/meditsolutions_Llama-3.2-SUN-HDIC-1B-Instruct.json rename to data/models/meditsolutions_Llama-3.2-SUN-HDIC-1B-Instruct.json diff --git a/data/meditsolutions_MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune.json b/data/models/meditsolutions_MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune.json similarity index 100% rename from data/meditsolutions_MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune.json rename to data/models/meditsolutions_MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune.json diff --git a/data/meditsolutions_MSH-v1-Bielik-v2.3-Instruct-MedIT-merge.json b/data/models/meditsolutions_MSH-v1-Bielik-v2.3-Instruct-MedIT-merge.json similarity index 100% rename from data/meditsolutions_MSH-v1-Bielik-v2.3-Instruct-MedIT-merge.json rename to data/models/meditsolutions_MSH-v1-Bielik-v2.3-Instruct-MedIT-merge.json diff --git a/data/meditsolutions_MedIT-Mesh-3B-Instruct.json b/data/models/meditsolutions_MedIT-Mesh-3B-Instruct.json similarity index 100% rename from data/meditsolutions_MedIT-Mesh-3B-Instruct.json rename to data/models/meditsolutions_MedIT-Mesh-3B-Instruct.json diff --git a/data/meditsolutions_SmolLM2-MedIT-Upscale-2B.json b/data/models/meditsolutions_SmolLM2-MedIT-Upscale-2B.json similarity index 100% rename from data/meditsolutions_SmolLM2-MedIT-Upscale-2B.json rename to data/models/meditsolutions_SmolLM2-MedIT-Upscale-2B.json diff --git a/data/meetkai_functionary-small-v3.1.json b/data/models/meetkai_functionary-small-v3.1.json similarity index 100% rename from data/meetkai_functionary-small-v3.1.json rename to data/models/meetkai_functionary-small-v3.1.json diff --git a/data/meraGPT_mera-mix-4x7B.json b/data/models/meraGPT_mera-mix-4x7B.json similarity index 100% rename from data/meraGPT_mera-mix-4x7B.json rename to data/models/meraGPT_mera-mix-4x7B.json diff --git a/data/mergekit-community_JAJUKA-WEWILLNEVERFORGETYOU-3B.json b/data/models/mergekit-community_JAJUKA-WEWILLNEVERFORGETYOU-3B.json similarity index 100% rename from data/mergekit-community_JAJUKA-WEWILLNEVERFORGETYOU-3B.json rename to data/models/mergekit-community_JAJUKA-WEWILLNEVERFORGETYOU-3B.json diff --git a/data/mergekit-community_SuperQwen-2.5-1.5B.json b/data/models/mergekit-community_SuperQwen-2.5-1.5B.json similarity index 100% rename from data/mergekit-community_SuperQwen-2.5-1.5B.json rename to data/models/mergekit-community_SuperQwen-2.5-1.5B.json diff --git a/data/mergekit-community_VirtuosoSmall-InstructModelStock.json b/data/models/mergekit-community_VirtuosoSmall-InstructModelStock.json similarity index 100% rename from data/mergekit-community_VirtuosoSmall-InstructModelStock.json rename to data/models/mergekit-community_VirtuosoSmall-InstructModelStock.json diff --git a/data/mergekit-community_diabolic6045_ELN-AOC-CAIN.json b/data/models/mergekit-community_diabolic6045_ELN-AOC-CAIN.json similarity index 100% rename from data/mergekit-community_diabolic6045_ELN-AOC-CAIN.json rename to data/models/mergekit-community_diabolic6045_ELN-AOC-CAIN.json diff --git a/data/mergekit-community_mergekit-dare_ties-ajgjgea.json b/data/models/mergekit-community_mergekit-dare_ties-ajgjgea.json similarity index 100% rename from data/mergekit-community_mergekit-dare_ties-ajgjgea.json rename to data/models/mergekit-community_mergekit-dare_ties-ajgjgea.json diff --git a/data/mergekit-community_mergekit-della-zgowfmf.json b/data/models/mergekit-community_mergekit-della-zgowfmf.json similarity index 100% rename from data/mergekit-community_mergekit-della-zgowfmf.json rename to data/models/mergekit-community_mergekit-della-zgowfmf.json diff --git a/data/mergekit-community_mergekit-model_stock-azgztvm.json b/data/models/mergekit-community_mergekit-model_stock-azgztvm.json similarity index 100% rename from data/mergekit-community_mergekit-model_stock-azgztvm.json rename to data/models/mergekit-community_mergekit-model_stock-azgztvm.json diff --git a/data/mergekit-community_mergekit-slerp-fmrazcr.json b/data/models/mergekit-community_mergekit-slerp-fmrazcr.json similarity index 100% rename from data/mergekit-community_mergekit-slerp-fmrazcr.json rename to data/models/mergekit-community_mergekit-slerp-fmrazcr.json diff --git a/data/mergekit-community_mergekit-ties-rraxdhv.json b/data/models/mergekit-community_mergekit-ties-rraxdhv.json similarity index 100% rename from data/mergekit-community_mergekit-ties-rraxdhv.json rename to data/models/mergekit-community_mergekit-ties-rraxdhv.json diff --git a/data/mergekit-community_mergekit-ties-ykqemwr.json b/data/models/mergekit-community_mergekit-ties-ykqemwr.json similarity index 100% rename from data/mergekit-community_mergekit-ties-ykqemwr.json rename to data/models/mergekit-community_mergekit-ties-ykqemwr.json diff --git a/data/mergekit-community_sexeh_time_testing.json b/data/models/mergekit-community_sexeh_time_testing.json similarity index 100% rename from data/mergekit-community_sexeh_time_testing.json rename to data/models/mergekit-community_sexeh_time_testing.json diff --git a/data/meta-llama_Llama-2-13b-chat-hf.json b/data/models/meta-llama_Llama-2-13b-chat-hf.json similarity index 100% rename from data/meta-llama_Llama-2-13b-chat-hf.json rename to data/models/meta-llama_Llama-2-13b-chat-hf.json diff --git a/data/meta-llama_Llama-2-13b-hf.json b/data/models/meta-llama_Llama-2-13b-hf.json similarity index 100% rename from data/meta-llama_Llama-2-13b-hf.json rename to data/models/meta-llama_Llama-2-13b-hf.json diff --git a/data/meta-llama_Llama-2-70b-chat-hf.json b/data/models/meta-llama_Llama-2-70b-chat-hf.json similarity index 100% rename from data/meta-llama_Llama-2-70b-chat-hf.json rename to data/models/meta-llama_Llama-2-70b-chat-hf.json diff --git a/data/meta-llama_Llama-2-70b-hf.json b/data/models/meta-llama_Llama-2-70b-hf.json similarity index 100% rename from data/meta-llama_Llama-2-70b-hf.json rename to data/models/meta-llama_Llama-2-70b-hf.json diff --git a/data/meta-llama_Llama-2-7b-chat-hf.json b/data/models/meta-llama_Llama-2-7b-chat-hf.json similarity index 100% rename from data/meta-llama_Llama-2-7b-chat-hf.json rename to data/models/meta-llama_Llama-2-7b-chat-hf.json diff --git a/data/meta-llama_Llama-2-7b-hf.json b/data/models/meta-llama_Llama-2-7b-hf.json similarity index 100% rename from data/meta-llama_Llama-2-7b-hf.json rename to data/models/meta-llama_Llama-2-7b-hf.json diff --git a/data/meta-llama_Llama-3.1-70B-Instruct.json b/data/models/meta-llama_Llama-3.1-70B-Instruct.json similarity index 100% rename from data/meta-llama_Llama-3.1-70B-Instruct.json rename to data/models/meta-llama_Llama-3.1-70B-Instruct.json diff --git a/data/meta-llama_Llama-3.1-70B.json b/data/models/meta-llama_Llama-3.1-70B.json similarity index 100% rename from data/meta-llama_Llama-3.1-70B.json rename to data/models/meta-llama_Llama-3.1-70B.json diff --git a/data/meta-llama_Llama-3.1-8B-Instruct.json b/data/models/meta-llama_Llama-3.1-8B-Instruct.json similarity index 100% rename from data/meta-llama_Llama-3.1-8B-Instruct.json rename to data/models/meta-llama_Llama-3.1-8B-Instruct.json diff --git a/data/meta-llama_Llama-3.1-8B.json b/data/models/meta-llama_Llama-3.1-8B.json similarity index 100% rename from data/meta-llama_Llama-3.1-8B.json rename to data/models/meta-llama_Llama-3.1-8B.json diff --git a/data/meta-llama_Llama-3.2-1B-Instruct.json b/data/models/meta-llama_Llama-3.2-1B-Instruct.json similarity index 100% rename from data/meta-llama_Llama-3.2-1B-Instruct.json rename to data/models/meta-llama_Llama-3.2-1B-Instruct.json diff --git a/data/meta-llama_Llama-3.2-1B.json b/data/models/meta-llama_Llama-3.2-1B.json similarity index 100% rename from data/meta-llama_Llama-3.2-1B.json rename to data/models/meta-llama_Llama-3.2-1B.json diff --git a/data/meta-llama_Llama-3.2-3B-Instruct.json b/data/models/meta-llama_Llama-3.2-3B-Instruct.json similarity index 100% rename from data/meta-llama_Llama-3.2-3B-Instruct.json rename to data/models/meta-llama_Llama-3.2-3B-Instruct.json diff --git a/data/meta-llama_Llama-3.2-3B.json b/data/models/meta-llama_Llama-3.2-3B.json similarity index 100% rename from data/meta-llama_Llama-3.2-3B.json rename to data/models/meta-llama_Llama-3.2-3B.json diff --git a/data/meta-llama_Llama-3.3-70B-Instruct.json b/data/models/meta-llama_Llama-3.3-70B-Instruct.json similarity index 100% rename from data/meta-llama_Llama-3.3-70B-Instruct.json rename to data/models/meta-llama_Llama-3.3-70B-Instruct.json diff --git a/data/meta-llama_Meta-Llama-3-70B-Instruct.json b/data/models/meta-llama_Meta-Llama-3-70B-Instruct.json similarity index 100% rename from data/meta-llama_Meta-Llama-3-70B-Instruct.json rename to data/models/meta-llama_Meta-Llama-3-70B-Instruct.json diff --git a/data/meta-llama_Meta-Llama-3-70B.json b/data/models/meta-llama_Meta-Llama-3-70B.json similarity index 100% rename from data/meta-llama_Meta-Llama-3-70B.json rename to data/models/meta-llama_Meta-Llama-3-70B.json diff --git a/data/meta-llama_Meta-Llama-3-8B-Instruct.json b/data/models/meta-llama_Meta-Llama-3-8B-Instruct.json similarity index 100% rename from data/meta-llama_Meta-Llama-3-8B-Instruct.json rename to data/models/meta-llama_Meta-Llama-3-8B-Instruct.json index f945702a2b1910b55f1417925dcbc1ee75212cdf..62488b692c11a41f5cbe263dd82e987b279a3c64 100644 --- a/data/meta-llama_Meta-Llama-3-8B-Instruct.json +++ b/data/models/meta-llama_Meta-Llama-3-8B-Instruct.json @@ -41,7 +41,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4782 + "score": 0.7408 } }, { @@ -59,7 +59,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.491 + "score": 0.4989 } }, { @@ -77,7 +77,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0914 + "score": 0.0869 } }, { @@ -95,7 +95,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2928 + "score": 0.2592 } }, { @@ -113,7 +113,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3805 + "score": 0.3568 } }, { @@ -131,7 +131,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3591 + "score": 0.3664 } } ], @@ -171,7 +171,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7408 + "score": 0.4782 } }, { @@ -189,7 +189,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4989 + "score": 0.491 } }, { @@ -207,7 +207,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0869 + "score": 0.0914 } }, { @@ -225,7 +225,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2592 + "score": 0.2928 } }, { @@ -243,7 +243,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3568 + "score": 0.3805 } }, { @@ -261,7 +261,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3664 + "score": 0.3591 } } ], diff --git a/data/meta-llama_Meta-Llama-3-8B.json b/data/models/meta-llama_Meta-Llama-3-8B.json similarity index 100% rename from data/meta-llama_Meta-Llama-3-8B.json rename to data/models/meta-llama_Meta-Llama-3-8B.json diff --git a/data/meta-llama_Meta-Llama-3.1-405B-Instruct-Turbo.json b/data/models/meta-llama_Meta-Llama-3.1-405B-Instruct-Turbo.json similarity index 100% rename from data/meta-llama_Meta-Llama-3.1-405B-Instruct-Turbo.json rename to data/models/meta-llama_Meta-Llama-3.1-405B-Instruct-Turbo.json diff --git a/data/meta-llama_Meta-Llama-3.1-70B-Instruct-Turbo.json b/data/models/meta-llama_Meta-Llama-3.1-70B-Instruct-Turbo.json similarity index 100% rename from data/meta-llama_Meta-Llama-3.1-70B-Instruct-Turbo.json rename to data/models/meta-llama_Meta-Llama-3.1-70B-Instruct-Turbo.json diff --git a/data/meta-llama_Meta-Llama-3.1-70B-Instruct.json b/data/models/meta-llama_Meta-Llama-3.1-70B-Instruct.json similarity index 100% rename from data/meta-llama_Meta-Llama-3.1-70B-Instruct.json rename to data/models/meta-llama_Meta-Llama-3.1-70B-Instruct.json diff --git a/data/meta-llama_Meta-Llama-3.1-8B-Instruct-Turbo.json b/data/models/meta-llama_Meta-Llama-3.1-8B-Instruct-Turbo.json similarity index 100% rename from data/meta-llama_Meta-Llama-3.1-8B-Instruct-Turbo.json rename to data/models/meta-llama_Meta-Llama-3.1-8B-Instruct-Turbo.json diff --git a/data/meta-metrics_MetaMetrics-RM-v1.0.json b/data/models/meta-metrics_MetaMetrics-RM-v1.0.json similarity index 100% rename from data/meta-metrics_MetaMetrics-RM-v1.0.json rename to data/models/meta-metrics_MetaMetrics-RM-v1.0.json diff --git a/data/meta_LLaMA-13B.json b/data/models/meta_LLaMA-13B.json similarity index 100% rename from data/meta_LLaMA-13B.json rename to data/models/meta_LLaMA-13B.json diff --git a/data/meta_LLaMA-30B.json b/data/models/meta_LLaMA-30B.json similarity index 100% rename from data/meta_LLaMA-30B.json rename to data/models/meta_LLaMA-30B.json diff --git a/data/models/meta_LLaMA-65B.json b/data/models/meta_LLaMA-65B.json new file mode 100644 index 0000000000000000000000000000000000000000..1c4ba53d52c065d4fd8799872f80827119957366 --- /dev/null +++ b/data/models/meta_LLaMA-65B.json @@ -0,0 +1,674 @@ +{ + "model_info": { + "name": "LLaMA 65B", + "id": "meta/LLaMA-65B", + "developer": "meta", + "inference_platform": "unknown" + }, + "evaluations": [ + { + "evaluation_id": "helm_classic/meta_LLaMA-65B/1774096308.339228", + "retrieved_timestamp": "1774096308.339228", + "source_metadata": { + "source_name": "helm_classic", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "helm", + "version": "unknown" + }, + "benchmark": "helm_classic", + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.908, + "details": { + "description": "", + "tab": "Accuracy", + "Mean win rate - Calibration": "{\"description\": \"\", \"tab\": \"Calibration\", \"score\": \"\"}", + "Mean win rate - Robustness": "{\"description\": \"\", \"tab\": \"Robustness\", \"score\": \"0.8851981351981352\"}", + "Mean win rate - Fairness": "{\"description\": \"\", \"tab\": \"Fairness\", \"score\": \"0.9235431235431235\"}", + "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}", + "Mean win rate - Bias": "{\"description\": \"\", \"tab\": \"Bias\", \"score\": \"0.4059399223461723\"}", + "Mean win rate - Toxicity": "{\"description\": \"\", \"tab\": \"Toxicity\", \"score\": \"0.5910839160839161\"}", + "Mean win rate - Summarization metrics": "{\"description\": \"\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on MMLU", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.584, + "details": { + "description": "min=0.34, mean=0.584, max=0.89, sum=2.919 (5)", + "tab": "Accuracy", + "MMLU - ECE (10-bin)": "{\"description\": \"5 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "MMLU - EM (Robustness)": "{\"description\": \"min=0.27, mean=0.504, max=0.81, sum=2.518 (5)\", \"tab\": \"Robustness\", \"score\": \"0.5036842105263158\"}", + "MMLU - EM (Fairness)": "{\"description\": \"min=0.34, mean=0.551, max=0.84, sum=2.757 (5)\", \"tab\": \"Fairness\", \"score\": \"0.5514385964912281\"}", + "MMLU - Denoised inference time (s)": "{\"description\": \"5 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", + "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU - # prompt tokens": "{\"description\": \"min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)\", \"tab\": \"General information\", \"score\": \"522.5470877192982\"}", + "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "MMLU - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on BoolQ", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.871, + "details": { + "description": "min=0.871, mean=0.871, max=0.871, sum=0.871 (1)", + "tab": "Accuracy", + "BoolQ - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "BoolQ - EM (Robustness)": "{\"description\": \"min=0.84, mean=0.84, max=0.84, sum=0.84 (1)\", \"tab\": \"Robustness\", \"score\": \"0.84\"}", + "BoolQ - EM (Fairness)": "{\"description\": \"min=0.847, mean=0.847, max=0.847, sum=0.847 (1)\", \"tab\": \"Fairness\", \"score\": \"0.847\"}", + "BoolQ - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "BoolQ - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "BoolQ - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "BoolQ - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "BoolQ - # prompt tokens": "{\"description\": \"min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)\", \"tab\": \"General information\", \"score\": \"1439.447\"}", + "BoolQ - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "BoolQ - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "BoolQ - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Toxic fraction": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on NarrativeQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.755, + "details": { + "description": "min=0.755, mean=0.755, max=0.755, sum=0.755 (1)", + "tab": "Accuracy", + "NarrativeQA - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NarrativeQA - F1 (Robustness)": "{\"description\": \"min=0.567, mean=0.567, max=0.567, sum=0.567 (1)\", \"tab\": \"Robustness\", \"score\": \"0.5674436891870642\"}", + "NarrativeQA - F1 (Fairness)": "{\"description\": \"min=0.661, mean=0.661, max=0.661, sum=0.661 (1)\", \"tab\": \"Fairness\", \"score\": \"0.6614214785759094\"}", + "NarrativeQA - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", + "NarrativeQA - # train": "{\"description\": \"min=1.437, mean=1.437, max=1.437, sum=1.437 (1)\", \"tab\": \"General information\", \"score\": \"1.4366197183098592\"}", + "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NarrativeQA - # prompt tokens": "{\"description\": \"min=1541.115, mean=1541.115, max=1541.115, sum=1541.115 (1)\", \"tab\": \"General information\", \"score\": \"1541.1154929577465\"}", + "NarrativeQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NarrativeQA - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NarrativeQA - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NarrativeQA - Stereotypes (gender)": "{\"description\": \"min=0.5, mean=0.5, max=0.5, sum=0.5 (1)\", \"tab\": \"Bias\", \"score\": \"0.5\"}", + "NarrativeQA - Representation (race)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\", \"tab\": \"Bias\", \"score\": \"0.6666666666666667\"}", + "NarrativeQA - Representation (gender)": "{\"description\": \"min=0.198, mean=0.198, max=0.198, sum=0.198 (1)\", \"tab\": \"Bias\", \"score\": \"0.1981132075471698\"}", + "NarrativeQA - Toxic fraction": "{\"description\": \"min=0.008, mean=0.008, max=0.008, sum=0.008 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.008450704225352112\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on NaturalQuestions (open-book)", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.672, + "details": { + "description": "min=0.672, mean=0.672, max=0.672, sum=0.672 (1)", + "tab": "Accuracy", + "NaturalQuestions (closed-book) - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NaturalQuestions (open-book) - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - F1 (Robustness)": "{\"description\": \"min=0.388, mean=0.388, max=0.388, sum=0.388 (1)\", \"tab\": \"Robustness\", \"score\": \"0.3875883665002626\"}", + "NaturalQuestions (open-book) - F1 (Robustness)": "{\"description\": \"min=0.624, mean=0.624, max=0.624, sum=0.624 (1)\", \"tab\": \"Robustness\", \"score\": \"0.623794662165915\"}", + "NaturalQuestions (closed-book) - F1 (Fairness)": "{\"description\": \"min=0.375, mean=0.375, max=0.375, sum=0.375 (1)\", \"tab\": \"Fairness\", \"score\": \"0.3753249636782112\"}", + "NaturalQuestions (open-book) - F1 (Fairness)": "{\"description\": \"min=0.633, mean=0.633, max=0.633, sum=0.633 (1)\", \"tab\": \"Fairness\", \"score\": \"0.6326996444457361\"}", + "NaturalQuestions (closed-book) - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NaturalQuestions (open-book) - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=137.383, mean=137.383, max=137.383, sum=137.383 (1)\", \"tab\": \"General information\", \"score\": \"137.383\"}", + "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (closed-book) - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (open-book) - # train": "{\"description\": \"min=3.722, mean=3.722, max=3.722, sum=3.722 (1)\", \"tab\": \"General information\", \"score\": \"3.722\"}", + "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0.049, mean=0.049, max=0.049, sum=0.049 (1)\", \"tab\": \"General information\", \"score\": \"0.049\"}", + "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)\", \"tab\": \"General information\", \"score\": \"1407.178\"}", + "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=0.987, mean=0.987, max=0.987, sum=0.987 (1)\", \"tab\": \"General information\", \"score\": \"0.987\"}", + "NaturalQuestions (open-book) - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (closed-book) - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - Representation (race)": "{\"description\": \"min=0.352, mean=0.352, max=0.352, sum=0.352 (1)\", \"tab\": \"Bias\", \"score\": \"0.35238095238095235\"}", + "NaturalQuestions (closed-book) - Representation (gender)": "{\"description\": \"min=0.3, mean=0.3, max=0.3, sum=0.3 (1)\", \"tab\": \"Bias\", \"score\": \"0.30000000000000004\"}", + "NaturalQuestions (open-book) - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NaturalQuestions (open-book) - Stereotypes (gender)": "{\"description\": \"min=0.5, mean=0.5, max=0.5, sum=0.5 (1)\", \"tab\": \"Bias\", \"score\": \"0.5\"}", + "NaturalQuestions (open-book) - Representation (race)": "{\"description\": \"min=0.436, mean=0.436, max=0.436, sum=0.436 (1)\", \"tab\": \"Bias\", \"score\": \"0.4358974358974359\"}", + "NaturalQuestions (open-book) - Representation (gender)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.393 (1)\", \"tab\": \"Bias\", \"score\": \"0.3928571428571429\"}", + "NaturalQuestions (closed-book) - Toxic fraction": "{\"description\": \"min=0.001, mean=0.001, max=0.001, sum=0.001 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.001\"}", + "NaturalQuestions (open-book) - Toxic fraction": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on QuAC", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.401, + "details": { + "description": "min=0.401, mean=0.401, max=0.401, sum=0.401 (1)", + "tab": "Accuracy", + "QuAC - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "QuAC - F1 (Robustness)": "{\"description\": \"min=0.275, mean=0.275, max=0.275, sum=0.275 (1)\", \"tab\": \"Robustness\", \"score\": \"0.2748605351114493\"}", + "QuAC - F1 (Fairness)": "{\"description\": \"min=0.333, mean=0.333, max=0.333, sum=0.333 (1)\", \"tab\": \"Fairness\", \"score\": \"0.33296543407590734\"}", + "QuAC - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "QuAC - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "QuAC - # train": "{\"description\": \"min=0.507, mean=0.507, max=0.507, sum=0.507 (1)\", \"tab\": \"General information\", \"score\": \"0.507\"}", + "QuAC - truncated": "{\"description\": \"min=0.06, mean=0.06, max=0.06, sum=0.06 (1)\", \"tab\": \"General information\", \"score\": \"0.06\"}", + "QuAC - # prompt tokens": "{\"description\": \"min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)\", \"tab\": \"General information\", \"score\": \"1498.657\"}", + "QuAC - # output tokens": "{\"description\": \"min=0.997, mean=0.997, max=0.997, sum=0.997 (1)\", \"tab\": \"General information\", \"score\": \"0.997\"}", + "QuAC - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "QuAC - Stereotypes (race)": "{\"description\": \"min=0.621, mean=0.621, max=0.621, sum=0.621 (1)\", \"tab\": \"Bias\", \"score\": \"0.6210526315789473\"}", + "QuAC - Stereotypes (gender)": "{\"description\": \"min=0.394, mean=0.394, max=0.394, sum=0.394 (1)\", \"tab\": \"Bias\", \"score\": \"0.3944670750705233\"}", + "QuAC - Representation (race)": "{\"description\": \"min=0.38, mean=0.38, max=0.38, sum=0.38 (1)\", \"tab\": \"Bias\", \"score\": \"0.3804713804713804\"}", + "QuAC - Representation (gender)": "{\"description\": \"min=0.243, mean=0.243, max=0.243, sum=0.243 (1)\", \"tab\": \"Bias\", \"score\": \"0.24335260115606938\"}", + "QuAC - Toxic fraction": "{\"description\": \"min=0.003, mean=0.003, max=0.003, sum=0.003 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.003\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on HellaSwag", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "HellaSwag - ECE (10-bin)": "{\"description\": \"No matching runs\", \"tab\": \"Calibration\", \"score\": \"\"}", + "HellaSwag - EM (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "HellaSwag - EM (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "HellaSwag - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "HellaSwag - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on OpenbookQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "OpenbookQA - ECE (10-bin)": "{\"description\": \"No matching runs\", \"tab\": \"Calibration\", \"score\": \"\"}", + "OpenbookQA - EM (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "OpenbookQA - EM (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "OpenbookQA - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "OpenbookQA - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on TruthfulQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.508, + "details": { + "description": "min=0.508, mean=0.508, max=0.508, sum=0.508 (1)", + "tab": "Accuracy", + "TruthfulQA - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "TruthfulQA - EM (Robustness)": "{\"description\": \"min=0.448, mean=0.448, max=0.448, sum=0.448 (1)\", \"tab\": \"Robustness\", \"score\": \"0.44801223241590216\"}", + "TruthfulQA - EM (Fairness)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.42 (1)\", \"tab\": \"Fairness\", \"score\": \"0.42048929663608564\"}", + "TruthfulQA - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "TruthfulQA - # eval": "{\"description\": \"min=654, mean=654, max=654, sum=654 (1)\", \"tab\": \"General information\", \"score\": \"654.0\"}", + "TruthfulQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "TruthfulQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "TruthfulQA - # prompt tokens": "{\"description\": \"min=524.602, mean=524.602, max=524.602, sum=524.602 (1)\", \"tab\": \"General information\", \"score\": \"524.6024464831804\"}", + "TruthfulQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "TruthfulQA - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "MS MARCO (regular) - RR@10 (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "MS MARCO (TREC) - NDCG@10 (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "MS MARCO (regular) - RR@10 (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "MS MARCO (TREC) - NDCG@10 (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "MS MARCO (regular) - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MS MARCO (TREC) - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MS MARCO (regular) - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "MS MARCO (TREC) - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "ROUGE-2 on CNN/DailyMail", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "CNN/DailyMail - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "CNN/DailyMail - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "CNN/DailyMail - SummaC": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - QAFactEval": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - BERTScore (F1)": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Coverage": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Density": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Compression": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-faithfulness": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-relevance": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-coherence": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "ROUGE-2 on XSUM", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "XSUM - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "XSUM - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "XSUM - SummaC": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - QAFactEval": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - BERTScore (F1)": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Coverage": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Density": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Compression": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-faithfulness": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-relevance": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-coherence": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on IMDB", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.962, + "details": { + "description": "min=0.962, mean=0.962, max=0.962, sum=0.962 (1)", + "tab": "Accuracy", + "IMDB - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "IMDB - EM (Robustness)": "{\"description\": \"min=0.935, mean=0.935, max=0.935, sum=0.935 (1)\", \"tab\": \"Robustness\", \"score\": \"0.935\"}", + "IMDB - EM (Fairness)": "{\"description\": \"min=0.953, mean=0.953, max=0.953, sum=0.953 (1)\", \"tab\": \"Fairness\", \"score\": \"0.953\"}", + "IMDB - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "IMDB - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "IMDB - # train": "{\"description\": \"min=2.781, mean=2.781, max=2.781, sum=2.781 (1)\", \"tab\": \"General information\", \"score\": \"2.781\"}", + "IMDB - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "IMDB - # prompt tokens": "{\"description\": \"min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)\", \"tab\": \"General information\", \"score\": \"1751.213\"}", + "IMDB - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "IMDB - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "IMDB - Stereotypes (race)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Stereotypes (gender)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Representation (race)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Representation (gender)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Toxic fraction": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on CivilComments", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.655, + "details": { + "description": "min=0.395, mean=0.655, max=0.863, sum=11.783 (18)", + "tab": "Accuracy", + "CivilComments - ECE (10-bin)": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "CivilComments - EM (Robustness)": "{\"description\": \"min=0.247, mean=0.566, max=0.853, sum=10.188 (18)\", \"tab\": \"Robustness\", \"score\": \"0.565986035612513\"}", + "CivilComments - EM (Fairness)": "{\"description\": \"min=0.32, mean=0.574, max=0.8, sum=10.336 (18)\", \"tab\": \"Fairness\", \"score\": \"0.57420608635975\"}", + "CivilComments - Denoised inference time (s)": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "CivilComments - # eval": "{\"description\": \"min=74, mean=371.556, max=683, sum=6688 (18)\", \"tab\": \"General information\", \"score\": \"371.55555555555554\"}", + "CivilComments - # train": "{\"description\": \"min=5, mean=5, max=5, sum=90 (18)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "CivilComments - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (18)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "CivilComments - # prompt tokens": "{\"description\": \"min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)\", \"tab\": \"General information\", \"score\": \"855.2410378605821\"}", + "CivilComments - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=18 (18)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "CivilComments - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=18 (18)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "CivilComments - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Toxic fraction": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on RAFT", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.702, + "details": { + "description": "min=0.125, mean=0.702, max=0.975, sum=7.725 (11)", + "tab": "Accuracy", + "RAFT - ECE (10-bin)": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "RAFT - EM (Robustness)": "{\"description\": \"min=0, mean=0.655, max=0.975, sum=7.2 (11)\", \"tab\": \"Robustness\", \"score\": \"0.6545454545454545\"}", + "RAFT - EM (Fairness)": "{\"description\": \"min=0.075, mean=0.668, max=0.975, sum=7.35 (11)\", \"tab\": \"Fairness\", \"score\": \"0.6681818181818182\"}", + "RAFT - Denoised inference time (s)": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "RAFT - # eval": "{\"description\": \"min=40, mean=40, max=40, sum=440 (11)\", \"tab\": \"General information\", \"score\": \"40.0\"}", + "RAFT - # train": "{\"description\": \"min=0.45, mean=4.552, max=5, sum=50.075 (11)\", \"tab\": \"General information\", \"score\": \"4.552272727272727\"}", + "RAFT - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (11)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "RAFT - # prompt tokens": "{\"description\": \"min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)\", \"tab\": \"General information\", \"score\": \"954.1113636363635\"}", + "RAFT - # output tokens": "{\"description\": \"min=0.8, mean=0.982, max=1, sum=10.8 (11)\", \"tab\": \"General information\", \"score\": \"0.9818181818181819\"}", + "RAFT - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=11 (11)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "RAFT - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Toxic fraction": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/meta_LLaMA-7B.json b/data/models/meta_LLaMA-7B.json similarity index 100% rename from data/meta_LLaMA-7B.json rename to data/models/meta_LLaMA-7B.json diff --git a/data/models/meta_Llama-2-13B.json b/data/models/meta_Llama-2-13B.json new file mode 100644 index 0000000000000000000000000000000000000000..ea745ae3071913358288054c176dad53886d7e4c --- /dev/null +++ b/data/models/meta_Llama-2-13B.json @@ -0,0 +1,674 @@ +{ + "model_info": { + "name": "Llama 2 13B", + "id": "meta/Llama-2-13B", + "developer": "meta", + "inference_platform": "unknown" + }, + "evaluations": [ + { + "evaluation_id": "helm_classic/meta_Llama-2-13B/1774096308.339228", + "retrieved_timestamp": "1774096308.339228", + "source_metadata": { + "source_name": "helm_classic", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "helm", + "version": "unknown" + }, + "benchmark": "helm_classic", + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.823, + "details": { + "description": "", + "tab": "Accuracy", + "Mean win rate - Calibration": "{\"description\": \"\", \"tab\": \"Calibration\", \"score\": \"\"}", + "Mean win rate - Robustness": "{\"description\": \"\", \"tab\": \"Robustness\", \"score\": \"0.8231701631701632\"}", + "Mean win rate - Fairness": "{\"description\": \"\", \"tab\": \"Fairness\", \"score\": \"0.8078088578088578\"}", + "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}", + "Mean win rate - Bias": "{\"description\": \"\", \"tab\": \"Bias\", \"score\": \"0.46948265409803874\"}", + "Mean win rate - Toxicity": "{\"description\": \"\", \"tab\": \"Toxicity\", \"score\": \"0.4142191142191142\"}", + "Mean win rate - Summarization metrics": "{\"description\": \"\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on MMLU", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.507, + "details": { + "description": "min=0.28, mean=0.507, max=0.84, sum=2.533 (5)", + "tab": "Accuracy", + "MMLU - ECE (10-bin)": "{\"description\": \"5 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "MMLU - EM (Robustness)": "{\"description\": \"min=0.22, mean=0.444, max=0.76, sum=2.222 (5)\", \"tab\": \"Robustness\", \"score\": \"0.44438596491228066\"}", + "MMLU - EM (Fairness)": "{\"description\": \"min=0.26, mean=0.466, max=0.79, sum=2.331 (5)\", \"tab\": \"Fairness\", \"score\": \"0.46614035087719297\"}", + "MMLU - Denoised inference time (s)": "{\"description\": \"5 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", + "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU - # prompt tokens": "{\"description\": \"min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)\", \"tab\": \"General information\", \"score\": \"522.5470877192982\"}", + "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "MMLU - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on BoolQ", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.811, + "details": { + "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", + "tab": "Accuracy", + "BoolQ - ECE (10-bin)": "{\"description\": \"min=0.116, mean=0.116, max=0.116, sum=0.116 (1)\", \"tab\": \"Calibration\", \"score\": \"\"}", + "BoolQ - EM (Robustness)": "{\"description\": \"min=0.753, mean=0.753, max=0.753, sum=0.753 (1)\", \"tab\": \"Robustness\", \"score\": \"0.753\"}", + "BoolQ - EM (Fairness)": "{\"description\": \"min=0.732, mean=0.732, max=0.732, sum=0.732 (1)\", \"tab\": \"Fairness\", \"score\": \"0.732\"}", + "BoolQ - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "BoolQ - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "BoolQ - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "BoolQ - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "BoolQ - # prompt tokens": "{\"description\": \"min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)\", \"tab\": \"General information\", \"score\": \"1439.447\"}", + "BoolQ - # output tokens": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "BoolQ - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "BoolQ - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Toxic fraction": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on NarrativeQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.744, + "details": { + "description": "min=0.744, mean=0.744, max=0.744, sum=0.744 (1)", + "tab": "Accuracy", + "NarrativeQA - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NarrativeQA - F1 (Robustness)": "{\"description\": \"min=0.682, mean=0.682, max=0.682, sum=0.682 (1)\", \"tab\": \"Robustness\", \"score\": \"0.681791424099214\"}", + "NarrativeQA - F1 (Fairness)": "{\"description\": \"min=0.657, mean=0.657, max=0.657, sum=0.657 (1)\", \"tab\": \"Fairness\", \"score\": \"0.6567284210865421\"}", + "NarrativeQA - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", + "NarrativeQA - # train": "{\"description\": \"min=4.414, mean=4.414, max=4.414, sum=4.414 (1)\", \"tab\": \"General information\", \"score\": \"4.414084507042253\"}", + "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NarrativeQA - # prompt tokens": "{\"description\": \"min=3673.268, mean=3673.268, max=3673.268, sum=3673.268 (1)\", \"tab\": \"General information\", \"score\": \"3673.2676056338028\"}", + "NarrativeQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NarrativeQA - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NarrativeQA - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NarrativeQA - Stereotypes (gender)": "{\"description\": \"min=0.417, mean=0.417, max=0.417, sum=0.417 (1)\", \"tab\": \"Bias\", \"score\": \"0.4166666666666667\"}", + "NarrativeQA - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NarrativeQA - Representation (gender)": "{\"description\": \"min=0.218, mean=0.218, max=0.218, sum=0.218 (1)\", \"tab\": \"Bias\", \"score\": \"0.21830985915492954\"}", + "NarrativeQA - Toxic fraction": "{\"description\": \"min=0.014, mean=0.014, max=0.014, sum=0.014 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.014084507042253521\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on NaturalQuestions (open-book)", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.637, + "details": { + "description": "min=0.637, mean=0.637, max=0.637, sum=0.637 (1)", + "tab": "Accuracy", + "NaturalQuestions (closed-book) - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NaturalQuestions (open-book) - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - F1 (Robustness)": "{\"description\": \"min=0.324, mean=0.324, max=0.324, sum=0.324 (1)\", \"tab\": \"Robustness\", \"score\": \"0.3243542710528751\"}", + "NaturalQuestions (open-book) - F1 (Robustness)": "{\"description\": \"min=0.563, mean=0.563, max=0.563, sum=0.563 (1)\", \"tab\": \"Robustness\", \"score\": \"0.5631882717621935\"}", + "NaturalQuestions (closed-book) - F1 (Fairness)": "{\"description\": \"min=0.309, mean=0.309, max=0.309, sum=0.309 (1)\", \"tab\": \"Fairness\", \"score\": \"0.30927547433853436\"}", + "NaturalQuestions (open-book) - F1 (Fairness)": "{\"description\": \"min=0.58, mean=0.58, max=0.58, sum=0.58 (1)\", \"tab\": \"Fairness\", \"score\": \"0.5801102053016279\"}", + "NaturalQuestions (closed-book) - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NaturalQuestions (open-book) - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=137.383, mean=137.383, max=137.383, sum=137.383 (1)\", \"tab\": \"General information\", \"score\": \"137.383\"}", + "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (closed-book) - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (open-book) - # train": "{\"description\": \"min=4.831, mean=4.831, max=4.831, sum=4.831 (1)\", \"tab\": \"General information\", \"score\": \"4.831\"}", + "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0.026, mean=0.026, max=0.026, sum=0.026 (1)\", \"tab\": \"General information\", \"score\": \"0.026\"}", + "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=2289.409, mean=2289.409, max=2289.409, sum=2289.409 (1)\", \"tab\": \"General information\", \"score\": \"2289.409\"}", + "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=0.984, mean=0.984, max=0.984, sum=0.984 (1)\", \"tab\": \"General information\", \"score\": \"0.984\"}", + "NaturalQuestions (open-book) - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (closed-book) - Stereotypes (race)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\", \"tab\": \"Bias\", \"score\": \"0.6666666666666667\"}", + "NaturalQuestions (closed-book) - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - Representation (race)": "{\"description\": \"min=0.521, mean=0.521, max=0.521, sum=0.521 (1)\", \"tab\": \"Bias\", \"score\": \"0.5205992509363295\"}", + "NaturalQuestions (closed-book) - Representation (gender)": "{\"description\": \"min=0.15, mean=0.15, max=0.15, sum=0.15 (1)\", \"tab\": \"Bias\", \"score\": \"0.15000000000000002\"}", + "NaturalQuestions (open-book) - Stereotypes (race)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\", \"tab\": \"Bias\", \"score\": \"0.6666666666666667\"}", + "NaturalQuestions (open-book) - Stereotypes (gender)": "{\"description\": \"min=0.5, mean=0.5, max=0.5, sum=0.5 (1)\", \"tab\": \"Bias\", \"score\": \"0.5\"}", + "NaturalQuestions (open-book) - Representation (race)": "{\"description\": \"min=0.467, mean=0.467, max=0.467, sum=0.467 (1)\", \"tab\": \"Bias\", \"score\": \"0.4666666666666667\"}", + "NaturalQuestions (open-book) - Representation (gender)": "{\"description\": \"min=0.357, mean=0.357, max=0.357, sum=0.357 (1)\", \"tab\": \"Bias\", \"score\": \"0.3571428571428571\"}", + "NaturalQuestions (closed-book) - Toxic fraction": "{\"description\": \"min=0.001, mean=0.001, max=0.001, sum=0.001 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.001\"}", + "NaturalQuestions (open-book) - Toxic fraction": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on QuAC", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.424, + "details": { + "description": "min=0.424, mean=0.424, max=0.424, sum=0.424 (1)", + "tab": "Accuracy", + "QuAC - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "QuAC - F1 (Robustness)": "{\"description\": \"min=0.294, mean=0.294, max=0.294, sum=0.294 (1)\", \"tab\": \"Robustness\", \"score\": \"0.2939019916232739\"}", + "QuAC - F1 (Fairness)": "{\"description\": \"min=0.351, mean=0.351, max=0.351, sum=0.351 (1)\", \"tab\": \"Fairness\", \"score\": \"0.35074944218906556\"}", + "QuAC - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "QuAC - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "QuAC - # train": "{\"description\": \"min=3.204, mean=3.204, max=3.204, sum=3.204 (1)\", \"tab\": \"General information\", \"score\": \"3.204\"}", + "QuAC - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "QuAC - # prompt tokens": "{\"description\": \"min=3617.038, mean=3617.038, max=3617.038, sum=3617.038 (1)\", \"tab\": \"General information\", \"score\": \"3617.038\"}", + "QuAC - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "QuAC - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "QuAC - Stereotypes (race)": "{\"description\": \"min=0.549, mean=0.549, max=0.549, sum=0.549 (1)\", \"tab\": \"Bias\", \"score\": \"0.5485347985347986\"}", + "QuAC - Stereotypes (gender)": "{\"description\": \"min=0.392, mean=0.392, max=0.392, sum=0.392 (1)\", \"tab\": \"Bias\", \"score\": \"0.39214643381310055\"}", + "QuAC - Representation (race)": "{\"description\": \"min=0.325, mean=0.325, max=0.325, sum=0.325 (1)\", \"tab\": \"Bias\", \"score\": \"0.3248945147679325\"}", + "QuAC - Representation (gender)": "{\"description\": \"min=0.242, mean=0.242, max=0.242, sum=0.242 (1)\", \"tab\": \"Bias\", \"score\": \"0.24197860962566847\"}", + "QuAC - Toxic fraction": "{\"description\": \"min=0.004, mean=0.004, max=0.004, sum=0.004 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.004\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on HellaSwag", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "HellaSwag - ECE (10-bin)": "{\"description\": \"No matching runs\", \"tab\": \"Calibration\", \"score\": \"\"}", + "HellaSwag - EM (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "HellaSwag - EM (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "HellaSwag - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "HellaSwag - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on OpenbookQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "OpenbookQA - ECE (10-bin)": "{\"description\": \"No matching runs\", \"tab\": \"Calibration\", \"score\": \"\"}", + "OpenbookQA - EM (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "OpenbookQA - EM (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "OpenbookQA - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "OpenbookQA - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on TruthfulQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.33, + "details": { + "description": "min=0.33, mean=0.33, max=0.33, sum=0.33 (1)", + "tab": "Accuracy", + "TruthfulQA - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "TruthfulQA - EM (Robustness)": "{\"description\": \"min=0.287, mean=0.287, max=0.287, sum=0.287 (1)\", \"tab\": \"Robustness\", \"score\": \"0.2874617737003058\"}", + "TruthfulQA - EM (Fairness)": "{\"description\": \"min=0.274, mean=0.274, max=0.274, sum=0.274 (1)\", \"tab\": \"Fairness\", \"score\": \"0.27370030581039756\"}", + "TruthfulQA - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "TruthfulQA - # eval": "{\"description\": \"min=654, mean=654, max=654, sum=654 (1)\", \"tab\": \"General information\", \"score\": \"654.0\"}", + "TruthfulQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "TruthfulQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "TruthfulQA - # prompt tokens": "{\"description\": \"min=524.602, mean=524.602, max=524.602, sum=524.602 (1)\", \"tab\": \"General information\", \"score\": \"524.6024464831804\"}", + "TruthfulQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "TruthfulQA - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "MS MARCO (regular) - RR@10 (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "MS MARCO (TREC) - NDCG@10 (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "MS MARCO (regular) - RR@10 (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "MS MARCO (TREC) - NDCG@10 (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "MS MARCO (regular) - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MS MARCO (TREC) - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MS MARCO (regular) - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "MS MARCO (TREC) - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "ROUGE-2 on CNN/DailyMail", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "CNN/DailyMail - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "CNN/DailyMail - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "CNN/DailyMail - SummaC": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - QAFactEval": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - BERTScore (F1)": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Coverage": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Density": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Compression": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-faithfulness": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-relevance": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-coherence": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "ROUGE-2 on XSUM", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "XSUM - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "XSUM - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "XSUM - SummaC": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - QAFactEval": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - BERTScore (F1)": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Coverage": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Density": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Compression": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-faithfulness": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-relevance": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-coherence": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on IMDB", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.962, + "details": { + "description": "min=0.962, mean=0.962, max=0.962, sum=0.962 (1)", + "tab": "Accuracy", + "IMDB - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "IMDB - EM (Robustness)": "{\"description\": \"min=0.954, mean=0.954, max=0.954, sum=0.954 (1)\", \"tab\": \"Robustness\", \"score\": \"0.954\"}", + "IMDB - EM (Fairness)": "{\"description\": \"min=0.957, mean=0.957, max=0.957, sum=0.957 (1)\", \"tab\": \"Fairness\", \"score\": \"0.957\"}", + "IMDB - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "IMDB - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "IMDB - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "IMDB - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "IMDB - # prompt tokens": "{\"description\": \"min=2897.409, mean=2897.409, max=2897.409, sum=2897.409 (1)\", \"tab\": \"General information\", \"score\": \"2897.409\"}", + "IMDB - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "IMDB - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "IMDB - Stereotypes (race)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Stereotypes (gender)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Representation (race)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Representation (gender)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Toxic fraction": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on CivilComments", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.588, + "details": { + "description": "min=0.087, mean=0.588, max=0.968, sum=10.579 (18)", + "tab": "Accuracy", + "CivilComments - ECE (10-bin)": "{\"description\": \"min=0.098, mean=0.323, max=0.788, sum=4.519 (14)\", \"tab\": \"Calibration\", \"score\": \"\"}", + "CivilComments - EM (Robustness)": "{\"description\": \"min=0.022, mean=0.47, max=0.958, sum=8.468 (18)\", \"tab\": \"Robustness\", \"score\": \"0.47042658911281887\"}", + "CivilComments - EM (Fairness)": "{\"description\": \"min=0.006, mean=0.489, max=0.968, sum=8.81 (18)\", \"tab\": \"Fairness\", \"score\": \"0.4894481246425394\"}", + "CivilComments - Denoised inference time (s)": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "CivilComments - # eval": "{\"description\": \"min=74, mean=371.556, max=683, sum=6688 (18)\", \"tab\": \"General information\", \"score\": \"371.55555555555554\"}", + "CivilComments - # train": "{\"description\": \"min=5, mean=5, max=5, sum=90 (18)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "CivilComments - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (18)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "CivilComments - # prompt tokens": "{\"description\": \"min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)\", \"tab\": \"General information\", \"score\": \"855.2410378605821\"}", + "CivilComments - # output tokens": "{\"description\": \"min=1, mean=2.692, max=5, sum=48.448 (18)\", \"tab\": \"General information\", \"score\": \"2.6915388744093813\"}", + "CivilComments - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=18 (18)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "CivilComments - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Toxic fraction": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on RAFT", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.707, + "details": { + "description": "min=0.1, mean=0.707, max=0.975, sum=7.775 (11)", + "tab": "Accuracy", + "RAFT - ECE (10-bin)": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "RAFT - EM (Robustness)": "{\"description\": \"min=0.05, mean=0.652, max=0.95, sum=7.175 (11)\", \"tab\": \"Robustness\", \"score\": \"0.6522727272727272\"}", + "RAFT - EM (Fairness)": "{\"description\": \"min=0.075, mean=0.673, max=0.975, sum=7.4 (11)\", \"tab\": \"Fairness\", \"score\": \"0.6727272727272727\"}", + "RAFT - Denoised inference time (s)": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "RAFT - # eval": "{\"description\": \"min=40, mean=40, max=40, sum=440 (11)\", \"tab\": \"General information\", \"score\": \"40.0\"}", + "RAFT - # train": "{\"description\": \"min=2.575, mean=4.78, max=5, sum=52.575 (11)\", \"tab\": \"General information\", \"score\": \"4.779545454545455\"}", + "RAFT - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (11)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "RAFT - # prompt tokens": "{\"description\": \"min=303.675, mean=1153.852, max=3623.9, sum=12692.375 (11)\", \"tab\": \"General information\", \"score\": \"1153.8522727272727\"}", + "RAFT - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=11 (11)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "RAFT - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=11 (11)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "RAFT - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Toxic fraction": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/models/meta_Llama-2-70B.json b/data/models/meta_Llama-2-70B.json new file mode 100644 index 0000000000000000000000000000000000000000..62f379108db2c045364f198d02e05be28ea70409 --- /dev/null +++ b/data/models/meta_Llama-2-70B.json @@ -0,0 +1,674 @@ +{ + "model_info": { + "name": "Llama 2 70B", + "id": "meta/Llama-2-70B", + "developer": "meta", + "inference_platform": "unknown" + }, + "evaluations": [ + { + "evaluation_id": "helm_classic/meta_Llama-2-70B/1774096308.339228", + "retrieved_timestamp": "1774096308.339228", + "source_metadata": { + "source_name": "helm_classic", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "helm", + "version": "unknown" + }, + "benchmark": "helm_classic", + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.944, + "details": { + "description": "", + "tab": "Accuracy", + "Mean win rate - Calibration": "{\"description\": \"\", \"tab\": \"Calibration\", \"score\": \"\"}", + "Mean win rate - Robustness": "{\"description\": \"\", \"tab\": \"Robustness\", \"score\": \"0.9649184149184149\"}", + "Mean win rate - Fairness": "{\"description\": \"\", \"tab\": \"Fairness\", \"score\": \"0.9587645687645687\"}", + "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}", + "Mean win rate - Bias": "{\"description\": \"\", \"tab\": \"Bias\", \"score\": \"0.5375895851224799\"}", + "Mean win rate - Toxicity": "{\"description\": \"\", \"tab\": \"Toxicity\", \"score\": \"0.643006993006993\"}", + "Mean win rate - Summarization metrics": "{\"description\": \"\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on MMLU", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.582, + "details": { + "description": "min=0.29, mean=0.582, max=0.92, sum=2.909 (5)", + "tab": "Accuracy", + "MMLU - ECE (10-bin)": "{\"description\": \"5 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "MMLU - EM (Robustness)": "{\"description\": \"min=0.22, mean=0.545, max=0.9, sum=2.726 (5)\", \"tab\": \"Robustness\", \"score\": \"0.5451929824561403\"}", + "MMLU - EM (Fairness)": "{\"description\": \"min=0.26, mean=0.557, max=0.91, sum=2.786 (5)\", \"tab\": \"Fairness\", \"score\": \"0.5571929824561404\"}", + "MMLU - Denoised inference time (s)": "{\"description\": \"5 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", + "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU - # prompt tokens": "{\"description\": \"min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)\", \"tab\": \"General information\", \"score\": \"522.5470877192982\"}", + "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "MMLU - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on BoolQ", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.886, + "details": { + "description": "min=0.886, mean=0.886, max=0.886, sum=0.886 (1)", + "tab": "Accuracy", + "BoolQ - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "BoolQ - EM (Robustness)": "{\"description\": \"min=0.863, mean=0.863, max=0.863, sum=0.863 (1)\", \"tab\": \"Robustness\", \"score\": \"0.863\"}", + "BoolQ - EM (Fairness)": "{\"description\": \"min=0.859, mean=0.859, max=0.859, sum=0.859 (1)\", \"tab\": \"Fairness\", \"score\": \"0.859\"}", + "BoolQ - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "BoolQ - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "BoolQ - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "BoolQ - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "BoolQ - # prompt tokens": "{\"description\": \"min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)\", \"tab\": \"General information\", \"score\": \"1439.447\"}", + "BoolQ - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "BoolQ - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "BoolQ - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Toxic fraction": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on NarrativeQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.77, + "details": { + "description": "min=0.77, mean=0.77, max=0.77, sum=0.77 (1)", + "tab": "Accuracy", + "NarrativeQA - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NarrativeQA - F1 (Robustness)": "{\"description\": \"min=0.722, mean=0.722, max=0.722, sum=0.722 (1)\", \"tab\": \"Robustness\", \"score\": \"0.7215317388650366\"}", + "NarrativeQA - F1 (Fairness)": "{\"description\": \"min=0.709, mean=0.709, max=0.709, sum=0.709 (1)\", \"tab\": \"Fairness\", \"score\": \"0.709497495841271\"}", + "NarrativeQA - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", + "NarrativeQA - # train": "{\"description\": \"min=4.414, mean=4.414, max=4.414, sum=4.414 (1)\", \"tab\": \"General information\", \"score\": \"4.414084507042253\"}", + "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NarrativeQA - # prompt tokens": "{\"description\": \"min=3673.268, mean=3673.268, max=3673.268, sum=3673.268 (1)\", \"tab\": \"General information\", \"score\": \"3673.2676056338028\"}", + "NarrativeQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NarrativeQA - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NarrativeQA - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NarrativeQA - Stereotypes (gender)": "{\"description\": \"min=0.5, mean=0.5, max=0.5, sum=0.5 (1)\", \"tab\": \"Bias\", \"score\": \"0.5\"}", + "NarrativeQA - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NarrativeQA - Representation (gender)": "{\"description\": \"min=0.187, mean=0.187, max=0.187, sum=0.187 (1)\", \"tab\": \"Bias\", \"score\": \"0.18695652173913044\"}", + "NarrativeQA - Toxic fraction": "{\"description\": \"min=0.008, mean=0.008, max=0.008, sum=0.008 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.008450704225352112\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on NaturalQuestions (open-book)", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.674, + "details": { + "description": "min=0.674, mean=0.674, max=0.674, sum=0.674 (1)", + "tab": "Accuracy", + "NaturalQuestions (closed-book) - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NaturalQuestions (open-book) - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - F1 (Robustness)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.42 (1)\", \"tab\": \"Robustness\", \"score\": \"0.42009390434309946\"}", + "NaturalQuestions (open-book) - F1 (Robustness)": "{\"description\": \"min=0.639, mean=0.639, max=0.639, sum=0.639 (1)\", \"tab\": \"Robustness\", \"score\": \"0.6385366212170214\"}", + "NaturalQuestions (closed-book) - F1 (Fairness)": "{\"description\": \"min=0.4, mean=0.4, max=0.4, sum=0.4 (1)\", \"tab\": \"Fairness\", \"score\": \"0.3997609830959401\"}", + "NaturalQuestions (open-book) - F1 (Fairness)": "{\"description\": \"min=0.637, mean=0.637, max=0.637, sum=0.637 (1)\", \"tab\": \"Fairness\", \"score\": \"0.6365724774019619\"}", + "NaturalQuestions (closed-book) - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NaturalQuestions (open-book) - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=137.383, mean=137.383, max=137.383, sum=137.383 (1)\", \"tab\": \"General information\", \"score\": \"137.383\"}", + "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (closed-book) - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (open-book) - # train": "{\"description\": \"min=4.831, mean=4.831, max=4.831, sum=4.831 (1)\", \"tab\": \"General information\", \"score\": \"4.831\"}", + "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0.026, mean=0.026, max=0.026, sum=0.026 (1)\", \"tab\": \"General information\", \"score\": \"0.026\"}", + "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=2289.409, mean=2289.409, max=2289.409, sum=2289.409 (1)\", \"tab\": \"General information\", \"score\": \"2289.409\"}", + "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=0.998, mean=0.998, max=0.998, sum=0.998 (1)\", \"tab\": \"General information\", \"score\": \"0.998\"}", + "NaturalQuestions (open-book) - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (closed-book) - Stereotypes (race)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\", \"tab\": \"Bias\", \"score\": \"0.6666666666666666\"}", + "NaturalQuestions (closed-book) - Stereotypes (gender)": "{\"description\": \"min=0.167, mean=0.167, max=0.167, sum=0.167 (1)\", \"tab\": \"Bias\", \"score\": \"0.16666666666666666\"}", + "NaturalQuestions (closed-book) - Representation (race)": "{\"description\": \"min=0.524, mean=0.524, max=0.524, sum=0.524 (1)\", \"tab\": \"Bias\", \"score\": \"0.5238095238095237\"}", + "NaturalQuestions (closed-book) - Representation (gender)": "{\"description\": \"min=0.312, mean=0.312, max=0.312, sum=0.312 (1)\", \"tab\": \"Bias\", \"score\": \"0.3125\"}", + "NaturalQuestions (open-book) - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NaturalQuestions (open-book) - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NaturalQuestions (open-book) - Representation (race)": "{\"description\": \"min=0.566, mean=0.566, max=0.566, sum=0.566 (1)\", \"tab\": \"Bias\", \"score\": \"0.5655430711610487\"}", + "NaturalQuestions (open-book) - Representation (gender)": "{\"description\": \"min=0.184, mean=0.184, max=0.184, sum=0.184 (1)\", \"tab\": \"Bias\", \"score\": \"0.1842105263157895\"}", + "NaturalQuestions (closed-book) - Toxic fraction": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.0\"}", + "NaturalQuestions (open-book) - Toxic fraction": "{\"description\": \"min=0.002, mean=0.002, max=0.002, sum=0.002 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.002\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on QuAC", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.484, + "details": { + "description": "min=0.484, mean=0.484, max=0.484, sum=0.484 (1)", + "tab": "Accuracy", + "QuAC - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "QuAC - F1 (Robustness)": "{\"description\": \"min=0.362, mean=0.362, max=0.362, sum=0.362 (1)\", \"tab\": \"Robustness\", \"score\": \"0.36189050917141447\"}", + "QuAC - F1 (Fairness)": "{\"description\": \"min=0.414, mean=0.414, max=0.414, sum=0.414 (1)\", \"tab\": \"Fairness\", \"score\": \"0.4139340894194124\"}", + "QuAC - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "QuAC - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "QuAC - # train": "{\"description\": \"min=3.204, mean=3.204, max=3.204, sum=3.204 (1)\", \"tab\": \"General information\", \"score\": \"3.204\"}", + "QuAC - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "QuAC - # prompt tokens": "{\"description\": \"min=3617.038, mean=3617.038, max=3617.038, sum=3617.038 (1)\", \"tab\": \"General information\", \"score\": \"3617.038\"}", + "QuAC - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "QuAC - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "QuAC - Stereotypes (race)": "{\"description\": \"min=0.611, mean=0.611, max=0.611, sum=0.611 (1)\", \"tab\": \"Bias\", \"score\": \"0.6111111111111112\"}", + "QuAC - Stereotypes (gender)": "{\"description\": \"min=0.403, mean=0.403, max=0.403, sum=0.403 (1)\", \"tab\": \"Bias\", \"score\": \"0.4025455927051672\"}", + "QuAC - Representation (race)": "{\"description\": \"min=0.272, mean=0.272, max=0.272, sum=0.272 (1)\", \"tab\": \"Bias\", \"score\": \"0.27183271832718325\"}", + "QuAC - Representation (gender)": "{\"description\": \"min=0.239, mean=0.239, max=0.239, sum=0.239 (1)\", \"tab\": \"Bias\", \"score\": \"0.23913043478260873\"}", + "QuAC - Toxic fraction": "{\"description\": \"min=0.001, mean=0.001, max=0.001, sum=0.001 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.001\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on HellaSwag", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "HellaSwag - ECE (10-bin)": "{\"description\": \"No matching runs\", \"tab\": \"Calibration\", \"score\": \"\"}", + "HellaSwag - EM (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "HellaSwag - EM (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "HellaSwag - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "HellaSwag - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on OpenbookQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "OpenbookQA - ECE (10-bin)": "{\"description\": \"No matching runs\", \"tab\": \"Calibration\", \"score\": \"\"}", + "OpenbookQA - EM (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "OpenbookQA - EM (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "OpenbookQA - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "OpenbookQA - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on TruthfulQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.554, + "details": { + "description": "min=0.554, mean=0.554, max=0.554, sum=0.554 (1)", + "tab": "Accuracy", + "TruthfulQA - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "TruthfulQA - EM (Robustness)": "{\"description\": \"min=0.468, mean=0.468, max=0.468, sum=0.468 (1)\", \"tab\": \"Robustness\", \"score\": \"0.46788990825688076\"}", + "TruthfulQA - EM (Fairness)": "{\"description\": \"min=0.434, mean=0.434, max=0.434, sum=0.434 (1)\", \"tab\": \"Fairness\", \"score\": \"0.43425076452599387\"}", + "TruthfulQA - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "TruthfulQA - # eval": "{\"description\": \"min=654, mean=654, max=654, sum=654 (1)\", \"tab\": \"General information\", \"score\": \"654.0\"}", + "TruthfulQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "TruthfulQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "TruthfulQA - # prompt tokens": "{\"description\": \"min=524.602, mean=524.602, max=524.602, sum=524.602 (1)\", \"tab\": \"General information\", \"score\": \"524.6024464831804\"}", + "TruthfulQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "TruthfulQA - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "MS MARCO (regular) - RR@10 (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "MS MARCO (TREC) - NDCG@10 (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "MS MARCO (regular) - RR@10 (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "MS MARCO (TREC) - NDCG@10 (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "MS MARCO (regular) - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MS MARCO (TREC) - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MS MARCO (regular) - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "MS MARCO (TREC) - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "ROUGE-2 on CNN/DailyMail", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "CNN/DailyMail - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "CNN/DailyMail - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "CNN/DailyMail - SummaC": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - QAFactEval": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - BERTScore (F1)": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Coverage": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Density": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Compression": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-faithfulness": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-relevance": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-coherence": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "ROUGE-2 on XSUM", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "XSUM - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "XSUM - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "XSUM - SummaC": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - QAFactEval": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - BERTScore (F1)": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Coverage": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Density": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Compression": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-faithfulness": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-relevance": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-coherence": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on IMDB", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.961, + "details": { + "description": "min=0.961, mean=0.961, max=0.961, sum=0.961 (1)", + "tab": "Accuracy", + "IMDB - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "IMDB - EM (Robustness)": "{\"description\": \"min=0.949, mean=0.949, max=0.949, sum=0.949 (1)\", \"tab\": \"Robustness\", \"score\": \"0.949\"}", + "IMDB - EM (Fairness)": "{\"description\": \"min=0.954, mean=0.954, max=0.954, sum=0.954 (1)\", \"tab\": \"Fairness\", \"score\": \"0.954\"}", + "IMDB - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "IMDB - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "IMDB - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "IMDB - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "IMDB - # prompt tokens": "{\"description\": \"min=2897.409, mean=2897.409, max=2897.409, sum=2897.409 (1)\", \"tab\": \"General information\", \"score\": \"2897.409\"}", + "IMDB - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "IMDB - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "IMDB - Stereotypes (race)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Stereotypes (gender)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Representation (race)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Representation (gender)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Toxic fraction": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on CivilComments", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.652, + "details": { + "description": "min=0.337, mean=0.652, max=0.919, sum=11.733 (18)", + "tab": "Accuracy", + "CivilComments - ECE (10-bin)": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "CivilComments - EM (Robustness)": "{\"description\": \"min=0.272, mean=0.59, max=0.884, sum=10.619 (18)\", \"tab\": \"Robustness\", \"score\": \"0.5899239945803259\"}", + "CivilComments - EM (Fairness)": "{\"description\": \"min=0.125, mean=0.551, max=0.892, sum=9.924 (18)\", \"tab\": \"Fairness\", \"score\": \"0.551334119704094\"}", + "CivilComments - Denoised inference time (s)": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "CivilComments - # eval": "{\"description\": \"min=74, mean=371.556, max=683, sum=6688 (18)\", \"tab\": \"General information\", \"score\": \"371.55555555555554\"}", + "CivilComments - # train": "{\"description\": \"min=5, mean=5, max=5, sum=90 (18)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "CivilComments - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (18)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "CivilComments - # prompt tokens": "{\"description\": \"min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)\", \"tab\": \"General information\", \"score\": \"855.2410378605821\"}", + "CivilComments - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=18 (18)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "CivilComments - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=18 (18)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "CivilComments - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Toxic fraction": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on RAFT", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.727, + "details": { + "description": "min=0.125, mean=0.727, max=0.975, sum=8 (11)", + "tab": "Accuracy", + "RAFT - ECE (10-bin)": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "RAFT - EM (Robustness)": "{\"description\": \"min=0.075, mean=0.673, max=0.975, sum=7.4 (11)\", \"tab\": \"Robustness\", \"score\": \"0.6727272727272727\"}", + "RAFT - EM (Fairness)": "{\"description\": \"min=0.1, mean=0.7, max=0.975, sum=7.7 (11)\", \"tab\": \"Fairness\", \"score\": \"0.7\"}", + "RAFT - Denoised inference time (s)": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "RAFT - # eval": "{\"description\": \"min=40, mean=40, max=40, sum=440 (11)\", \"tab\": \"General information\", \"score\": \"40.0\"}", + "RAFT - # train": "{\"description\": \"min=2.575, mean=4.78, max=5, sum=52.575 (11)\", \"tab\": \"General information\", \"score\": \"4.779545454545455\"}", + "RAFT - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (11)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "RAFT - # prompt tokens": "{\"description\": \"min=303.675, mean=1153.852, max=3623.9, sum=12692.375 (11)\", \"tab\": \"General information\", \"score\": \"1153.8522727272727\"}", + "RAFT - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=11 (11)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "RAFT - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=11 (11)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "RAFT - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Toxic fraction": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/models/meta_Llama-2-7B.json b/data/models/meta_Llama-2-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..24a0db928c0317c811fa124db7b58a75980fdbf3 --- /dev/null +++ b/data/models/meta_Llama-2-7B.json @@ -0,0 +1,674 @@ +{ + "model_info": { + "name": "Llama 2 7B", + "id": "meta/Llama-2-7B", + "developer": "meta", + "inference_platform": "unknown" + }, + "evaluations": [ + { + "evaluation_id": "helm_classic/meta_Llama-2-7B/1774096308.339228", + "retrieved_timestamp": "1774096308.339228", + "source_metadata": { + "source_name": "helm_classic", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "helm", + "version": "unknown" + }, + "benchmark": "helm_classic", + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.607, + "details": { + "description": "", + "tab": "Accuracy", + "Mean win rate - Calibration": "{\"description\": \"\", \"tab\": \"Calibration\", \"score\": \"\"}", + "Mean win rate - Robustness": "{\"description\": \"\", \"tab\": \"Robustness\", \"score\": \"0.6437529137529138\"}", + "Mean win rate - Fairness": "{\"description\": \"\", \"tab\": \"Fairness\", \"score\": \"0.6102097902097903\"}", + "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}", + "Mean win rate - Bias": "{\"description\": \"\", \"tab\": \"Bias\", \"score\": \"0.4576728062932413\"}", + "Mean win rate - Toxicity": "{\"description\": \"\", \"tab\": \"Toxicity\", \"score\": \"0.8121794871794872\"}", + "Mean win rate - Summarization metrics": "{\"description\": \"\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on MMLU", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.431, + "details": { + "description": "min=0.28, mean=0.431, max=0.64, sum=2.153 (5)", + "tab": "Accuracy", + "MMLU - ECE (10-bin)": "{\"description\": \"5 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "MMLU - EM (Robustness)": "{\"description\": \"min=0.22, mean=0.373, max=0.57, sum=1.866 (5)\", \"tab\": \"Robustness\", \"score\": \"0.37312280701754386\"}", + "MMLU - EM (Fairness)": "{\"description\": \"min=0.26, mean=0.392, max=0.59, sum=1.961 (5)\", \"tab\": \"Fairness\", \"score\": \"0.392140350877193\"}", + "MMLU - Denoised inference time (s)": "{\"description\": \"5 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", + "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU - # prompt tokens": "{\"description\": \"min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)\", \"tab\": \"General information\", \"score\": \"522.5470877192982\"}", + "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "MMLU - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on BoolQ", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.762, + "details": { + "description": "min=0.762, mean=0.762, max=0.762, sum=0.762 (1)", + "tab": "Accuracy", + "BoolQ - ECE (10-bin)": "{\"description\": \"min=0.215, mean=0.215, max=0.215, sum=0.215 (1)\", \"tab\": \"Calibration\", \"score\": \"\"}", + "BoolQ - EM (Robustness)": "{\"description\": \"min=0.676, mean=0.676, max=0.676, sum=0.676 (1)\", \"tab\": \"Robustness\", \"score\": \"0.676\"}", + "BoolQ - EM (Fairness)": "{\"description\": \"min=0.706, mean=0.706, max=0.706, sum=0.706 (1)\", \"tab\": \"Fairness\", \"score\": \"0.706\"}", + "BoolQ - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "BoolQ - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "BoolQ - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "BoolQ - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "BoolQ - # prompt tokens": "{\"description\": \"min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)\", \"tab\": \"General information\", \"score\": \"1439.447\"}", + "BoolQ - # output tokens": "{\"description\": \"min=1.296, mean=1.296, max=1.296, sum=1.296 (1)\", \"tab\": \"General information\", \"score\": \"1.296\"}", + "BoolQ - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "BoolQ - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Toxic fraction": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on NarrativeQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.691, + "details": { + "description": "min=0.691, mean=0.691, max=0.691, sum=0.691 (1)", + "tab": "Accuracy", + "NarrativeQA - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NarrativeQA - F1 (Robustness)": "{\"description\": \"min=0.573, mean=0.573, max=0.573, sum=0.573 (1)\", \"tab\": \"Robustness\", \"score\": \"0.5726018964106345\"}", + "NarrativeQA - F1 (Fairness)": "{\"description\": \"min=0.596, mean=0.596, max=0.596, sum=0.596 (1)\", \"tab\": \"Fairness\", \"score\": \"0.5960691234215144\"}", + "NarrativeQA - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", + "NarrativeQA - # train": "{\"description\": \"min=4.414, mean=4.414, max=4.414, sum=4.414 (1)\", \"tab\": \"General information\", \"score\": \"4.414084507042253\"}", + "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NarrativeQA - # prompt tokens": "{\"description\": \"min=3673.268, mean=3673.268, max=3673.268, sum=3673.268 (1)\", \"tab\": \"General information\", \"score\": \"3673.2676056338028\"}", + "NarrativeQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NarrativeQA - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NarrativeQA - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NarrativeQA - Stereotypes (gender)": "{\"description\": \"min=0.333, mean=0.333, max=0.333, sum=0.333 (1)\", \"tab\": \"Bias\", \"score\": \"0.3333333333333333\"}", + "NarrativeQA - Representation (race)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\", \"tab\": \"Bias\", \"score\": \"0.6666666666666667\"}", + "NarrativeQA - Representation (gender)": "{\"description\": \"min=0.203, mean=0.203, max=0.203, sum=0.203 (1)\", \"tab\": \"Bias\", \"score\": \"0.20348837209302328\"}", + "NarrativeQA - Toxic fraction": "{\"description\": \"min=0.011, mean=0.011, max=0.011, sum=0.011 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.011267605633802818\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on NaturalQuestions (open-book)", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.611, + "details": { + "description": "min=0.611, mean=0.611, max=0.611, sum=0.611 (1)", + "tab": "Accuracy", + "NaturalQuestions (closed-book) - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NaturalQuestions (open-book) - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - F1 (Robustness)": "{\"description\": \"min=0.261, mean=0.261, max=0.261, sum=0.261 (1)\", \"tab\": \"Robustness\", \"score\": \"0.2606038875824225\"}", + "NaturalQuestions (open-book) - F1 (Robustness)": "{\"description\": \"min=0.501, mean=0.501, max=0.501, sum=0.501 (1)\", \"tab\": \"Robustness\", \"score\": \"0.5010811862440044\"}", + "NaturalQuestions (closed-book) - F1 (Fairness)": "{\"description\": \"min=0.264, mean=0.264, max=0.264, sum=0.264 (1)\", \"tab\": \"Fairness\", \"score\": \"0.26403309290317406\"}", + "NaturalQuestions (open-book) - F1 (Fairness)": "{\"description\": \"min=0.55, mean=0.55, max=0.55, sum=0.55 (1)\", \"tab\": \"Fairness\", \"score\": \"0.5499198184166533\"}", + "NaturalQuestions (closed-book) - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NaturalQuestions (open-book) - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=137.383, mean=137.383, max=137.383, sum=137.383 (1)\", \"tab\": \"General information\", \"score\": \"137.383\"}", + "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=0.998, mean=0.998, max=0.998, sum=0.998 (1)\", \"tab\": \"General information\", \"score\": \"0.998\"}", + "NaturalQuestions (closed-book) - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (open-book) - # train": "{\"description\": \"min=4.831, mean=4.831, max=4.831, sum=4.831 (1)\", \"tab\": \"General information\", \"score\": \"4.831\"}", + "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0.026, mean=0.026, max=0.026, sum=0.026 (1)\", \"tab\": \"General information\", \"score\": \"0.026\"}", + "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=2289.409, mean=2289.409, max=2289.409, sum=2289.409 (1)\", \"tab\": \"General information\", \"score\": \"2289.409\"}", + "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=0.955, mean=0.955, max=0.955, sum=0.955 (1)\", \"tab\": \"General information\", \"score\": \"0.955\"}", + "NaturalQuestions (open-book) - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (closed-book) - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - Stereotypes (gender)": "{\"description\": \"min=0.5, mean=0.5, max=0.5, sum=0.5 (1)\", \"tab\": \"Bias\", \"score\": \"0.5\"}", + "NaturalQuestions (closed-book) - Representation (race)": "{\"description\": \"min=0.381, mean=0.381, max=0.381, sum=0.381 (1)\", \"tab\": \"Bias\", \"score\": \"0.38095238095238093\"}", + "NaturalQuestions (closed-book) - Representation (gender)": "{\"description\": \"min=0.182, mean=0.182, max=0.182, sum=0.182 (1)\", \"tab\": \"Bias\", \"score\": \"0.18181818181818182\"}", + "NaturalQuestions (open-book) - Stereotypes (race)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\", \"tab\": \"Bias\", \"score\": \"0.6666666666666667\"}", + "NaturalQuestions (open-book) - Stereotypes (gender)": "{\"description\": \"min=0.5, mean=0.5, max=0.5, sum=0.5 (1)\", \"tab\": \"Bias\", \"score\": \"0.5\"}", + "NaturalQuestions (open-book) - Representation (race)": "{\"description\": \"min=0.577, mean=0.577, max=0.577, sum=0.577 (1)\", \"tab\": \"Bias\", \"score\": \"0.5770114942528735\"}", + "NaturalQuestions (open-book) - Representation (gender)": "{\"description\": \"min=0.486, mean=0.486, max=0.486, sum=0.486 (1)\", \"tab\": \"Bias\", \"score\": \"0.48630136986301375\"}", + "NaturalQuestions (closed-book) - Toxic fraction": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.0\"}", + "NaturalQuestions (open-book) - Toxic fraction": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on QuAC", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.406, + "details": { + "description": "min=0.406, mean=0.406, max=0.406, sum=0.406 (1)", + "tab": "Accuracy", + "QuAC - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "QuAC - F1 (Robustness)": "{\"description\": \"min=0.271, mean=0.271, max=0.271, sum=0.271 (1)\", \"tab\": \"Robustness\", \"score\": \"0.27069315379336467\"}", + "QuAC - F1 (Fairness)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.321 (1)\", \"tab\": \"Fairness\", \"score\": \"0.32122644280851614\"}", + "QuAC - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "QuAC - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "QuAC - # train": "{\"description\": \"min=3.204, mean=3.204, max=3.204, sum=3.204 (1)\", \"tab\": \"General information\", \"score\": \"3.204\"}", + "QuAC - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "QuAC - # prompt tokens": "{\"description\": \"min=3617.038, mean=3617.038, max=3617.038, sum=3617.038 (1)\", \"tab\": \"General information\", \"score\": \"3617.038\"}", + "QuAC - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "QuAC - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "QuAC - Stereotypes (race)": "{\"description\": \"min=0.583, mean=0.583, max=0.583, sum=0.583 (1)\", \"tab\": \"Bias\", \"score\": \"0.5833333333333334\"}", + "QuAC - Stereotypes (gender)": "{\"description\": \"min=0.426, mean=0.426, max=0.426, sum=0.426 (1)\", \"tab\": \"Bias\", \"score\": \"0.4264652792029702\"}", + "QuAC - Representation (race)": "{\"description\": \"min=0.283, mean=0.283, max=0.283, sum=0.283 (1)\", \"tab\": \"Bias\", \"score\": \"0.2831541218637993\"}", + "QuAC - Representation (gender)": "{\"description\": \"min=0.231, mean=0.231, max=0.231, sum=0.231 (1)\", \"tab\": \"Bias\", \"score\": \"0.23093681917211328\"}", + "QuAC - Toxic fraction": "{\"description\": \"min=0.001, mean=0.001, max=0.001, sum=0.001 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.001\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on HellaSwag", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "HellaSwag - ECE (10-bin)": "{\"description\": \"No matching runs\", \"tab\": \"Calibration\", \"score\": \"\"}", + "HellaSwag - EM (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "HellaSwag - EM (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "HellaSwag - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "HellaSwag - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on OpenbookQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "OpenbookQA - ECE (10-bin)": "{\"description\": \"No matching runs\", \"tab\": \"Calibration\", \"score\": \"\"}", + "OpenbookQA - EM (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "OpenbookQA - EM (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "OpenbookQA - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "OpenbookQA - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on TruthfulQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.272, + "details": { + "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)", + "tab": "Accuracy", + "TruthfulQA - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "TruthfulQA - EM (Robustness)": "{\"description\": \"min=0.234, mean=0.234, max=0.234, sum=0.234 (1)\", \"tab\": \"Robustness\", \"score\": \"0.23394495412844038\"}", + "TruthfulQA - EM (Fairness)": "{\"description\": \"min=0.223, mean=0.223, max=0.223, sum=0.223 (1)\", \"tab\": \"Fairness\", \"score\": \"0.22324159021406728\"}", + "TruthfulQA - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "TruthfulQA - # eval": "{\"description\": \"min=654, mean=654, max=654, sum=654 (1)\", \"tab\": \"General information\", \"score\": \"654.0\"}", + "TruthfulQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "TruthfulQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "TruthfulQA - # prompt tokens": "{\"description\": \"min=524.602, mean=524.602, max=524.602, sum=524.602 (1)\", \"tab\": \"General information\", \"score\": \"524.6024464831804\"}", + "TruthfulQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "TruthfulQA - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "MS MARCO (regular) - RR@10 (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "MS MARCO (TREC) - NDCG@10 (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "MS MARCO (regular) - RR@10 (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "MS MARCO (TREC) - NDCG@10 (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "MS MARCO (regular) - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MS MARCO (TREC) - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MS MARCO (regular) - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "MS MARCO (TREC) - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "ROUGE-2 on CNN/DailyMail", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "CNN/DailyMail - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "CNN/DailyMail - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "CNN/DailyMail - SummaC": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - QAFactEval": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - BERTScore (F1)": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Coverage": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Density": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Compression": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-faithfulness": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-relevance": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-coherence": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "ROUGE-2 on XSUM", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "XSUM - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "XSUM - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "XSUM - SummaC": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - QAFactEval": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - BERTScore (F1)": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Coverage": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Density": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Compression": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-faithfulness": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-relevance": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-coherence": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on IMDB", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.907, + "details": { + "description": "min=0.907, mean=0.907, max=0.907, sum=0.907 (1)", + "tab": "Accuracy", + "IMDB - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "IMDB - EM (Robustness)": "{\"description\": \"min=0.808, mean=0.808, max=0.808, sum=0.808 (1)\", \"tab\": \"Robustness\", \"score\": \"0.808\"}", + "IMDB - EM (Fairness)": "{\"description\": \"min=0.871, mean=0.871, max=0.871, sum=0.871 (1)\", \"tab\": \"Fairness\", \"score\": \"0.871\"}", + "IMDB - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "IMDB - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "IMDB - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "IMDB - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "IMDB - # prompt tokens": "{\"description\": \"min=2897.409, mean=2897.409, max=2897.409, sum=2897.409 (1)\", \"tab\": \"General information\", \"score\": \"2897.409\"}", + "IMDB - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "IMDB - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "IMDB - Stereotypes (race)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Stereotypes (gender)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Representation (race)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Representation (gender)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Toxic fraction": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on CivilComments", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.562, + "details": { + "description": "min=0.025, mean=0.562, max=1, sum=10.108 (18)", + "tab": "Accuracy", + "CivilComments - ECE (10-bin)": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "CivilComments - EM (Robustness)": "{\"description\": \"min=0.025, mean=0.516, max=0.989, sum=9.28 (18)\", \"tab\": \"Robustness\", \"score\": \"0.5155612610622284\"}", + "CivilComments - EM (Fairness)": "{\"description\": \"min=0.01, mean=0.503, max=0.998, sum=9.057 (18)\", \"tab\": \"Fairness\", \"score\": \"0.5031757189564859\"}", + "CivilComments - Denoised inference time (s)": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "CivilComments - # eval": "{\"description\": \"min=74, mean=371.556, max=683, sum=6688 (18)\", \"tab\": \"General information\", \"score\": \"371.55555555555554\"}", + "CivilComments - # train": "{\"description\": \"min=5, mean=5, max=5, sum=90 (18)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "CivilComments - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (18)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "CivilComments - # prompt tokens": "{\"description\": \"min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)\", \"tab\": \"General information\", \"score\": \"855.2410378605821\"}", + "CivilComments - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=18 (18)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "CivilComments - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=18 (18)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "CivilComments - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Toxic fraction": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on RAFT", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.643, + "details": { + "description": "min=0.125, mean=0.643, max=0.95, sum=7.075 (11)", + "tab": "Accuracy", + "RAFT - ECE (10-bin)": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "RAFT - EM (Robustness)": "{\"description\": \"min=0.05, mean=0.573, max=0.875, sum=6.3 (11)\", \"tab\": \"Robustness\", \"score\": \"0.5727272727272728\"}", + "RAFT - EM (Fairness)": "{\"description\": \"min=0.1, mean=0.609, max=0.95, sum=6.7 (11)\", \"tab\": \"Fairness\", \"score\": \"0.6090909090909092\"}", + "RAFT - Denoised inference time (s)": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "RAFT - # eval": "{\"description\": \"min=40, mean=40, max=40, sum=440 (11)\", \"tab\": \"General information\", \"score\": \"40.0\"}", + "RAFT - # train": "{\"description\": \"min=2.575, mean=4.78, max=5, sum=52.575 (11)\", \"tab\": \"General information\", \"score\": \"4.779545454545455\"}", + "RAFT - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (11)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "RAFT - # prompt tokens": "{\"description\": \"min=303.675, mean=1153.852, max=3623.9, sum=12692.375 (11)\", \"tab\": \"General information\", \"score\": \"1153.8522727272727\"}", + "RAFT - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=11 (11)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "RAFT - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=11 (11)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "RAFT - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Toxic fraction": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/meta_OPT-175B.json b/data/models/meta_OPT-175B.json similarity index 100% rename from data/meta_OPT-175B.json rename to data/models/meta_OPT-175B.json diff --git a/data/meta_OPT-66B.json b/data/models/meta_OPT-66B.json similarity index 100% rename from data/meta_OPT-66B.json rename to data/models/meta_OPT-66B.json diff --git a/data/meta_llama-2-13b.json b/data/models/meta_llama-2-13b.json similarity index 100% rename from data/meta_llama-2-13b.json rename to data/models/meta_llama-2-13b.json diff --git a/data/meta_llama-2-70b.json b/data/models/meta_llama-2-70b.json similarity index 100% rename from data/meta_llama-2-70b.json rename to data/models/meta_llama-2-70b.json diff --git a/data/meta_llama-2-7b.json b/data/models/meta_llama-2-7b.json similarity index 100% rename from data/meta_llama-2-7b.json rename to data/models/meta_llama-2-7b.json diff --git a/data/meta_llama-3-70b.json b/data/models/meta_llama-3-70b.json similarity index 100% rename from data/meta_llama-3-70b.json rename to data/models/meta_llama-3-70b.json diff --git a/data/meta_llama-3-8b.json b/data/models/meta_llama-3-8b.json similarity index 100% rename from data/meta_llama-3-8b.json rename to data/models/meta_llama-3-8b.json diff --git a/data/meta_llama-3.1-405b-instruct-turbo.json b/data/models/meta_llama-3.1-405b-instruct-turbo.json similarity index 100% rename from data/meta_llama-3.1-405b-instruct-turbo.json rename to data/models/meta_llama-3.1-405b-instruct-turbo.json diff --git a/data/meta_llama-3.1-70b-instruct-turbo.json b/data/models/meta_llama-3.1-70b-instruct-turbo.json similarity index 100% rename from data/meta_llama-3.1-70b-instruct-turbo.json rename to data/models/meta_llama-3.1-70b-instruct-turbo.json diff --git a/data/meta_llama-3.1-8b-instruct-turbo.json b/data/models/meta_llama-3.1-8b-instruct-turbo.json similarity index 100% rename from data/meta_llama-3.1-8b-instruct-turbo.json rename to data/models/meta_llama-3.1-8b-instruct-turbo.json diff --git a/data/meta_llama-3.2-11b-vision-instruct-turbo.json b/data/models/meta_llama-3.2-11b-vision-instruct-turbo.json similarity index 100% rename from data/meta_llama-3.2-11b-vision-instruct-turbo.json rename to data/models/meta_llama-3.2-11b-vision-instruct-turbo.json diff --git a/data/meta_llama-3.2-90b-vision-instruct-turbo.json b/data/models/meta_llama-3.2-90b-vision-instruct-turbo.json similarity index 100% rename from data/meta_llama-3.2-90b-vision-instruct-turbo.json rename to data/models/meta_llama-3.2-90b-vision-instruct-turbo.json diff --git a/data/meta_llama-3.3-70b-instruct-turbo.json b/data/models/meta_llama-3.3-70b-instruct-turbo.json similarity index 100% rename from data/meta_llama-3.3-70b-instruct-turbo.json rename to data/models/meta_llama-3.3-70b-instruct-turbo.json diff --git a/data/meta_llama-4-maverick-17b-128e-instruct-fp8.json b/data/models/meta_llama-4-maverick-17b-128e-instruct-fp8.json similarity index 100% rename from data/meta_llama-4-maverick-17b-128e-instruct-fp8.json rename to data/models/meta_llama-4-maverick-17b-128e-instruct-fp8.json diff --git a/data/meta_llama-4-maverick.json b/data/models/meta_llama-4-maverick.json similarity index 100% rename from data/meta_llama-4-maverick.json rename to data/models/meta_llama-4-maverick.json diff --git a/data/meta_llama-4-scout-17b-16e-instruct.json b/data/models/meta_llama-4-scout-17b-16e-instruct.json similarity index 100% rename from data/meta_llama-4-scout-17b-16e-instruct.json rename to data/models/meta_llama-4-scout-17b-16e-instruct.json diff --git a/data/meta_llama-65b.json b/data/models/meta_llama-65b.json similarity index 100% rename from data/meta_llama-65b.json rename to data/models/meta_llama-65b.json diff --git a/data/mhl1_Qwen2.5-0.5B-cinstruct-stage1.json b/data/models/mhl1_Qwen2.5-0.5B-cinstruct-stage1.json similarity index 100% rename from data/mhl1_Qwen2.5-0.5B-cinstruct-stage1.json rename to data/models/mhl1_Qwen2.5-0.5B-cinstruct-stage1.json diff --git a/data/microsoft_DialoGPT-medium.json b/data/models/microsoft_DialoGPT-medium.json similarity index 100% rename from data/microsoft_DialoGPT-medium.json rename to data/models/microsoft_DialoGPT-medium.json diff --git a/data/microsoft_Orca-2-13b.json b/data/models/microsoft_Orca-2-13b.json similarity index 100% rename from data/microsoft_Orca-2-13b.json rename to data/models/microsoft_Orca-2-13b.json diff --git a/data/microsoft_Orca-2-7b.json b/data/models/microsoft_Orca-2-7b.json similarity index 100% rename from data/microsoft_Orca-2-7b.json rename to data/models/microsoft_Orca-2-7b.json diff --git a/data/microsoft_Phi-3-medium-128k-instruct.json b/data/models/microsoft_Phi-3-medium-128k-instruct.json similarity index 100% rename from data/microsoft_Phi-3-medium-128k-instruct.json rename to data/models/microsoft_Phi-3-medium-128k-instruct.json diff --git a/data/models/microsoft_Phi-3-medium-4k-instruct.json b/data/models/microsoft_Phi-3-medium-4k-instruct.json new file mode 100644 index 0000000000000000000000000000000000000000..86a283d1f9a620e2d680cd37354519da11358252 --- /dev/null +++ b/data/models/microsoft_Phi-3-medium-4k-instruct.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "Phi-3-medium-4k-instruct", + "id": "microsoft/Phi-3-medium-4k-instruct", + "developer": "microsoft", + "inference_platform": "unknown", + "additional_details": { + "precision": "bfloat16", + "architecture": "Phi3ForCausalLM", + "params_billions": "13.96" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/microsoft_Phi-3-medium-4k-instruct/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6423 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6412 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.1956 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3364 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4258 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4676 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/microsoft_Phi-3-mini-128k-instruct.json b/data/models/microsoft_Phi-3-mini-128k-instruct.json similarity index 100% rename from data/microsoft_Phi-3-mini-128k-instruct.json rename to data/models/microsoft_Phi-3-mini-128k-instruct.json diff --git a/data/microsoft_Phi-3-mini-4k-instruct.json b/data/models/microsoft_Phi-3-mini-4k-instruct.json similarity index 100% rename from data/microsoft_Phi-3-mini-4k-instruct.json rename to data/models/microsoft_Phi-3-mini-4k-instruct.json diff --git a/data/microsoft_Phi-3-small-128k-instruct.json b/data/models/microsoft_Phi-3-small-128k-instruct.json similarity index 100% rename from data/microsoft_Phi-3-small-128k-instruct.json rename to data/models/microsoft_Phi-3-small-128k-instruct.json diff --git a/data/models/microsoft_Phi-3-small-8k-instruct.json b/data/models/microsoft_Phi-3-small-8k-instruct.json new file mode 100644 index 0000000000000000000000000000000000000000..05f5482cbfb2f1190b21b1f810a1e11dc92b19c0 --- /dev/null +++ b/data/models/microsoft_Phi-3-small-8k-instruct.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "Phi-3-small-8k-instruct", + "id": "microsoft/Phi-3-small-8k-instruct", + "developer": "microsoft", + "inference_platform": "unknown", + "additional_details": { + "precision": "bfloat16", + "architecture": "Phi3SmallForCausalLM", + "params_billions": "7.392" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/microsoft_Phi-3-small-8k-instruct/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6497 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6208 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.1887 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3121 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4558 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4506 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/microsoft_Phi-3.5-MoE-instruct.json b/data/models/microsoft_Phi-3.5-MoE-instruct.json similarity index 100% rename from data/microsoft_Phi-3.5-MoE-instruct.json rename to data/models/microsoft_Phi-3.5-MoE-instruct.json diff --git a/data/microsoft_Phi-3.5-mini-instruct.json b/data/models/microsoft_Phi-3.5-mini-instruct.json similarity index 100% rename from data/microsoft_Phi-3.5-mini-instruct.json rename to data/models/microsoft_Phi-3.5-mini-instruct.json diff --git a/data/microsoft_Phi-4-mini-instruct.json b/data/models/microsoft_Phi-4-mini-instruct.json similarity index 100% rename from data/microsoft_Phi-4-mini-instruct.json rename to data/models/microsoft_Phi-4-mini-instruct.json diff --git a/data/microsoft_TNLG-v2-530B.json b/data/models/microsoft_TNLG-v2-530B.json similarity index 100% rename from data/microsoft_TNLG-v2-530B.json rename to data/models/microsoft_TNLG-v2-530B.json diff --git a/data/microsoft_TNLG-v2-6.7B.json b/data/models/microsoft_TNLG-v2-6.7B.json similarity index 100% rename from data/microsoft_TNLG-v2-6.7B.json rename to data/models/microsoft_TNLG-v2-6.7B.json diff --git a/data/microsoft_phi-1.json b/data/models/microsoft_phi-1.json similarity index 100% rename from data/microsoft_phi-1.json rename to data/models/microsoft_phi-1.json diff --git a/data/microsoft_phi-1_5.json b/data/models/microsoft_phi-1_5.json similarity index 100% rename from data/microsoft_phi-1_5.json rename to data/models/microsoft_phi-1_5.json diff --git a/data/microsoft_phi-2.json b/data/models/microsoft_phi-2.json similarity index 100% rename from data/microsoft_phi-2.json rename to data/models/microsoft_phi-2.json diff --git a/data/microsoft_phi-3-medium-4k-instruct.json b/data/models/microsoft_phi-3-medium-4k-instruct.json similarity index 100% rename from data/microsoft_phi-3-medium-4k-instruct.json rename to data/models/microsoft_phi-3-medium-4k-instruct.json diff --git a/data/microsoft_phi-3-small-8k-instruct.json b/data/models/microsoft_phi-3-small-8k-instruct.json similarity index 100% rename from data/microsoft_phi-3-small-8k-instruct.json rename to data/models/microsoft_phi-3-small-8k-instruct.json diff --git a/data/microsoft_phi-4.json b/data/models/microsoft_phi-4.json similarity index 99% rename from data/microsoft_phi-4.json rename to data/models/microsoft_phi-4.json index 97f523bcb3123f3c15262b578f3a90bbfd2182ab..b963af9da3550d27dbd1c016591354645798b294 100644 --- a/data/microsoft_phi-4.json +++ b/data/models/microsoft_phi-4.json @@ -5,7 +5,7 @@ "developer": "microsoft", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "Phi3ForCausalLM", "params_billions": "14.66" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0585 + "score": 0.0488 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6691 + "score": 0.6703 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3165 + "score": 0.2787 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.406 + "score": 0.401 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5287 + "score": 0.5295 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0488 + "score": 0.0585 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6703 + "score": 0.6691 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2787 + "score": 0.3165 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.401 + "score": 0.406 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5295 + "score": 0.5287 } } ], diff --git a/data/mightbe_Better-PairRM.json b/data/models/mightbe_Better-PairRM.json similarity index 100% rename from data/mightbe_Better-PairRM.json rename to data/models/mightbe_Better-PairRM.json diff --git a/data/migtissera_Llama-3-70B-Synthia-v3.5.json b/data/models/migtissera_Llama-3-70B-Synthia-v3.5.json similarity index 100% rename from data/migtissera_Llama-3-70B-Synthia-v3.5.json rename to data/models/migtissera_Llama-3-70B-Synthia-v3.5.json diff --git a/data/migtissera_Llama-3-8B-Synthia-v3.5.json b/data/models/migtissera_Llama-3-8B-Synthia-v3.5.json similarity index 100% rename from data/migtissera_Llama-3-8B-Synthia-v3.5.json rename to data/models/migtissera_Llama-3-8B-Synthia-v3.5.json diff --git a/data/migtissera_Tess-3-7B-SFT.json b/data/models/migtissera_Tess-3-7B-SFT.json similarity index 100% rename from data/migtissera_Tess-3-7B-SFT.json rename to data/models/migtissera_Tess-3-7B-SFT.json diff --git a/data/migtissera_Tess-3-Mistral-Nemo-12B.json b/data/models/migtissera_Tess-3-Mistral-Nemo-12B.json similarity index 100% rename from data/migtissera_Tess-3-Mistral-Nemo-12B.json rename to data/models/migtissera_Tess-3-Mistral-Nemo-12B.json diff --git a/data/migtissera_Tess-v2.5-Phi-3-medium-128k-14B.json b/data/models/migtissera_Tess-v2.5-Phi-3-medium-128k-14B.json similarity index 100% rename from data/migtissera_Tess-v2.5-Phi-3-medium-128k-14B.json rename to data/models/migtissera_Tess-v2.5-Phi-3-medium-128k-14B.json diff --git a/data/migtissera_Tess-v2.5.2-Qwen2-72B.json b/data/models/migtissera_Tess-v2.5.2-Qwen2-72B.json similarity index 100% rename from data/migtissera_Tess-v2.5.2-Qwen2-72B.json rename to data/models/migtissera_Tess-v2.5.2-Qwen2-72B.json diff --git a/data/migtissera_Trinity-2-Codestral-22B-v0.2.json b/data/models/migtissera_Trinity-2-Codestral-22B-v0.2.json similarity index 99% rename from data/migtissera_Trinity-2-Codestral-22B-v0.2.json rename to data/models/migtissera_Trinity-2-Codestral-22B-v0.2.json index 679817b0f46278305e51ca89bfbcb28c81768bf7..e73a7af4d30076cd08273bf7149a3e040e29d637 100644 --- a/data/migtissera_Trinity-2-Codestral-22B-v0.2.json +++ b/data/models/migtissera_Trinity-2-Codestral-22B-v0.2.json @@ -5,7 +5,7 @@ "developer": "migtissera", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "MistralForCausalLM", "params_billions": "22.247" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.443 + "score": 0.4345 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5706 + "score": 0.5686 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0869 + "score": 0.0838 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3079 + "score": 0.3003 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4031 + "score": 0.4045 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3354 + "score": 0.334 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4345 + "score": 0.443 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5686 + "score": 0.5706 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0838 + "score": 0.0869 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3003 + "score": 0.3079 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4045 + "score": 0.4031 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.334 + "score": 0.3354 } } ], diff --git a/data/migtissera_Trinity-2-Codestral-22B.json b/data/models/migtissera_Trinity-2-Codestral-22B.json similarity index 100% rename from data/migtissera_Trinity-2-Codestral-22B.json rename to data/models/migtissera_Trinity-2-Codestral-22B.json diff --git a/data/mindw96_DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3.json b/data/models/mindw96_DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3.json similarity index 100% rename from data/mindw96_DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3.json rename to data/models/mindw96_DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3.json diff --git a/data/minghaowu_Qwen1.5-1.8B-OpenHermes-2.5.json b/data/models/minghaowu_Qwen1.5-1.8B-OpenHermes-2.5.json similarity index 100% rename from data/minghaowu_Qwen1.5-1.8B-OpenHermes-2.5.json rename to data/models/minghaowu_Qwen1.5-1.8B-OpenHermes-2.5.json diff --git a/data/minimax_Minimax-2.5.json b/data/models/minimax_Minimax-2.5.json similarity index 100% rename from data/minimax_Minimax-2.5.json rename to data/models/minimax_Minimax-2.5.json diff --git a/data/minimax_minimax-m2.1.json b/data/models/minimax_minimax-m2.1.json similarity index 100% rename from data/minimax_minimax-m2.1.json rename to data/models/minimax_minimax-m2.1.json diff --git a/data/minimax_minimax-m2.5.json b/data/models/minimax_minimax-m2.5.json similarity index 100% rename from data/minimax_minimax-m2.5.json rename to data/models/minimax_minimax-m2.5.json diff --git a/data/minimax_minimax-m2.json b/data/models/minimax_minimax-m2.json similarity index 100% rename from data/minimax_minimax-m2.json rename to data/models/minimax_minimax-m2.json diff --git a/data/ministral_Ministral-3b-instruct.json b/data/models/ministral_Ministral-3b-instruct.json similarity index 100% rename from data/ministral_Ministral-3b-instruct.json rename to data/models/ministral_Ministral-3b-instruct.json diff --git a/data/mistral-community_Mistral-7B-v0.2.json b/data/models/mistral-community_Mistral-7B-v0.2.json similarity index 100% rename from data/mistral-community_Mistral-7B-v0.2.json rename to data/models/mistral-community_Mistral-7B-v0.2.json diff --git a/data/mistral-community_Mixtral-8x22B-v0.1.json b/data/models/mistral-community_Mixtral-8x22B-v0.1.json similarity index 100% rename from data/mistral-community_Mixtral-8x22B-v0.1.json rename to data/models/mistral-community_Mixtral-8x22B-v0.1.json diff --git a/data/mistral-community_mixtral-8x22B-v0.3.json b/data/models/mistral-community_mixtral-8x22B-v0.3.json similarity index 100% rename from data/mistral-community_mixtral-8x22B-v0.3.json rename to data/models/mistral-community_mixtral-8x22B-v0.3.json diff --git a/data/mistralai_Codestral-22B-v0.1.json b/data/models/mistralai_Codestral-22B-v0.1.json similarity index 100% rename from data/mistralai_Codestral-22B-v0.1.json rename to data/models/mistralai_Codestral-22B-v0.1.json diff --git a/data/mistralai_Ministral-8B-Instruct-2410.json b/data/models/mistralai_Ministral-8B-Instruct-2410.json similarity index 100% rename from data/mistralai_Ministral-8B-Instruct-2410.json rename to data/models/mistralai_Ministral-8B-Instruct-2410.json diff --git a/data/mistralai_Mistral-7B-Instruct-v0.1.json b/data/models/mistralai_Mistral-7B-Instruct-v0.1.json similarity index 100% rename from data/mistralai_Mistral-7B-Instruct-v0.1.json rename to data/models/mistralai_Mistral-7B-Instruct-v0.1.json diff --git a/data/mistralai_Mistral-7B-Instruct-v0.2.json b/data/models/mistralai_Mistral-7B-Instruct-v0.2.json similarity index 100% rename from data/mistralai_Mistral-7B-Instruct-v0.2.json rename to data/models/mistralai_Mistral-7B-Instruct-v0.2.json diff --git a/data/models/mistralai_Mistral-7B-Instruct-v0.3.json b/data/models/mistralai_Mistral-7B-Instruct-v0.3.json new file mode 100644 index 0000000000000000000000000000000000000000..4828a8992526857217dcd5ee86d7e532f0cb9552 --- /dev/null +++ b/data/models/mistralai_Mistral-7B-Instruct-v0.3.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "Mistral-7B-Instruct-v0.3", + "id": "mistralai/Mistral-7B-Instruct-v0.3", + "developer": "mistralai", + "inference_platform": "unknown", + "additional_details": { + "precision": "bfloat16", + "architecture": "MistralForCausalLM", + "params_billions": "7.248" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/mistralai_Mistral-7B-Instruct-v0.3/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5465 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4722 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.0385 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2794 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3739 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3075 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/models/mistralai_Mistral-7B-v0.1.json b/data/models/mistralai_Mistral-7B-v0.1.json new file mode 100644 index 0000000000000000000000000000000000000000..d882b8352784f419c20777a6e687bd076cdfbfe8 --- /dev/null +++ b/data/models/mistralai_Mistral-7B-v0.1.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "Mistral-7B-v0.1", + "id": "mistralai/Mistral-7B-v0.1", + "developer": "mistralai", + "inference_platform": "unknown", + "additional_details": { + "precision": "bfloat16", + "architecture": "MistralForCausalLM", + "params_billions": "7.242" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/mistralai_Mistral-7B-v0.1/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2386 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4419 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.0295 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2919 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4139 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3013 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/mistralai_Mistral-7B-v0.3.json b/data/models/mistralai_Mistral-7B-v0.3.json similarity index 100% rename from data/mistralai_Mistral-7B-v0.3.json rename to data/models/mistralai_Mistral-7B-v0.3.json diff --git a/data/mistralai_Mistral-Large-Instruct-2411.json b/data/models/mistralai_Mistral-Large-Instruct-2411.json similarity index 100% rename from data/mistralai_Mistral-Large-Instruct-2411.json rename to data/models/mistralai_Mistral-Large-Instruct-2411.json diff --git a/data/mistralai_Mistral-Nemo-Base-2407.json b/data/models/mistralai_Mistral-Nemo-Base-2407.json similarity index 100% rename from data/mistralai_Mistral-Nemo-Base-2407.json rename to data/models/mistralai_Mistral-Nemo-Base-2407.json diff --git a/data/mistralai_Mistral-Nemo-Instruct-2407.json b/data/models/mistralai_Mistral-Nemo-Instruct-2407.json similarity index 100% rename from data/mistralai_Mistral-Nemo-Instruct-2407.json rename to data/models/mistralai_Mistral-Nemo-Instruct-2407.json diff --git a/data/mistralai_Mistral-Small-24B-Base-2501.json b/data/models/mistralai_Mistral-Small-24B-Base-2501.json similarity index 100% rename from data/mistralai_Mistral-Small-24B-Base-2501.json rename to data/models/mistralai_Mistral-Small-24B-Base-2501.json diff --git a/data/mistralai_Mistral-Small-Instruct-2409.json b/data/models/mistralai_Mistral-Small-Instruct-2409.json similarity index 99% rename from data/mistralai_Mistral-Small-Instruct-2409.json rename to data/models/mistralai_Mistral-Small-Instruct-2409.json index 47e5ec0e25350520e2e804f343fac80e0c508bf2..a990c0f86c3e83a9f32812582fcc413a5ada5107 100644 --- a/data/mistralai_Mistral-Small-Instruct-2409.json +++ b/data/models/mistralai_Mistral-Small-Instruct-2409.json @@ -5,9 +5,9 @@ "developer": "mistralai", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "MistralForCausalLM", - "params_billions": "22.05" + "params_billions": "22.247" } }, "evaluations": [ @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6283 + "score": 0.667 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.583 + "score": 0.5213 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2039 + "score": 0.1435 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3331 + "score": 0.3238 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4063 + "score": 0.3632 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4099 + "score": 0.396 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.667 + "score": 0.6283 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5213 + "score": 0.583 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1435 + "score": 0.2039 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3238 + "score": 0.3331 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3632 + "score": 0.4063 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.396 + "score": 0.4099 } } ], diff --git a/data/mistralai_Mistral-v0.1-7B.json b/data/models/mistralai_Mistral-v0.1-7B.json similarity index 100% rename from data/mistralai_Mistral-v0.1-7B.json rename to data/models/mistralai_Mistral-v0.1-7B.json diff --git a/data/models/mistralai_Mixtral-8x22B-Instruct-v0.1.json b/data/models/mistralai_Mixtral-8x22B-Instruct-v0.1.json new file mode 100644 index 0000000000000000000000000000000000000000..67466e3a9312b9435f93e0e3e6c8cba1504b6964 --- /dev/null +++ b/data/models/mistralai_Mixtral-8x22B-Instruct-v0.1.json @@ -0,0 +1,145 @@ +{ + "model_info": { + "name": "Mixtral-8x22B-Instruct-v0.1", + "id": "mistralai/Mixtral-8x22B-Instruct-v0.1", + "developer": "mistralai", + "inference_platform": "unknown", + "additional_details": { + "precision": "bfloat16", + "architecture": "MixtralForCausalLM", + "params_billions": "140.621" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/mistralai_Mixtral-8x22B-Instruct-v0.1/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7184 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6125 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.1873 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3733 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4311 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4483 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/mistralai_Mixtral-8x22B-v0.1.json b/data/models/mistralai_Mixtral-8x22B-v0.1.json similarity index 100% rename from data/mistralai_Mixtral-8x22B-v0.1.json rename to data/models/mistralai_Mixtral-8x22B-v0.1.json diff --git a/data/models/mistralai_Mixtral-8x7B-Instruct-v0.1.json b/data/models/mistralai_Mixtral-8x7B-Instruct-v0.1.json new file mode 100644 index 0000000000000000000000000000000000000000..ed7191e0b214f860560f0543004438c541e27686 --- /dev/null +++ b/data/models/mistralai_Mixtral-8x7B-Instruct-v0.1.json @@ -0,0 +1,274 @@ +{ + "model_info": { + "name": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "id": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "developer": "mistralai", + "additional_details": { + "model_type": "DPO" + } + }, + "evaluations": [ + { + "evaluation_id": "hfopenllm_v2/mistralai_Mixtral-8x7B-Instruct-v0.1/1773936498.240187", + "retrieved_timestamp": "1773936498.240187", + "source_metadata": { + "source_name": "HF Open LLM v2", + "source_type": "documentation", + "source_organization_name": "Hugging Face", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "lm-evaluation-harness", + "version": "0.4.0", + "additional_details": { + "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + } + }, + "benchmark": "hfopenllm_v2", + "evaluation_results": [ + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "hf_dataset", + "hf_repo": "google/IFEval" + }, + "metric_config": { + "evaluation_description": "Accuracy on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5599 + } + }, + { + "evaluation_name": "BBH", + "source_data": { + "dataset_name": "BBH", + "source_type": "hf_dataset", + "hf_repo": "SaylorTwift/bbh" + }, + "metric_config": { + "evaluation_description": "Accuracy on BBH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4962 + } + }, + { + "evaluation_name": "MATH Level 5", + "source_data": { + "dataset_name": "MATH Level 5", + "source_type": "hf_dataset", + "hf_repo": "DigitalLearningGmbH/MATH-lighteval" + }, + "metric_config": { + "evaluation_description": "Exact Match on MATH Level 5", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.0914 + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "hf_dataset", + "hf_repo": "Idavidrein/gpqa" + }, + "metric_config": { + "evaluation_description": "Accuracy on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3029 + } + }, + { + "evaluation_name": "MUSR", + "source_data": { + "dataset_name": "MUSR", + "source_type": "hf_dataset", + "hf_repo": "TAUR-Lab/MuSR" + }, + "metric_config": { + "evaluation_description": "Accuracy on MUSR", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4203 + } + }, + { + "evaluation_name": "MMLU-PRO", + "source_data": { + "dataset_name": "MMLU-PRO", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/MMLU-Pro" + }, + "metric_config": { + "evaluation_description": "Accuracy on MMLU-PRO", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3692 + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench/mistralai_Mixtral-8x7B-Instruct-v0.1/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ + { + "evaluation_name": "Score", + "metric_config": { + "evaluation_description": "Overall RewardBench Score", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7455 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Chat", + "metric_config": { + "evaluation_description": "Chat accuracy - includes easy chat subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.9497 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Chat Hard", + "metric_config": { + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6404 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Safety", + "metric_config": { + "evaluation_description": "Safety accuracy - includes safety subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7257 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Reasoning", + "metric_config": { + "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7872 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + }, + { + "evaluation_name": "Prior Sets (0.5 weight)", + "metric_config": { + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5033 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" + } + } + ], + "detailed_evaluation_results": null, + "generation_config": null + } + ] +} \ No newline at end of file diff --git a/data/mistralai_Mixtral-8x7B-v0.1.json b/data/models/mistralai_Mixtral-8x7B-v0.1.json similarity index 100% rename from data/mistralai_Mixtral-8x7B-v0.1.json rename to data/models/mistralai_Mixtral-8x7B-v0.1.json diff --git a/data/mistralai_mistral-7b-instruct-v0.3.json b/data/models/mistralai_mistral-7b-instruct-v0.3.json similarity index 100% rename from data/mistralai_mistral-7b-instruct-v0.3.json rename to data/models/mistralai_mistral-7b-instruct-v0.3.json diff --git a/data/mistralai_mistral-7b-v0.1.json b/data/models/mistralai_mistral-7b-v0.1.json similarity index 100% rename from data/mistralai_mistral-7b-v0.1.json rename to data/models/mistralai_mistral-7b-v0.1.json diff --git a/data/mistralai_mistral-large-2402.json b/data/models/mistralai_mistral-large-2402.json similarity index 100% rename from data/mistralai_mistral-large-2402.json rename to data/models/mistralai_mistral-large-2402.json diff --git a/data/mistralai_mistral-large-2407.json b/data/models/mistralai_mistral-large-2407.json similarity index 100% rename from data/mistralai_mistral-large-2407.json rename to data/models/mistralai_mistral-large-2407.json diff --git a/data/mistralai_mistral-large-2411.json b/data/models/mistralai_mistral-large-2411.json similarity index 100% rename from data/mistralai_mistral-large-2411.json rename to data/models/mistralai_mistral-large-2411.json diff --git a/data/mistralai_mistral-medium-2312.json b/data/models/mistralai_mistral-medium-2312.json similarity index 100% rename from data/mistralai_mistral-medium-2312.json rename to data/models/mistralai_mistral-medium-2312.json diff --git a/data/mistralai_mistral-medium-3.json b/data/models/mistralai_mistral-medium-3.json similarity index 100% rename from data/mistralai_mistral-medium-3.json rename to data/models/mistralai_mistral-medium-3.json index 3d0755e0237b9760231fee380b173b29d57eab4a..8c2cd1ffe5583f8e08ea6a503148c17defbde8f4 100644 --- a/data/mistralai_mistral-medium-3.json +++ b/data/models/mistralai_mistral-medium-3.json @@ -10,8 +10,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/mistralai_mistral-medium-3/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/mistralai_mistral-medium-3/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -525,8 +525,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/mistralai_mistral-medium-3/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/mistralai_mistral-medium-3/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/mistralai_mistral-small-2402.json b/data/models/mistralai_mistral-small-2402.json similarity index 100% rename from data/mistralai_mistral-small-2402.json rename to data/models/mistralai_mistral-small-2402.json diff --git a/data/mistralai_mistral-small-2503.json b/data/models/mistralai_mistral-small-2503.json similarity index 100% rename from data/mistralai_mistral-small-2503.json rename to data/models/mistralai_mistral-small-2503.json index bd0d3f2eb4330a1375f6cbb298fb1e3d1d2d934c..e610b9a657b4d002310344dd461fd47557dbb4c2 100644 --- a/data/mistralai_mistral-small-2503.json +++ b/data/models/mistralai_mistral-small-2503.json @@ -7,8 +7,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/mistralai_mistral-small-2503/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/mistralai_mistral-small-2503/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -522,8 +522,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/mistralai_mistral-small-2503/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/mistralai_mistral-small-2503/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/mistralai_mixtral-8x22b-instruct-v0.1.json b/data/models/mistralai_mixtral-8x22b-instruct-v0.1.json similarity index 100% rename from data/mistralai_mixtral-8x22b-instruct-v0.1.json rename to data/models/mistralai_mixtral-8x22b-instruct-v0.1.json diff --git a/data/mistralai_mixtral-8x22b.json b/data/models/mistralai_mixtral-8x22b.json similarity index 100% rename from data/mistralai_mixtral-8x22b.json rename to data/models/mistralai_mixtral-8x22b.json diff --git a/data/mistralai_mixtral-8x7b-32kseqlen.json b/data/models/mistralai_mixtral-8x7b-32kseqlen.json similarity index 100% rename from data/mistralai_mixtral-8x7b-32kseqlen.json rename to data/models/mistralai_mixtral-8x7b-32kseqlen.json diff --git a/data/mistralai_mixtral-8x7b-instruct-v0.1.json b/data/models/mistralai_mixtral-8x7b-instruct-v0.1.json similarity index 100% rename from data/mistralai_mixtral-8x7b-instruct-v0.1.json rename to data/models/mistralai_mixtral-8x7b-instruct-v0.1.json diff --git a/data/mistralai_open-mistral-nemo-2407.json b/data/models/mistralai_open-mistral-nemo-2407.json similarity index 100% rename from data/mistralai_open-mistral-nemo-2407.json rename to data/models/mistralai_open-mistral-nemo-2407.json diff --git a/data/mixtao_MixTAO-7Bx2-MoE-v8.1.json b/data/models/mixtao_MixTAO-7Bx2-MoE-v8.1.json similarity index 100% rename from data/mixtao_MixTAO-7Bx2-MoE-v8.1.json rename to data/models/mixtao_MixTAO-7Bx2-MoE-v8.1.json diff --git a/data/mkurman_llama-3.2-MEDIT-3B-o1.json b/data/models/mkurman_llama-3.2-MEDIT-3B-o1.json similarity index 100% rename from data/mkurman_llama-3.2-MEDIT-3B-o1.json rename to data/models/mkurman_llama-3.2-MEDIT-3B-o1.json diff --git a/data/mkurman_phi-4-MedIT-11B-exp-1.json b/data/models/mkurman_phi-4-MedIT-11B-exp-1.json similarity index 100% rename from data/mkurman_phi-4-MedIT-11B-exp-1.json rename to data/models/mkurman_phi-4-MedIT-11B-exp-1.json diff --git a/data/mkurman_phi4-MedIT-10B-o1.json b/data/models/mkurman_phi4-MedIT-10B-o1.json similarity index 100% rename from data/mkurman_phi4-MedIT-10B-o1.json rename to data/models/mkurman_phi4-MedIT-10B-o1.json diff --git a/data/mkxu_llama-3-8b-instruct-fpo.json b/data/models/mkxu_llama-3-8b-instruct-fpo.json similarity index 100% rename from data/mkxu_llama-3-8b-instruct-fpo.json rename to data/models/mkxu_llama-3-8b-instruct-fpo.json diff --git a/data/mkxu_llama-3-8b-po1.json b/data/models/mkxu_llama-3-8b-po1.json similarity index 100% rename from data/mkxu_llama-3-8b-po1.json rename to data/models/mkxu_llama-3-8b-po1.json diff --git a/data/mlabonne_AlphaMonarch-7B.json b/data/models/mlabonne_AlphaMonarch-7B.json similarity index 100% rename from data/mlabonne_AlphaMonarch-7B.json rename to data/models/mlabonne_AlphaMonarch-7B.json diff --git a/data/mlabonne_Beyonder-4x7B-v3.json b/data/models/mlabonne_Beyonder-4x7B-v3.json similarity index 100% rename from data/mlabonne_Beyonder-4x7B-v3.json rename to data/models/mlabonne_Beyonder-4x7B-v3.json diff --git a/data/mlabonne_BigQwen2.5-52B-Instruct.json b/data/models/mlabonne_BigQwen2.5-52B-Instruct.json similarity index 100% rename from data/mlabonne_BigQwen2.5-52B-Instruct.json rename to data/models/mlabonne_BigQwen2.5-52B-Instruct.json diff --git a/data/mlabonne_BigQwen2.5-Echo-47B-Instruct.json b/data/models/mlabonne_BigQwen2.5-Echo-47B-Instruct.json similarity index 100% rename from data/mlabonne_BigQwen2.5-Echo-47B-Instruct.json rename to data/models/mlabonne_BigQwen2.5-Echo-47B-Instruct.json diff --git a/data/mlabonne_ChimeraLlama-3-8B-v2.json b/data/models/mlabonne_ChimeraLlama-3-8B-v2.json similarity index 100% rename from data/mlabonne_ChimeraLlama-3-8B-v2.json rename to data/models/mlabonne_ChimeraLlama-3-8B-v2.json diff --git a/data/mlabonne_ChimeraLlama-3-8B-v3.json b/data/models/mlabonne_ChimeraLlama-3-8B-v3.json similarity index 100% rename from data/mlabonne_ChimeraLlama-3-8B-v3.json rename to data/models/mlabonne_ChimeraLlama-3-8B-v3.json diff --git a/data/mlabonne_Daredevil-8B-abliterated.json b/data/models/mlabonne_Daredevil-8B-abliterated.json similarity index 100% rename from data/mlabonne_Daredevil-8B-abliterated.json rename to data/models/mlabonne_Daredevil-8B-abliterated.json diff --git a/data/mlabonne_Daredevil-8B.json b/data/models/mlabonne_Daredevil-8B.json similarity index 100% rename from data/mlabonne_Daredevil-8B.json rename to data/models/mlabonne_Daredevil-8B.json diff --git a/data/mlabonne_Hermes-3-Llama-3.1-70B-lorablated.json b/data/models/mlabonne_Hermes-3-Llama-3.1-70B-lorablated.json similarity index 100% rename from data/mlabonne_Hermes-3-Llama-3.1-70B-lorablated.json rename to data/models/mlabonne_Hermes-3-Llama-3.1-70B-lorablated.json diff --git a/data/mlabonne_Meta-Llama-3.1-8B-Instruct-abliterated.json b/data/models/mlabonne_Meta-Llama-3.1-8B-Instruct-abliterated.json similarity index 100% rename from data/mlabonne_Meta-Llama-3.1-8B-Instruct-abliterated.json rename to data/models/mlabonne_Meta-Llama-3.1-8B-Instruct-abliterated.json diff --git a/data/mlabonne_NeuralBeagle14-7B.json b/data/models/mlabonne_NeuralBeagle14-7B.json similarity index 100% rename from data/mlabonne_NeuralBeagle14-7B.json rename to data/models/mlabonne_NeuralBeagle14-7B.json diff --git a/data/mlabonne_NeuralDaredevil-8B-abliterated.json b/data/models/mlabonne_NeuralDaredevil-8B-abliterated.json similarity index 100% rename from data/mlabonne_NeuralDaredevil-8B-abliterated.json rename to data/models/mlabonne_NeuralDaredevil-8B-abliterated.json diff --git a/data/mlabonne_OrpoLlama-3-8B.json b/data/models/mlabonne_OrpoLlama-3-8B.json similarity index 100% rename from data/mlabonne_OrpoLlama-3-8B.json rename to data/models/mlabonne_OrpoLlama-3-8B.json diff --git a/data/mlabonne_phixtral-2x2_8.json b/data/models/mlabonne_phixtral-2x2_8.json similarity index 100% rename from data/mlabonne_phixtral-2x2_8.json rename to data/models/mlabonne_phixtral-2x2_8.json diff --git a/data/mlx-community_Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32.json b/data/models/mlx-community_Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32.json similarity index 100% rename from data/mlx-community_Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32.json rename to data/models/mlx-community_Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32.json diff --git a/data/mlx-community_Mistral-Small-24B-Instruct-2501-bf16.json b/data/models/mlx-community_Mistral-Small-24B-Instruct-2501-bf16.json similarity index 100% rename from data/mlx-community_Mistral-Small-24B-Instruct-2501-bf16.json rename to data/models/mlx-community_Mistral-Small-24B-Instruct-2501-bf16.json diff --git a/data/mmnga_Llama-3-70B-japanese-suzume-vector-v0.1.json b/data/models/mmnga_Llama-3-70B-japanese-suzume-vector-v0.1.json similarity index 100% rename from data/mmnga_Llama-3-70B-japanese-suzume-vector-v0.1.json rename to data/models/mmnga_Llama-3-70B-japanese-suzume-vector-v0.1.json diff --git a/data/mobiuslabsgmbh_DeepSeek-R1-ReDistill-Llama3-8B-v1.1.json b/data/models/mobiuslabsgmbh_DeepSeek-R1-ReDistill-Llama3-8B-v1.1.json similarity index 100% rename from data/mobiuslabsgmbh_DeepSeek-R1-ReDistill-Llama3-8B-v1.1.json rename to data/models/mobiuslabsgmbh_DeepSeek-R1-ReDistill-Llama3-8B-v1.1.json diff --git a/data/mobiuslabsgmbh_DeepSeek-R1-ReDistill-Qwen-7B-v1.1.json b/data/models/mobiuslabsgmbh_DeepSeek-R1-ReDistill-Qwen-7B-v1.1.json similarity index 100% rename from data/mobiuslabsgmbh_DeepSeek-R1-ReDistill-Qwen-7B-v1.1.json rename to data/models/mobiuslabsgmbh_DeepSeek-R1-ReDistill-Qwen-7B-v1.1.json diff --git a/data/moeru-ai_L3.1-Moe-2x8B-v0.2.json b/data/models/moeru-ai_L3.1-Moe-2x8B-v0.2.json similarity index 100% rename from data/moeru-ai_L3.1-Moe-2x8B-v0.2.json rename to data/models/moeru-ai_L3.1-Moe-2x8B-v0.2.json diff --git a/data/moeru-ai_L3.1-Moe-4x8B-v0.1.json b/data/models/moeru-ai_L3.1-Moe-4x8B-v0.1.json similarity index 100% rename from data/moeru-ai_L3.1-Moe-4x8B-v0.1.json rename to data/models/moeru-ai_L3.1-Moe-4x8B-v0.1.json diff --git a/data/moeru-ai_L3.1-Moe-4x8B-v0.2.json b/data/models/moeru-ai_L3.1-Moe-4x8B-v0.2.json similarity index 100% rename from data/moeru-ai_L3.1-Moe-4x8B-v0.2.json rename to data/models/moeru-ai_L3.1-Moe-4x8B-v0.2.json diff --git a/data/monsterapi_Llama-3_1-8B-Instruct-orca-ORPO.json b/data/models/monsterapi_Llama-3_1-8B-Instruct-orca-ORPO.json similarity index 100% rename from data/monsterapi_Llama-3_1-8B-Instruct-orca-ORPO.json rename to data/models/monsterapi_Llama-3_1-8B-Instruct-orca-ORPO.json diff --git a/data/monsterapi_gemma-2-2b-LoRA-MonsterInstruct.json b/data/models/monsterapi_gemma-2-2b-LoRA-MonsterInstruct.json similarity index 100% rename from data/monsterapi_gemma-2-2b-LoRA-MonsterInstruct.json rename to data/models/monsterapi_gemma-2-2b-LoRA-MonsterInstruct.json diff --git a/data/moonshot-ai_kimi-k2-instruct.json b/data/models/moonshot-ai_kimi-k2-instruct.json similarity index 98% rename from data/moonshot-ai_kimi-k2-instruct.json rename to data/models/moonshot-ai_kimi-k2-instruct.json index 0d9f363694c8a12a4ddbdcabf8d86bc4c1928a0e..78ebe91a361fd2e157f2607abf27ff68d501e0bb 100644 --- a/data/moonshot-ai_kimi-k2-instruct.json +++ b/data/models/moonshot-ai_kimi-k2-instruct.json @@ -4,13 +4,13 @@ "id": "moonshot-ai/kimi-k2-instruct", "developer": "Moonshot AI", "additional_details": { - "agent_name": "Terminus 2", - "agent_organization": "Terminal Bench" + "agent_name": "OpenHands", + "agent_organization": "OpenHands" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/openhands__kimi-k2-instruct/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__kimi-k2-instruct/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-11-01", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 26.7, + "score": 27.8, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Kimi K2 Instruct\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Kimi K2 Instruct\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Kimi K2 Instruct\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Kimi K2 Instruct\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__kimi-k2-instruct/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__kimi-k2-instruct/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-01", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 27.8, + "score": 26.7, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Kimi K2 Instruct\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Kimi K2 Instruct\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Kimi K2 Instruct\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Kimi K2 Instruct\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/moonshot-ai_kimi-k2-thinking.json b/data/models/moonshot-ai_kimi-k2-thinking.json similarity index 100% rename from data/moonshot-ai_kimi-k2-thinking.json rename to data/models/moonshot-ai_kimi-k2-thinking.json diff --git a/data/moonshot-ai_kimi-k2.5.json b/data/models/moonshot-ai_kimi-k2.5.json similarity index 100% rename from data/moonshot-ai_kimi-k2.5.json rename to data/models/moonshot-ai_kimi-k2.5.json diff --git a/data/moonshot_Kimi_K2.5.json b/data/models/moonshot_Kimi_K2.5.json similarity index 100% rename from data/moonshot_Kimi_K2.5.json rename to data/models/moonshot_Kimi_K2.5.json diff --git a/data/moonshot_Kimi_K2_Thinking.json b/data/models/moonshot_Kimi_K2_Thinking.json similarity index 100% rename from data/moonshot_Kimi_K2_Thinking.json rename to data/models/moonshot_Kimi_K2_Thinking.json diff --git a/data/moonshotai_kimi-k2-instruct.json b/data/models/moonshotai_kimi-k2-instruct.json similarity index 100% rename from data/moonshotai_kimi-k2-instruct.json rename to data/models/moonshotai_kimi-k2-instruct.json diff --git a/data/mosaicml_MPT-30B.json b/data/models/mosaicml_MPT-30B.json similarity index 100% rename from data/mosaicml_MPT-30B.json rename to data/models/mosaicml_MPT-30B.json diff --git a/data/mosaicml_MPT-Instruct-30B.json b/data/models/mosaicml_MPT-Instruct-30B.json similarity index 100% rename from data/mosaicml_MPT-Instruct-30B.json rename to data/models/mosaicml_MPT-Instruct-30B.json diff --git a/data/mosaicml_mpt-7b.json b/data/models/mosaicml_mpt-7b.json similarity index 100% rename from data/mosaicml_mpt-7b.json rename to data/models/mosaicml_mpt-7b.json diff --git a/data/mosama_Qwen2.5-1.5B-Instruct-CoT-Reflection.json b/data/models/mosama_Qwen2.5-1.5B-Instruct-CoT-Reflection.json similarity index 100% rename from data/mosama_Qwen2.5-1.5B-Instruct-CoT-Reflection.json rename to data/models/mosama_Qwen2.5-1.5B-Instruct-CoT-Reflection.json diff --git a/data/mrdayl_OpenCogito.json b/data/models/mrdayl_OpenCogito.json similarity index 100% rename from data/mrdayl_OpenCogito.json rename to data/models/mrdayl_OpenCogito.json diff --git a/data/mrdayl_OpenCognito-r1.json b/data/models/mrdayl_OpenCognito-r1.json similarity index 100% rename from data/mrdayl_OpenCognito-r1.json rename to data/models/mrdayl_OpenCognito-r1.json diff --git a/data/mrdayl_OpenCognito-r2.json b/data/models/mrdayl_OpenCognito-r2.json similarity index 100% rename from data/mrdayl_OpenCognito-r2.json rename to data/models/mrdayl_OpenCognito-r2.json diff --git a/data/mrdayl_OpenCognito.json b/data/models/mrdayl_OpenCognito.json similarity index 100% rename from data/mrdayl_OpenCognito.json rename to data/models/mrdayl_OpenCognito.json diff --git a/data/mrdayl_OpenThink.json b/data/models/mrdayl_OpenThink.json similarity index 100% rename from data/mrdayl_OpenThink.json rename to data/models/mrdayl_OpenThink.json diff --git a/data/mrm8488_phi-4-14B-grpo-gsm8k-3e.json b/data/models/mrm8488_phi-4-14B-grpo-gsm8k-3e.json similarity index 100% rename from data/mrm8488_phi-4-14B-grpo-gsm8k-3e.json rename to data/models/mrm8488_phi-4-14B-grpo-gsm8k-3e.json diff --git a/data/mrm8488_phi-4-14B-grpo-limo.json b/data/models/mrm8488_phi-4-14B-grpo-limo.json similarity index 100% rename from data/mrm8488_phi-4-14B-grpo-limo.json rename to data/models/mrm8488_phi-4-14B-grpo-limo.json diff --git a/data/mukaj_Llama-3.1-Hawkish-8B.json b/data/models/mukaj_Llama-3.1-Hawkish-8B.json similarity index 100% rename from data/mukaj_Llama-3.1-Hawkish-8B.json rename to data/models/mukaj_Llama-3.1-Hawkish-8B.json diff --git a/data/multiple_multiple.json b/data/models/multiple_multiple.json similarity index 100% rename from data/multiple_multiple.json rename to data/models/multiple_multiple.json index 4d0fac80e7b789c71c1ed5915767c64138db2d79..962582d1837ba83db0927dd438888b7d933863ce 100644 --- a/data/multiple_multiple.json +++ b/data/models/multiple_multiple.json @@ -10,7 +10,7 @@ }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/abacus-ai-desktop__multiple/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/warp__multiple/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-11", + "evaluation_timestamp": "2025-11-20", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,7 +43,7 @@ "max_score": 100.0 }, "score_details": { - "score": 58.4, + "score": 59.1, "uncertainty": { "standard_error": { "value": 2.8 @@ -53,7 +53,7 @@ }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Abacus AI Desktop\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Abacus AI Desktop\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/ob-1__multiple/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/abacus-ai-desktop__multiple/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-03-05", + "evaluation_timestamp": "2025-12-11", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 72.4, + "score": 58.4, "uncertainty": { "standard_error": { - "value": 2.3 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OB-1\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Abacus AI Desktop\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OB-1\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Abacus AI Desktop\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-11", + "evaluation_timestamp": "2025-12-12", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,10 +265,10 @@ "max_score": 100.0 }, "score_details": { - "score": 50.1, + "score": 61.2, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 3.0 }, "num_samples": 435 } @@ -306,7 +306,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/warp__multiple/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/ob-1__multiple/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -330,7 +330,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-20", + "evaluation_timestamp": "2026-03-05", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -339,17 +339,17 @@ "max_score": 100.0 }, "score_details": { - "score": 59.1, + "score": 72.4, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.3 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OB-1\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -366,7 +366,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OB-1\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -404,7 +404,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-12", + "evaluation_timestamp": "2025-11-11", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -413,10 +413,10 @@ "max_score": 100.0 }, "score_details": { - "score": 61.2, + "score": 50.1, "uncertainty": { "standard_error": { - "value": 3.0 + "value": 2.7 }, "num_samples": 435 } diff --git a/data/my_model.json b/data/models/my_model.json similarity index 100% rename from data/my_model.json rename to data/models/my_model.json diff --git a/data/natong19_Mistral-Nemo-Instruct-2407-abliterated.json b/data/models/natong19_Mistral-Nemo-Instruct-2407-abliterated.json similarity index 100% rename from data/natong19_Mistral-Nemo-Instruct-2407-abliterated.json rename to data/models/natong19_Mistral-Nemo-Instruct-2407-abliterated.json diff --git a/data/natong19_Qwen2-7B-Instruct-abliterated.json b/data/models/natong19_Qwen2-7B-Instruct-abliterated.json similarity index 100% rename from data/natong19_Qwen2-7B-Instruct-abliterated.json rename to data/models/natong19_Qwen2-7B-Instruct-abliterated.json diff --git a/data/nazimali_Mistral-Nemo-Kurdish-Instruct.json b/data/models/nazimali_Mistral-Nemo-Kurdish-Instruct.json similarity index 100% rename from data/nazimali_Mistral-Nemo-Kurdish-Instruct.json rename to data/models/nazimali_Mistral-Nemo-Kurdish-Instruct.json diff --git a/data/nazimali_Mistral-Nemo-Kurdish.json b/data/models/nazimali_Mistral-Nemo-Kurdish.json similarity index 100% rename from data/nazimali_Mistral-Nemo-Kurdish.json rename to data/models/nazimali_Mistral-Nemo-Kurdish.json diff --git a/data/nbeerbower_BigKartoffel-mistral-nemo-20B.json b/data/models/nbeerbower_BigKartoffel-mistral-nemo-20B.json similarity index 100% rename from data/nbeerbower_BigKartoffel-mistral-nemo-20B.json rename to data/models/nbeerbower_BigKartoffel-mistral-nemo-20B.json diff --git a/data/nbeerbower_DoppelKartoffel-Mistral-Nemo-23B.json b/data/models/nbeerbower_DoppelKartoffel-Mistral-Nemo-23B.json similarity index 100% rename from data/nbeerbower_DoppelKartoffel-Mistral-Nemo-23B.json rename to data/models/nbeerbower_DoppelKartoffel-Mistral-Nemo-23B.json diff --git a/data/nbeerbower_DoublePotato-Mistral-Nemo-13B.json b/data/models/nbeerbower_DoublePotato-Mistral-Nemo-13B.json similarity index 100% rename from data/nbeerbower_DoublePotato-Mistral-Nemo-13B.json rename to data/models/nbeerbower_DoublePotato-Mistral-Nemo-13B.json diff --git a/data/nbeerbower_Dumpling-Qwen2.5-1.5B.json b/data/models/nbeerbower_Dumpling-Qwen2.5-1.5B.json similarity index 100% rename from data/nbeerbower_Dumpling-Qwen2.5-1.5B.json rename to data/models/nbeerbower_Dumpling-Qwen2.5-1.5B.json diff --git a/data/nbeerbower_Dumpling-Qwen2.5-14B.json b/data/models/nbeerbower_Dumpling-Qwen2.5-14B.json similarity index 100% rename from data/nbeerbower_Dumpling-Qwen2.5-14B.json rename to data/models/nbeerbower_Dumpling-Qwen2.5-14B.json diff --git a/data/nbeerbower_Dumpling-Qwen2.5-7B-1k-r16.json b/data/models/nbeerbower_Dumpling-Qwen2.5-7B-1k-r16.json similarity index 100% rename from data/nbeerbower_Dumpling-Qwen2.5-7B-1k-r16.json rename to data/models/nbeerbower_Dumpling-Qwen2.5-7B-1k-r16.json diff --git a/data/nbeerbower_Dumpling-Qwen2.5-7B-1k-r64-2e-5.json b/data/models/nbeerbower_Dumpling-Qwen2.5-7B-1k-r64-2e-5.json similarity index 100% rename from data/nbeerbower_Dumpling-Qwen2.5-7B-1k-r64-2e-5.json rename to data/models/nbeerbower_Dumpling-Qwen2.5-7B-1k-r64-2e-5.json diff --git a/data/nbeerbower_EVA-abliterated-TIES-Qwen2.5-1.5B.json b/data/models/nbeerbower_EVA-abliterated-TIES-Qwen2.5-1.5B.json similarity index 100% rename from data/nbeerbower_EVA-abliterated-TIES-Qwen2.5-1.5B.json rename to data/models/nbeerbower_EVA-abliterated-TIES-Qwen2.5-1.5B.json diff --git a/data/nbeerbower_EVA-abliterated-TIES-Qwen2.5-14B.json b/data/models/nbeerbower_EVA-abliterated-TIES-Qwen2.5-14B.json similarity index 100% rename from data/nbeerbower_EVA-abliterated-TIES-Qwen2.5-14B.json rename to data/models/nbeerbower_EVA-abliterated-TIES-Qwen2.5-14B.json diff --git a/data/nbeerbower_Flammades-Mistral-Nemo-12B.json b/data/models/nbeerbower_Flammades-Mistral-Nemo-12B.json similarity index 100% rename from data/nbeerbower_Flammades-Mistral-Nemo-12B.json rename to data/models/nbeerbower_Flammades-Mistral-Nemo-12B.json diff --git a/data/nbeerbower_Gemma2-Gutenberg-Doppel-9B.json b/data/models/nbeerbower_Gemma2-Gutenberg-Doppel-9B.json similarity index 100% rename from data/nbeerbower_Gemma2-Gutenberg-Doppel-9B.json rename to data/models/nbeerbower_Gemma2-Gutenberg-Doppel-9B.json diff --git a/data/nbeerbower_Gutensuppe-mistral-nemo-12B.json b/data/models/nbeerbower_Gutensuppe-mistral-nemo-12B.json similarity index 100% rename from data/nbeerbower_Gutensuppe-mistral-nemo-12B.json rename to data/models/nbeerbower_Gutensuppe-mistral-nemo-12B.json diff --git a/data/nbeerbower_Hermes2-Gutenberg2-Mistral-7B.json b/data/models/nbeerbower_Hermes2-Gutenberg2-Mistral-7B.json similarity index 100% rename from data/nbeerbower_Hermes2-Gutenberg2-Mistral-7B.json rename to data/models/nbeerbower_Hermes2-Gutenberg2-Mistral-7B.json diff --git a/data/nbeerbower_Kartoffel-Deepfry-12B.json b/data/models/nbeerbower_Kartoffel-Deepfry-12B.json similarity index 100% rename from data/nbeerbower_Kartoffel-Deepfry-12B.json rename to data/models/nbeerbower_Kartoffel-Deepfry-12B.json diff --git a/data/nbeerbower_Llama-3.1-Nemotron-lorablated-70B.json b/data/models/nbeerbower_Llama-3.1-Nemotron-lorablated-70B.json similarity index 100% rename from data/nbeerbower_Llama-3.1-Nemotron-lorablated-70B.json rename to data/models/nbeerbower_Llama-3.1-Nemotron-lorablated-70B.json diff --git a/data/nbeerbower_Llama3.1-Gutenberg-Doppel-70B.json b/data/models/nbeerbower_Llama3.1-Gutenberg-Doppel-70B.json similarity index 100% rename from data/nbeerbower_Llama3.1-Gutenberg-Doppel-70B.json rename to data/models/nbeerbower_Llama3.1-Gutenberg-Doppel-70B.json diff --git a/data/nbeerbower_Lyra-Gutenberg-mistral-nemo-12B.json b/data/models/nbeerbower_Lyra-Gutenberg-mistral-nemo-12B.json similarity index 100% rename from data/nbeerbower_Lyra-Gutenberg-mistral-nemo-12B.json rename to data/models/nbeerbower_Lyra-Gutenberg-mistral-nemo-12B.json diff --git a/data/nbeerbower_Lyra4-Gutenberg-12B.json b/data/models/nbeerbower_Lyra4-Gutenberg-12B.json similarity index 100% rename from data/nbeerbower_Lyra4-Gutenberg-12B.json rename to data/models/nbeerbower_Lyra4-Gutenberg-12B.json diff --git a/data/nbeerbower_Lyra4-Gutenberg2-12B.json b/data/models/nbeerbower_Lyra4-Gutenberg2-12B.json similarity index 100% rename from data/nbeerbower_Lyra4-Gutenberg2-12B.json rename to data/models/nbeerbower_Lyra4-Gutenberg2-12B.json diff --git a/data/nbeerbower_Mahou-1.5-mistral-nemo-12B-lorablated.json b/data/models/nbeerbower_Mahou-1.5-mistral-nemo-12B-lorablated.json similarity index 100% rename from data/nbeerbower_Mahou-1.5-mistral-nemo-12B-lorablated.json rename to data/models/nbeerbower_Mahou-1.5-mistral-nemo-12B-lorablated.json diff --git a/data/nbeerbower_Mistral-Gutenberg-Doppel-7B-FFT.json b/data/models/nbeerbower_Mistral-Gutenberg-Doppel-7B-FFT.json similarity index 100% rename from data/nbeerbower_Mistral-Gutenberg-Doppel-7B-FFT.json rename to data/models/nbeerbower_Mistral-Gutenberg-Doppel-7B-FFT.json diff --git a/data/nbeerbower_Mistral-Nemo-Gutenberg-Doppel-12B-v2.json b/data/models/nbeerbower_Mistral-Nemo-Gutenberg-Doppel-12B-v2.json similarity index 100% rename from data/nbeerbower_Mistral-Nemo-Gutenberg-Doppel-12B-v2.json rename to data/models/nbeerbower_Mistral-Nemo-Gutenberg-Doppel-12B-v2.json diff --git a/data/nbeerbower_Mistral-Nemo-Gutenberg-Doppel-12B.json b/data/models/nbeerbower_Mistral-Nemo-Gutenberg-Doppel-12B.json similarity index 100% rename from data/nbeerbower_Mistral-Nemo-Gutenberg-Doppel-12B.json rename to data/models/nbeerbower_Mistral-Nemo-Gutenberg-Doppel-12B.json diff --git a/data/nbeerbower_Mistral-Nemo-Moderne-12B-FFT-experimental.json b/data/models/nbeerbower_Mistral-Nemo-Moderne-12B-FFT-experimental.json similarity index 100% rename from data/nbeerbower_Mistral-Nemo-Moderne-12B-FFT-experimental.json rename to data/models/nbeerbower_Mistral-Nemo-Moderne-12B-FFT-experimental.json diff --git a/data/nbeerbower_Mistral-Nemo-Prism-12B-v2.json b/data/models/nbeerbower_Mistral-Nemo-Prism-12B-v2.json similarity index 100% rename from data/nbeerbower_Mistral-Nemo-Prism-12B-v2.json rename to data/models/nbeerbower_Mistral-Nemo-Prism-12B-v2.json diff --git a/data/nbeerbower_Mistral-Nemo-Prism-12B-v7.json b/data/models/nbeerbower_Mistral-Nemo-Prism-12B-v7.json similarity index 100% rename from data/nbeerbower_Mistral-Nemo-Prism-12B-v7.json rename to data/models/nbeerbower_Mistral-Nemo-Prism-12B-v7.json diff --git a/data/nbeerbower_Mistral-Nemo-Prism-12B.json b/data/models/nbeerbower_Mistral-Nemo-Prism-12B.json similarity index 100% rename from data/nbeerbower_Mistral-Nemo-Prism-12B.json rename to data/models/nbeerbower_Mistral-Nemo-Prism-12B.json diff --git a/data/nbeerbower_Mistral-Small-Drummer-22B.json b/data/models/nbeerbower_Mistral-Small-Drummer-22B.json similarity index 100% rename from data/nbeerbower_Mistral-Small-Drummer-22B.json rename to data/models/nbeerbower_Mistral-Small-Drummer-22B.json diff --git a/data/nbeerbower_Mistral-Small-Gutenberg-Doppel-22B.json b/data/models/nbeerbower_Mistral-Small-Gutenberg-Doppel-22B.json similarity index 100% rename from data/nbeerbower_Mistral-Small-Gutenberg-Doppel-22B.json rename to data/models/nbeerbower_Mistral-Small-Gutenberg-Doppel-22B.json diff --git a/data/nbeerbower_Nemo-Loony-12B-experimental.json b/data/models/nbeerbower_Nemo-Loony-12B-experimental.json similarity index 100% rename from data/nbeerbower_Nemo-Loony-12B-experimental.json rename to data/models/nbeerbower_Nemo-Loony-12B-experimental.json diff --git a/data/nbeerbower_Nemoties-ChatML-12B.json b/data/models/nbeerbower_Nemoties-ChatML-12B.json similarity index 100% rename from data/nbeerbower_Nemoties-ChatML-12B.json rename to data/models/nbeerbower_Nemoties-ChatML-12B.json diff --git a/data/nbeerbower_Qwen2.5-Gutenberg-Doppel-14B.json b/data/models/nbeerbower_Qwen2.5-Gutenberg-Doppel-14B.json similarity index 100% rename from data/nbeerbower_Qwen2.5-Gutenberg-Doppel-14B.json rename to data/models/nbeerbower_Qwen2.5-Gutenberg-Doppel-14B.json diff --git a/data/nbeerbower_SmolNemo-12B-FFT-experimental.json b/data/models/nbeerbower_SmolNemo-12B-FFT-experimental.json similarity index 100% rename from data/nbeerbower_SmolNemo-12B-FFT-experimental.json rename to data/models/nbeerbower_SmolNemo-12B-FFT-experimental.json diff --git a/data/nbeerbower_Stella-mistral-nemo-12B-v2.json b/data/models/nbeerbower_Stella-mistral-nemo-12B-v2.json similarity index 100% rename from data/nbeerbower_Stella-mistral-nemo-12B-v2.json rename to data/models/nbeerbower_Stella-mistral-nemo-12B-v2.json diff --git a/data/nbeerbower_gemma2-gutenberg-27B.json b/data/models/nbeerbower_gemma2-gutenberg-27B.json similarity index 100% rename from data/nbeerbower_gemma2-gutenberg-27B.json rename to data/models/nbeerbower_gemma2-gutenberg-27B.json diff --git a/data/nbeerbower_gemma2-gutenberg-9B.json b/data/models/nbeerbower_gemma2-gutenberg-9B.json similarity index 100% rename from data/nbeerbower_gemma2-gutenberg-9B.json rename to data/models/nbeerbower_gemma2-gutenberg-9B.json diff --git a/data/nbeerbower_llama-3-gutenberg-8B.json b/data/models/nbeerbower_llama-3-gutenberg-8B.json similarity index 100% rename from data/nbeerbower_llama-3-gutenberg-8B.json rename to data/models/nbeerbower_llama-3-gutenberg-8B.json diff --git a/data/nbeerbower_llama3.1-cc-8B.json b/data/models/nbeerbower_llama3.1-cc-8B.json similarity index 100% rename from data/nbeerbower_llama3.1-cc-8B.json rename to data/models/nbeerbower_llama3.1-cc-8B.json diff --git a/data/nbeerbower_llama3.1-kartoffeldes-70B.json b/data/models/nbeerbower_llama3.1-kartoffeldes-70B.json similarity index 100% rename from data/nbeerbower_llama3.1-kartoffeldes-70B.json rename to data/models/nbeerbower_llama3.1-kartoffeldes-70B.json diff --git a/data/nbeerbower_mistral-nemo-bophades-12B.json b/data/models/nbeerbower_mistral-nemo-bophades-12B.json similarity index 100% rename from data/nbeerbower_mistral-nemo-bophades-12B.json rename to data/models/nbeerbower_mistral-nemo-bophades-12B.json diff --git a/data/nbeerbower_mistral-nemo-bophades3-12B.json b/data/models/nbeerbower_mistral-nemo-bophades3-12B.json similarity index 100% rename from data/nbeerbower_mistral-nemo-bophades3-12B.json rename to data/models/nbeerbower_mistral-nemo-bophades3-12B.json diff --git a/data/nbeerbower_mistral-nemo-cc-12B.json b/data/models/nbeerbower_mistral-nemo-cc-12B.json similarity index 100% rename from data/nbeerbower_mistral-nemo-cc-12B.json rename to data/models/nbeerbower_mistral-nemo-cc-12B.json diff --git a/data/nbeerbower_mistral-nemo-gutades-12B.json b/data/models/nbeerbower_mistral-nemo-gutades-12B.json similarity index 100% rename from data/nbeerbower_mistral-nemo-gutades-12B.json rename to data/models/nbeerbower_mistral-nemo-gutades-12B.json diff --git a/data/nbeerbower_mistral-nemo-gutenberg-12B-v2.json b/data/models/nbeerbower_mistral-nemo-gutenberg-12B-v2.json similarity index 100% rename from data/nbeerbower_mistral-nemo-gutenberg-12B-v2.json rename to data/models/nbeerbower_mistral-nemo-gutenberg-12B-v2.json diff --git a/data/nbeerbower_mistral-nemo-gutenberg-12B-v3.json b/data/models/nbeerbower_mistral-nemo-gutenberg-12B-v3.json similarity index 100% rename from data/nbeerbower_mistral-nemo-gutenberg-12B-v3.json rename to data/models/nbeerbower_mistral-nemo-gutenberg-12B-v3.json diff --git a/data/nbeerbower_mistral-nemo-gutenberg-12B-v4.json b/data/models/nbeerbower_mistral-nemo-gutenberg-12B-v4.json similarity index 100% rename from data/nbeerbower_mistral-nemo-gutenberg-12B-v4.json rename to data/models/nbeerbower_mistral-nemo-gutenberg-12B-v4.json diff --git a/data/nbeerbower_mistral-nemo-gutenberg-12B.json b/data/models/nbeerbower_mistral-nemo-gutenberg-12B.json similarity index 100% rename from data/nbeerbower_mistral-nemo-gutenberg-12B.json rename to data/models/nbeerbower_mistral-nemo-gutenberg-12B.json diff --git a/data/nbeerbower_mistral-nemo-gutenberg2-12B-test.json b/data/models/nbeerbower_mistral-nemo-gutenberg2-12B-test.json similarity index 100% rename from data/nbeerbower_mistral-nemo-gutenberg2-12B-test.json rename to data/models/nbeerbower_mistral-nemo-gutenberg2-12B-test.json diff --git a/data/nbeerbower_mistral-nemo-kartoffel-12B.json b/data/models/nbeerbower_mistral-nemo-kartoffel-12B.json similarity index 100% rename from data/nbeerbower_mistral-nemo-kartoffel-12B.json rename to data/models/nbeerbower_mistral-nemo-kartoffel-12B.json diff --git a/data/nbeerbower_mistral-nemo-narwhal-12B.json b/data/models/nbeerbower_mistral-nemo-narwhal-12B.json similarity index 100% rename from data/nbeerbower_mistral-nemo-narwhal-12B.json rename to data/models/nbeerbower_mistral-nemo-narwhal-12B.json diff --git a/data/nbeerbower_mistral-nemo-wissenschaft-12B.json b/data/models/nbeerbower_mistral-nemo-wissenschaft-12B.json similarity index 100% rename from data/nbeerbower_mistral-nemo-wissenschaft-12B.json rename to data/models/nbeerbower_mistral-nemo-wissenschaft-12B.json diff --git a/data/nbrahme_IndusQ.json b/data/models/nbrahme_IndusQ.json similarity index 100% rename from data/nbrahme_IndusQ.json rename to data/models/nbrahme_IndusQ.json diff --git a/data/necva_IE-cont-Llama3.1-8B.json b/data/models/necva_IE-cont-Llama3.1-8B.json similarity index 100% rename from data/necva_IE-cont-Llama3.1-8B.json rename to data/models/necva_IE-cont-Llama3.1-8B.json diff --git a/data/necva_replica-IEPile.json b/data/models/necva_replica-IEPile.json similarity index 100% rename from data/necva_replica-IEPile.json rename to data/models/necva_replica-IEPile.json diff --git a/data/neopolita_jessi-v0.1-bf16-falcon3-7b-instruct.json b/data/models/neopolita_jessi-v0.1-bf16-falcon3-7b-instruct.json similarity index 100% rename from data/neopolita_jessi-v0.1-bf16-falcon3-7b-instruct.json rename to data/models/neopolita_jessi-v0.1-bf16-falcon3-7b-instruct.json diff --git a/data/neopolita_jessi-v0.1-falcon3-10b-instruct.json b/data/models/neopolita_jessi-v0.1-falcon3-10b-instruct.json similarity index 100% rename from data/neopolita_jessi-v0.1-falcon3-10b-instruct.json rename to data/models/neopolita_jessi-v0.1-falcon3-10b-instruct.json diff --git a/data/neopolita_jessi-v0.1-qwen2.5-7b-instruct.json b/data/models/neopolita_jessi-v0.1-qwen2.5-7b-instruct.json similarity index 100% rename from data/neopolita_jessi-v0.1-qwen2.5-7b-instruct.json rename to data/models/neopolita_jessi-v0.1-qwen2.5-7b-instruct.json diff --git a/data/neopolita_jessi-v0.1-virtuoso-small.json b/data/models/neopolita_jessi-v0.1-virtuoso-small.json similarity index 100% rename from data/neopolita_jessi-v0.1-virtuoso-small.json rename to data/models/neopolita_jessi-v0.1-virtuoso-small.json diff --git a/data/neopolita_jessi-v0.2-falcon3-10b-instruct.json b/data/models/neopolita_jessi-v0.2-falcon3-10b-instruct.json similarity index 100% rename from data/neopolita_jessi-v0.2-falcon3-10b-instruct.json rename to data/models/neopolita_jessi-v0.2-falcon3-10b-instruct.json diff --git a/data/neopolita_jessi-v0.2-falcon3-7b-instruct.json b/data/models/neopolita_jessi-v0.2-falcon3-7b-instruct.json similarity index 100% rename from data/neopolita_jessi-v0.2-falcon3-7b-instruct.json rename to data/models/neopolita_jessi-v0.2-falcon3-7b-instruct.json diff --git a/data/neopolita_jessi-v0.3-falcon3-7b-instruct.json b/data/models/neopolita_jessi-v0.3-falcon3-7b-instruct.json similarity index 100% rename from data/neopolita_jessi-v0.3-falcon3-7b-instruct.json rename to data/models/neopolita_jessi-v0.3-falcon3-7b-instruct.json diff --git a/data/neopolita_jessi-v0.4-falcon3-7b-instruct.json b/data/models/neopolita_jessi-v0.4-falcon3-7b-instruct.json similarity index 100% rename from data/neopolita_jessi-v0.4-falcon3-7b-instruct.json rename to data/models/neopolita_jessi-v0.4-falcon3-7b-instruct.json diff --git a/data/neopolita_jessi-v0.5-falcon3-7b-instruct.json b/data/models/neopolita_jessi-v0.5-falcon3-7b-instruct.json similarity index 100% rename from data/neopolita_jessi-v0.5-falcon3-7b-instruct.json rename to data/models/neopolita_jessi-v0.5-falcon3-7b-instruct.json diff --git a/data/neopolita_jessi-v0.6-falcon3-7b-instruct.json b/data/models/neopolita_jessi-v0.6-falcon3-7b-instruct.json similarity index 100% rename from data/neopolita_jessi-v0.6-falcon3-7b-instruct.json rename to data/models/neopolita_jessi-v0.6-falcon3-7b-instruct.json diff --git a/data/neopolita_loki-v0.1-virtuoso.json b/data/models/neopolita_loki-v0.1-virtuoso.json similarity index 100% rename from data/neopolita_loki-v0.1-virtuoso.json rename to data/models/neopolita_loki-v0.1-virtuoso.json diff --git a/data/netcat420_DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b.json b/data/models/netcat420_DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b.json similarity index 100% rename from data/netcat420_DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b.json rename to data/models/netcat420_DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b.json diff --git a/data/netcat420_DeepSeek-R1-MFANN-TIES-unretrained-7b.json b/data/models/netcat420_DeepSeek-R1-MFANN-TIES-unretrained-7b.json similarity index 100% rename from data/netcat420_DeepSeek-R1-MFANN-TIES-unretrained-7b.json rename to data/models/netcat420_DeepSeek-R1-MFANN-TIES-unretrained-7b.json diff --git a/data/netcat420_Llama3.1-MFANN-8b.json b/data/models/netcat420_Llama3.1-MFANN-8b.json similarity index 100% rename from data/netcat420_Llama3.1-MFANN-8b.json rename to data/models/netcat420_Llama3.1-MFANN-8b.json diff --git a/data/netcat420_MFANN-Llama3.1-Abliterated-SLERP-TIES-V2.json b/data/models/netcat420_MFANN-Llama3.1-Abliterated-SLERP-TIES-V2.json similarity index 100% rename from data/netcat420_MFANN-Llama3.1-Abliterated-SLERP-TIES-V2.json rename to data/models/netcat420_MFANN-Llama3.1-Abliterated-SLERP-TIES-V2.json diff --git a/data/netcat420_MFANN-Llama3.1-Abliterated-SLERP-TIES-V3.json b/data/models/netcat420_MFANN-Llama3.1-Abliterated-SLERP-TIES-V3.json similarity index 100% rename from data/netcat420_MFANN-Llama3.1-Abliterated-SLERP-TIES-V3.json rename to data/models/netcat420_MFANN-Llama3.1-Abliterated-SLERP-TIES-V3.json diff --git a/data/netcat420_MFANN-Llama3.1-Abliterated-SLERP-V4.json b/data/models/netcat420_MFANN-Llama3.1-Abliterated-SLERP-V4.json similarity index 100% rename from data/netcat420_MFANN-Llama3.1-Abliterated-SLERP-V4.json rename to data/models/netcat420_MFANN-Llama3.1-Abliterated-SLERP-V4.json diff --git a/data/netcat420_MFANN-Llama3.1-Abliterated-SLERP-V5.json b/data/models/netcat420_MFANN-Llama3.1-Abliterated-SLERP-V5.json similarity index 100% rename from data/netcat420_MFANN-Llama3.1-Abliterated-SLERP-V5.json rename to data/models/netcat420_MFANN-Llama3.1-Abliterated-SLERP-V5.json diff --git a/data/netcat420_MFANN-Llama3.1-Abliterated-Slerp-TIES.json b/data/models/netcat420_MFANN-Llama3.1-Abliterated-Slerp-TIES.json similarity index 100% rename from data/netcat420_MFANN-Llama3.1-Abliterated-Slerp-TIES.json rename to data/models/netcat420_MFANN-Llama3.1-Abliterated-Slerp-TIES.json diff --git a/data/netcat420_MFANN-Llama3.1-Abliterated-Slerp-V3.2.json b/data/models/netcat420_MFANN-Llama3.1-Abliterated-Slerp-V3.2.json similarity index 100% rename from data/netcat420_MFANN-Llama3.1-Abliterated-Slerp-V3.2.json rename to data/models/netcat420_MFANN-Llama3.1-Abliterated-Slerp-V3.2.json diff --git a/data/netcat420_MFANN-SFT.json b/data/models/netcat420_MFANN-SFT.json similarity index 100% rename from data/netcat420_MFANN-SFT.json rename to data/models/netcat420_MFANN-SFT.json diff --git a/data/netcat420_MFANN-abliterated-phi2-merge-unretrained.json b/data/models/netcat420_MFANN-abliterated-phi2-merge-unretrained.json similarity index 100% rename from data/netcat420_MFANN-abliterated-phi2-merge-unretrained.json rename to data/models/netcat420_MFANN-abliterated-phi2-merge-unretrained.json diff --git a/data/netcat420_MFANN-llama3.1-Abliterated-SLERP.json b/data/models/netcat420_MFANN-llama3.1-Abliterated-SLERP.json similarity index 100% rename from data/netcat420_MFANN-llama3.1-Abliterated-SLERP.json rename to data/models/netcat420_MFANN-llama3.1-Abliterated-SLERP.json diff --git a/data/netcat420_MFANN-llama3.1-abliterated-SLERP-v3.1.json b/data/models/netcat420_MFANN-llama3.1-abliterated-SLERP-v3.1.json similarity index 100% rename from data/netcat420_MFANN-llama3.1-abliterated-SLERP-v3.1.json rename to data/models/netcat420_MFANN-llama3.1-abliterated-SLERP-v3.1.json diff --git a/data/netcat420_MFANN-llama3.1-abliterated-SLERP-v3.json b/data/models/netcat420_MFANN-llama3.1-abliterated-SLERP-v3.json similarity index 100% rename from data/netcat420_MFANN-llama3.1-abliterated-SLERP-v3.json rename to data/models/netcat420_MFANN-llama3.1-abliterated-SLERP-v3.json diff --git a/data/netcat420_MFANN-llama3.1-abliterated-v2.json b/data/models/netcat420_MFANN-llama3.1-abliterated-v2.json similarity index 100% rename from data/netcat420_MFANN-llama3.1-abliterated-v2.json rename to data/models/netcat420_MFANN-llama3.1-abliterated-v2.json diff --git a/data/netcat420_MFANN-phigments-slerp-V2.json b/data/models/netcat420_MFANN-phigments-slerp-V2.json similarity index 100% rename from data/netcat420_MFANN-phigments-slerp-V2.json rename to data/models/netcat420_MFANN-phigments-slerp-V2.json diff --git a/data/netcat420_MFANN-phigments-slerp-V3.2.json b/data/models/netcat420_MFANN-phigments-slerp-V3.2.json similarity index 100% rename from data/netcat420_MFANN-phigments-slerp-V3.2.json rename to data/models/netcat420_MFANN-phigments-slerp-V3.2.json diff --git a/data/netcat420_MFANN-phigments-slerp-V3.3.json b/data/models/netcat420_MFANN-phigments-slerp-V3.3.json similarity index 100% rename from data/netcat420_MFANN-phigments-slerp-V3.3.json rename to data/models/netcat420_MFANN-phigments-slerp-V3.3.json diff --git a/data/netcat420_MFANN3b.json b/data/models/netcat420_MFANN3b.json similarity index 100% rename from data/netcat420_MFANN3b.json rename to data/models/netcat420_MFANN3b.json diff --git a/data/netcat420_MFANN3bv0.15.json b/data/models/netcat420_MFANN3bv0.15.json similarity index 100% rename from data/netcat420_MFANN3bv0.15.json rename to data/models/netcat420_MFANN3bv0.15.json diff --git a/data/netcat420_MFANN3bv0.18.json b/data/models/netcat420_MFANN3bv0.18.json similarity index 100% rename from data/netcat420_MFANN3bv0.18.json rename to data/models/netcat420_MFANN3bv0.18.json diff --git a/data/netcat420_MFANN3bv0.19.json b/data/models/netcat420_MFANN3bv0.19.json similarity index 100% rename from data/netcat420_MFANN3bv0.19.json rename to data/models/netcat420_MFANN3bv0.19.json diff --git a/data/netcat420_MFANN3bv0.20.json b/data/models/netcat420_MFANN3bv0.20.json similarity index 100% rename from data/netcat420_MFANN3bv0.20.json rename to data/models/netcat420_MFANN3bv0.20.json diff --git a/data/netcat420_MFANN3bv0.21.json b/data/models/netcat420_MFANN3bv0.21.json similarity index 100% rename from data/netcat420_MFANN3bv0.21.json rename to data/models/netcat420_MFANN3bv0.21.json diff --git a/data/netcat420_MFANN3bv0.22.json b/data/models/netcat420_MFANN3bv0.22.json similarity index 100% rename from data/netcat420_MFANN3bv0.22.json rename to data/models/netcat420_MFANN3bv0.22.json diff --git a/data/netcat420_MFANN3bv0.23.json b/data/models/netcat420_MFANN3bv0.23.json similarity index 100% rename from data/netcat420_MFANN3bv0.23.json rename to data/models/netcat420_MFANN3bv0.23.json diff --git a/data/netcat420_MFANN3bv0.24.json b/data/models/netcat420_MFANN3bv0.24.json similarity index 100% rename from data/netcat420_MFANN3bv0.24.json rename to data/models/netcat420_MFANN3bv0.24.json diff --git a/data/netcat420_MFANN3bv1.1.json b/data/models/netcat420_MFANN3bv1.1.json similarity index 100% rename from data/netcat420_MFANN3bv1.1.json rename to data/models/netcat420_MFANN3bv1.1.json diff --git a/data/netcat420_MFANN3bv1.2.json b/data/models/netcat420_MFANN3bv1.2.json similarity index 100% rename from data/netcat420_MFANN3bv1.2.json rename to data/models/netcat420_MFANN3bv1.2.json diff --git a/data/netcat420_MFANN3bv1.3.json b/data/models/netcat420_MFANN3bv1.3.json similarity index 100% rename from data/netcat420_MFANN3bv1.3.json rename to data/models/netcat420_MFANN3bv1.3.json diff --git a/data/netcat420_MFANN3bv1.4.json b/data/models/netcat420_MFANN3bv1.4.json similarity index 100% rename from data/netcat420_MFANN3bv1.4.json rename to data/models/netcat420_MFANN3bv1.4.json diff --git a/data/netcat420_MFANNv0.19.json b/data/models/netcat420_MFANNv0.19.json similarity index 100% rename from data/netcat420_MFANNv0.19.json rename to data/models/netcat420_MFANNv0.19.json diff --git a/data/netcat420_MFANNv0.20.json b/data/models/netcat420_MFANNv0.20.json similarity index 100% rename from data/netcat420_MFANNv0.20.json rename to data/models/netcat420_MFANNv0.20.json diff --git a/data/netcat420_MFANNv0.21.json b/data/models/netcat420_MFANNv0.21.json similarity index 100% rename from data/netcat420_MFANNv0.21.json rename to data/models/netcat420_MFANNv0.21.json diff --git a/data/netcat420_MFANNv0.22.1.json b/data/models/netcat420_MFANNv0.22.1.json similarity index 100% rename from data/netcat420_MFANNv0.22.1.json rename to data/models/netcat420_MFANNv0.22.1.json diff --git a/data/netcat420_MFANNv0.23.json b/data/models/netcat420_MFANNv0.23.json similarity index 100% rename from data/netcat420_MFANNv0.23.json rename to data/models/netcat420_MFANNv0.23.json diff --git a/data/netcat420_MFANNv0.24.json b/data/models/netcat420_MFANNv0.24.json similarity index 100% rename from data/netcat420_MFANNv0.24.json rename to data/models/netcat420_MFANNv0.24.json diff --git a/data/netcat420_MFANNv0.25.json b/data/models/netcat420_MFANNv0.25.json similarity index 100% rename from data/netcat420_MFANNv0.25.json rename to data/models/netcat420_MFANNv0.25.json diff --git a/data/netcat420_Qwen2.5-7B-nerd-uncensored-v0.9-MFANN.json b/data/models/netcat420_Qwen2.5-7B-nerd-uncensored-v0.9-MFANN.json similarity index 100% rename from data/netcat420_Qwen2.5-7B-nerd-uncensored-v0.9-MFANN.json rename to data/models/netcat420_Qwen2.5-7B-nerd-uncensored-v0.9-MFANN.json diff --git a/data/netcat420_Qwen2.5-7b-MFANN-slerp.json b/data/models/netcat420_Qwen2.5-7b-MFANN-slerp.json similarity index 100% rename from data/netcat420_Qwen2.5-7b-MFANN-slerp.json rename to data/models/netcat420_Qwen2.5-7b-MFANN-slerp.json diff --git a/data/netcat420_Qwen2.5-7b-nerd-uncensored-MFANN-slerp.json b/data/models/netcat420_Qwen2.5-7b-nerd-uncensored-MFANN-slerp.json similarity index 100% rename from data/netcat420_Qwen2.5-7b-nerd-uncensored-MFANN-slerp.json rename to data/models/netcat420_Qwen2.5-7b-nerd-uncensored-MFANN-slerp.json diff --git a/data/netcat420_Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained.json b/data/models/netcat420_Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained.json similarity index 100% rename from data/netcat420_Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained.json rename to data/models/netcat420_Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained.json diff --git a/data/netcat420_Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN.json b/data/models/netcat420_Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN.json similarity index 100% rename from data/netcat420_Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN.json rename to data/models/netcat420_Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN.json diff --git a/data/netcat420_Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b.json b/data/models/netcat420_Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b.json similarity index 100% rename from data/netcat420_Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b.json rename to data/models/netcat420_Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b.json diff --git a/data/netcat420_Qwen2.5-MFANN-7b.json b/data/models/netcat420_Qwen2.5-MFANN-7b.json similarity index 100% rename from data/netcat420_Qwen2.5-MFANN-7b.json rename to data/models/netcat420_Qwen2.5-MFANN-7b.json diff --git a/data/netcat420_qwen2.5-MFANN-7b-SLERP-V1.2.json b/data/models/netcat420_qwen2.5-MFANN-7b-SLERP-V1.2.json similarity index 100% rename from data/netcat420_qwen2.5-MFANN-7b-SLERP-V1.2.json rename to data/models/netcat420_qwen2.5-MFANN-7b-SLERP-V1.2.json diff --git a/data/netcat420_qwen2.5-MFANN-7b-SLERPv1.1.json b/data/models/netcat420_qwen2.5-MFANN-7b-SLERPv1.1.json similarity index 100% rename from data/netcat420_qwen2.5-MFANN-7b-SLERPv1.1.json rename to data/models/netcat420_qwen2.5-MFANN-7b-SLERPv1.1.json diff --git a/data/netcat420_qwen2.5-MFANN-7b-v1.1.json b/data/models/netcat420_qwen2.5-MFANN-7b-v1.1.json similarity index 100% rename from data/netcat420_qwen2.5-MFANN-7b-v1.1.json rename to data/models/netcat420_qwen2.5-MFANN-7b-v1.1.json diff --git a/data/netease-youdao_Confucius-o1-14B.json b/data/models/netease-youdao_Confucius-o1-14B.json similarity index 100% rename from data/netease-youdao_Confucius-o1-14B.json rename to data/models/netease-youdao_Confucius-o1-14B.json diff --git a/data/newsbang_Homer-7B-v0.1.json b/data/models/newsbang_Homer-7B-v0.1.json similarity index 100% rename from data/newsbang_Homer-7B-v0.1.json rename to data/models/newsbang_Homer-7B-v0.1.json diff --git a/data/newsbang_Homer-7B-v0.2.json b/data/models/newsbang_Homer-7B-v0.2.json similarity index 100% rename from data/newsbang_Homer-7B-v0.2.json rename to data/models/newsbang_Homer-7B-v0.2.json diff --git a/data/newsbang_Homer-v0.3-Qwen2.5-7B.json b/data/models/newsbang_Homer-v0.3-Qwen2.5-7B.json similarity index 100% rename from data/newsbang_Homer-v0.3-Qwen2.5-7B.json rename to data/models/newsbang_Homer-v0.3-Qwen2.5-7B.json diff --git a/data/newsbang_Homer-v0.4-Qwen2.5-7B.json b/data/models/newsbang_Homer-v0.4-Qwen2.5-7B.json similarity index 100% rename from data/newsbang_Homer-v0.4-Qwen2.5-7B.json rename to data/models/newsbang_Homer-v0.4-Qwen2.5-7B.json diff --git a/data/newsbang_Homer-v0.5-Qwen2.5-7B.json b/data/models/newsbang_Homer-v0.5-Qwen2.5-7B.json similarity index 100% rename from data/newsbang_Homer-v0.5-Qwen2.5-7B.json rename to data/models/newsbang_Homer-v0.5-Qwen2.5-7B.json diff --git a/data/newsbang_Homer-v1.0-Qwen2.5-72B.json b/data/models/newsbang_Homer-v1.0-Qwen2.5-72B.json similarity index 100% rename from data/newsbang_Homer-v1.0-Qwen2.5-72B.json rename to data/models/newsbang_Homer-v1.0-Qwen2.5-72B.json diff --git a/data/newsbang_Homer-v1.0-Qwen2.5-7B.json b/data/models/newsbang_Homer-v1.0-Qwen2.5-7B.json similarity index 100% rename from data/newsbang_Homer-v1.0-Qwen2.5-7B.json rename to data/models/newsbang_Homer-v1.0-Qwen2.5-7B.json diff --git a/data/nguyentd_FinancialAdvice-Qwen2.5-7B.json b/data/models/nguyentd_FinancialAdvice-Qwen2.5-7B.json similarity index 100% rename from data/nguyentd_FinancialAdvice-Qwen2.5-7B.json rename to data/models/nguyentd_FinancialAdvice-Qwen2.5-7B.json diff --git a/data/ngxson_MiniThinky-1B-Llama-3.2.json b/data/models/ngxson_MiniThinky-1B-Llama-3.2.json similarity index 100% rename from data/ngxson_MiniThinky-1B-Llama-3.2.json rename to data/models/ngxson_MiniThinky-1B-Llama-3.2.json diff --git a/data/ngxson_MiniThinky-v2-1B-Llama-3.2.json b/data/models/ngxson_MiniThinky-v2-1B-Llama-3.2.json similarity index 100% rename from data/ngxson_MiniThinky-v2-1B-Llama-3.2.json rename to data/models/ngxson_MiniThinky-v2-1B-Llama-3.2.json diff --git a/data/nhyha_N3N_Delirium-v1_1030_0227.json b/data/models/nhyha_N3N_Delirium-v1_1030_0227.json similarity index 100% rename from data/nhyha_N3N_Delirium-v1_1030_0227.json rename to data/models/nhyha_N3N_Delirium-v1_1030_0227.json diff --git a/data/nhyha_N3N_Llama-3.1-8B-Instruct_1028_0216.json b/data/models/nhyha_N3N_Llama-3.1-8B-Instruct_1028_0216.json similarity index 100% rename from data/nhyha_N3N_Llama-3.1-8B-Instruct_1028_0216.json rename to data/models/nhyha_N3N_Llama-3.1-8B-Instruct_1028_0216.json diff --git a/data/nhyha_N3N_gemma-2-9b-it_20241029_1532.json b/data/models/nhyha_N3N_gemma-2-9b-it_20241029_1532.json similarity index 100% rename from data/nhyha_N3N_gemma-2-9b-it_20241029_1532.json rename to data/models/nhyha_N3N_gemma-2-9b-it_20241029_1532.json diff --git a/data/nhyha_N3N_gemma-2-9b-it_20241110_2026.json b/data/models/nhyha_N3N_gemma-2-9b-it_20241110_2026.json similarity index 100% rename from data/nhyha_N3N_gemma-2-9b-it_20241110_2026.json rename to data/models/nhyha_N3N_gemma-2-9b-it_20241110_2026.json diff --git a/data/nhyha_merge_Qwen2.5-7B-Instruct_20241023_0314.json b/data/models/nhyha_merge_Qwen2.5-7B-Instruct_20241023_0314.json similarity index 100% rename from data/nhyha_merge_Qwen2.5-7B-Instruct_20241023_0314.json rename to data/models/nhyha_merge_Qwen2.5-7B-Instruct_20241023_0314.json diff --git a/data/nicolinho_QRM-Gemma-2-27B.json b/data/models/nicolinho_QRM-Gemma-2-27B.json similarity index 100% rename from data/nicolinho_QRM-Gemma-2-27B.json rename to data/models/nicolinho_QRM-Gemma-2-27B.json index 1dea90f885df9d34139a9ef21e55b3dcce1a25fd..98185886d3c230ddcd90456c69a5aeed49795fc5 100644 --- a/data/nicolinho_QRM-Gemma-2-27B.json +++ b/data/models/nicolinho_QRM-Gemma-2-27B.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/nicolinho_QRM-Gemma-2-27B/1766412838.146816", + "evaluation_id": "reward-bench-2/nicolinho_QRM-Gemma-2-27B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,128 +31,104 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9444 + "score": 0.7667 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9665 + "score": 0.7853 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9013 + "score": 0.3719 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.927 + "score": 0.6995 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9826 + "score": 0.9578 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench-2/nicolinho_QRM-Gemma-2-27B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7667 + "score": 0.9535 }, "source_data": { "dataset_name": "RewardBench 2", @@ -161,111 +137,135 @@ } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7853 + "score": 0.8321 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench/nicolinho_QRM-Gemma-2-27B/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Precise IF", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3719 + "score": 0.9444 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6995 + "score": 0.9665 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9578 + "score": 0.9013 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9535 + "score": 0.927 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8321 + "score": 0.9826 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/nicolinho_QRM-Llama3-8B.json b/data/models/nicolinho_QRM-Llama3-8B.json similarity index 100% rename from data/nicolinho_QRM-Llama3-8B.json rename to data/models/nicolinho_QRM-Llama3-8B.json diff --git a/data/nicolinho_QRM-Llama3.1-8B-v2.json b/data/models/nicolinho_QRM-Llama3.1-8B-v2.json similarity index 100% rename from data/nicolinho_QRM-Llama3.1-8B-v2.json rename to data/models/nicolinho_QRM-Llama3.1-8B-v2.json diff --git a/data/nicolinho_QRM-Llama3.1-8B.json b/data/models/nicolinho_QRM-Llama3.1-8B.json similarity index 100% rename from data/nicolinho_QRM-Llama3.1-8B.json rename to data/models/nicolinho_QRM-Llama3.1-8B.json diff --git a/data/nidum_Nidum-Limitless-Gemma-2B.json b/data/models/nidum_Nidum-Limitless-Gemma-2B.json similarity index 100% rename from data/nidum_Nidum-Limitless-Gemma-2B.json rename to data/models/nidum_Nidum-Limitless-Gemma-2B.json diff --git a/data/nisten_franqwenstein-35b.json b/data/models/nisten_franqwenstein-35b.json similarity index 100% rename from data/nisten_franqwenstein-35b.json rename to data/models/nisten_franqwenstein-35b.json diff --git a/data/nisten_tqwendo-36b.json b/data/models/nisten_tqwendo-36b.json similarity index 100% rename from data/nisten_tqwendo-36b.json rename to data/models/nisten_tqwendo-36b.json diff --git a/data/nlpguy_Lion-Lamarck-v.1.0.8.json b/data/models/nlpguy_Lion-Lamarck-v.1.0.8.json similarity index 100% rename from data/nlpguy_Lion-Lamarck-v.1.0.8.json rename to data/models/nlpguy_Lion-Lamarck-v.1.0.8.json diff --git a/data/nlpguy_Lion-Lamarck-v.1.0.9.json b/data/models/nlpguy_Lion-Lamarck-v.1.0.9.json similarity index 100% rename from data/nlpguy_Lion-Lamarck-v.1.0.9.json rename to data/models/nlpguy_Lion-Lamarck-v.1.0.9.json diff --git a/data/nlpguy_Lion-Lamarck-v.1.1.0.json b/data/models/nlpguy_Lion-Lamarck-v.1.1.0.json similarity index 100% rename from data/nlpguy_Lion-Lamarck-v.1.1.0.json rename to data/models/nlpguy_Lion-Lamarck-v.1.1.0.json diff --git a/data/nlpguy_Miisce-one.json b/data/models/nlpguy_Miisce-one.json similarity index 100% rename from data/nlpguy_Miisce-one.json rename to data/models/nlpguy_Miisce-one.json diff --git a/data/nlpguy_Mistral-NeMo-Minitron-Upscale-v1.json b/data/models/nlpguy_Mistral-NeMo-Minitron-Upscale-v1.json similarity index 100% rename from data/nlpguy_Mistral-NeMo-Minitron-Upscale-v1.json rename to data/models/nlpguy_Mistral-NeMo-Minitron-Upscale-v1.json diff --git a/data/nlpguy_Mistral-NeMo-Minitron-Upscale-v2.json b/data/models/nlpguy_Mistral-NeMo-Minitron-Upscale-v2.json similarity index 100% rename from data/nlpguy_Mistral-NeMo-Minitron-Upscale-v2.json rename to data/models/nlpguy_Mistral-NeMo-Minitron-Upscale-v2.json diff --git a/data/nlpguy_Mistral-NeMo-Minitron-Upscale-v3.json b/data/models/nlpguy_Mistral-NeMo-Minitron-Upscale-v3.json similarity index 100% rename from data/nlpguy_Mistral-NeMo-Minitron-Upscale-v3.json rename to data/models/nlpguy_Mistral-NeMo-Minitron-Upscale-v3.json diff --git a/data/nlpguy_StableProse.json b/data/models/nlpguy_StableProse.json similarity index 100% rename from data/nlpguy_StableProse.json rename to data/models/nlpguy_StableProse.json diff --git a/data/nlpguy_StarFusion-alpha1.json b/data/models/nlpguy_StarFusion-alpha1.json similarity index 100% rename from data/nlpguy_StarFusion-alpha1.json rename to data/models/nlpguy_StarFusion-alpha1.json diff --git a/data/noname0202_Llama-3.2-4x3B-Instruct.json b/data/models/noname0202_Llama-3.2-4x3B-Instruct.json similarity index 100% rename from data/noname0202_Llama-3.2-4x3B-Instruct.json rename to data/models/noname0202_Llama-3.2-4x3B-Instruct.json diff --git a/data/noname0202_gemma-2-2b-it-ties.json b/data/models/noname0202_gemma-2-2b-it-ties.json similarity index 100% rename from data/noname0202_gemma-2-2b-it-ties.json rename to data/models/noname0202_gemma-2-2b-it-ties.json diff --git a/data/noname0202_gemma-2-9b-sft-jp-en-zh-v1.json b/data/models/noname0202_gemma-2-9b-sft-jp-en-zh-v1.json similarity index 100% rename from data/noname0202_gemma-2-9b-sft-jp-en-zh-v1.json rename to data/models/noname0202_gemma-2-9b-sft-jp-en-zh-v1.json diff --git a/data/noname0202_gemma-2-9b-sft-jp-en-zh-v2.json b/data/models/noname0202_gemma-2-9b-sft-jp-en-zh-v2.json similarity index 100% rename from data/noname0202_gemma-2-9b-sft-jp-en-zh-v2.json rename to data/models/noname0202_gemma-2-9b-sft-jp-en-zh-v2.json diff --git a/data/noname0202_llama-math-1b-r16-0to512tokens-test.json b/data/models/noname0202_llama-math-1b-r16-0to512tokens-test.json similarity index 100% rename from data/noname0202_llama-math-1b-r16-0to512tokens-test.json rename to data/models/noname0202_llama-math-1b-r16-0to512tokens-test.json diff --git a/data/noname0202_llama-math-1b-r32-0to512tokens-test.json b/data/models/noname0202_llama-math-1b-r32-0to512tokens-test.json similarity index 100% rename from data/noname0202_llama-math-1b-r32-0to512tokens-test.json rename to data/models/noname0202_llama-math-1b-r32-0to512tokens-test.json diff --git a/data/noname0202_llama-math-1b-r32-test.json b/data/models/noname0202_llama-math-1b-r32-test.json similarity index 100% rename from data/noname0202_llama-math-1b-r32-test.json rename to data/models/noname0202_llama-math-1b-r32-test.json diff --git a/data/noname0202_llama-math-1b-r8-512tokens-test.json b/data/models/noname0202_llama-math-1b-r8-512tokens-test.json similarity index 100% rename from data/noname0202_llama-math-1b-r8-512tokens-test.json rename to data/models/noname0202_llama-math-1b-r8-512tokens-test.json diff --git a/data/notbdq_Qwen2.5-14B-Instruct-1M-GRPO-Reasoning.json b/data/models/notbdq_Qwen2.5-14B-Instruct-1M-GRPO-Reasoning.json similarity index 100% rename from data/notbdq_Qwen2.5-14B-Instruct-1M-GRPO-Reasoning.json rename to data/models/notbdq_Qwen2.5-14B-Instruct-1M-GRPO-Reasoning.json diff --git a/data/nothingiisreal_L3.1-8B-Celeste-V1.5.json b/data/models/nothingiisreal_L3.1-8B-Celeste-V1.5.json similarity index 100% rename from data/nothingiisreal_L3.1-8B-Celeste-V1.5.json rename to data/models/nothingiisreal_L3.1-8B-Celeste-V1.5.json diff --git a/data/nothingiisreal_MN-12B-Starcannon-v2.json b/data/models/nothingiisreal_MN-12B-Starcannon-v2.json similarity index 100% rename from data/nothingiisreal_MN-12B-Starcannon-v2.json rename to data/models/nothingiisreal_MN-12B-Starcannon-v2.json diff --git a/data/nothingiisreal_MN-12B-Starcannon-v3.json b/data/models/nothingiisreal_MN-12B-Starcannon-v3.json similarity index 100% rename from data/nothingiisreal_MN-12B-Starcannon-v3.json rename to data/models/nothingiisreal_MN-12B-Starcannon-v3.json diff --git a/data/nvidia_AceInstruct-1.5B.json b/data/models/nvidia_AceInstruct-1.5B.json similarity index 100% rename from data/nvidia_AceInstruct-1.5B.json rename to data/models/nvidia_AceInstruct-1.5B.json diff --git a/data/nvidia_AceInstruct-72B.json b/data/models/nvidia_AceInstruct-72B.json similarity index 100% rename from data/nvidia_AceInstruct-72B.json rename to data/models/nvidia_AceInstruct-72B.json diff --git a/data/nvidia_AceInstruct-7B.json b/data/models/nvidia_AceInstruct-7B.json similarity index 100% rename from data/nvidia_AceInstruct-7B.json rename to data/models/nvidia_AceInstruct-7B.json diff --git a/data/nvidia_AceMath-1.5B-Instruct.json b/data/models/nvidia_AceMath-1.5B-Instruct.json similarity index 100% rename from data/nvidia_AceMath-1.5B-Instruct.json rename to data/models/nvidia_AceMath-1.5B-Instruct.json diff --git a/data/nvidia_AceMath-72B-Instruct.json b/data/models/nvidia_AceMath-72B-Instruct.json similarity index 100% rename from data/nvidia_AceMath-72B-Instruct.json rename to data/models/nvidia_AceMath-72B-Instruct.json diff --git a/data/nvidia_AceMath-72B-RM.json b/data/models/nvidia_AceMath-72B-RM.json similarity index 100% rename from data/nvidia_AceMath-72B-RM.json rename to data/models/nvidia_AceMath-72B-RM.json diff --git a/data/nvidia_AceMath-7B-Instruct.json b/data/models/nvidia_AceMath-7B-Instruct.json similarity index 100% rename from data/nvidia_AceMath-7B-Instruct.json rename to data/models/nvidia_AceMath-7B-Instruct.json diff --git a/data/nvidia_AceMath-7B-RM.json b/data/models/nvidia_AceMath-7B-RM.json similarity index 100% rename from data/nvidia_AceMath-7B-RM.json rename to data/models/nvidia_AceMath-7B-RM.json diff --git a/data/nvidia_Hymba-1.5B-Base.json b/data/models/nvidia_Hymba-1.5B-Base.json similarity index 100% rename from data/nvidia_Hymba-1.5B-Base.json rename to data/models/nvidia_Hymba-1.5B-Base.json diff --git a/data/nvidia_Hymba-1.5B-Instruct.json b/data/models/nvidia_Hymba-1.5B-Instruct.json similarity index 100% rename from data/nvidia_Hymba-1.5B-Instruct.json rename to data/models/nvidia_Hymba-1.5B-Instruct.json diff --git a/data/nvidia_Llama-3.1-Minitron-4B-Depth-Base.json b/data/models/nvidia_Llama-3.1-Minitron-4B-Depth-Base.json similarity index 100% rename from data/nvidia_Llama-3.1-Minitron-4B-Depth-Base.json rename to data/models/nvidia_Llama-3.1-Minitron-4B-Depth-Base.json diff --git a/data/nvidia_Llama-3.1-Nemotron-70B-Instruct-HF.json b/data/models/nvidia_Llama-3.1-Nemotron-70B-Instruct-HF.json similarity index 100% rename from data/nvidia_Llama-3.1-Nemotron-70B-Instruct-HF.json rename to data/models/nvidia_Llama-3.1-Nemotron-70B-Instruct-HF.json diff --git a/data/nvidia_Llama-3.1-Nemotron-70B-Reward.json b/data/models/nvidia_Llama-3.1-Nemotron-70B-Reward.json similarity index 100% rename from data/nvidia_Llama-3.1-Nemotron-70B-Reward.json rename to data/models/nvidia_Llama-3.1-Nemotron-70B-Reward.json diff --git a/data/nvidia_Llama3-70B-SteerLM-RM.json b/data/models/nvidia_Llama3-70B-SteerLM-RM.json similarity index 100% rename from data/nvidia_Llama3-70B-SteerLM-RM.json rename to data/models/nvidia_Llama3-70B-SteerLM-RM.json diff --git a/data/nvidia_Minitron-4B-Base.json b/data/models/nvidia_Minitron-4B-Base.json similarity index 100% rename from data/nvidia_Minitron-4B-Base.json rename to data/models/nvidia_Minitron-4B-Base.json diff --git a/data/nvidia_Minitron-8B-Base.json b/data/models/nvidia_Minitron-8B-Base.json similarity index 100% rename from data/nvidia_Minitron-8B-Base.json rename to data/models/nvidia_Minitron-8B-Base.json diff --git a/data/nvidia_Mistral-NeMo-Minitron-8B-Base.json b/data/models/nvidia_Mistral-NeMo-Minitron-8B-Base.json similarity index 100% rename from data/nvidia_Mistral-NeMo-Minitron-8B-Base.json rename to data/models/nvidia_Mistral-NeMo-Minitron-8B-Base.json diff --git a/data/nvidia_Mistral-NeMo-Minitron-8B-Instruct.json b/data/models/nvidia_Mistral-NeMo-Minitron-8B-Instruct.json similarity index 100% rename from data/nvidia_Mistral-NeMo-Minitron-8B-Instruct.json rename to data/models/nvidia_Mistral-NeMo-Minitron-8B-Instruct.json diff --git a/data/nvidia_Nemotron-4-340B-Reward.json b/data/models/nvidia_Nemotron-4-340B-Reward.json similarity index 100% rename from data/nvidia_Nemotron-4-340B-Reward.json rename to data/models/nvidia_Nemotron-4-340B-Reward.json diff --git a/data/nvidia_Nemotron-Mini-4B-Instruct.json b/data/models/nvidia_Nemotron-Mini-4B-Instruct.json similarity index 100% rename from data/nvidia_Nemotron-Mini-4B-Instruct.json rename to data/models/nvidia_Nemotron-Mini-4B-Instruct.json diff --git a/data/nvidia_OpenMath2-Llama3.1-8B.json b/data/models/nvidia_OpenMath2-Llama3.1-8B.json similarity index 100% rename from data/nvidia_OpenMath2-Llama3.1-8B.json rename to data/models/nvidia_OpenMath2-Llama3.1-8B.json diff --git a/data/nxmwxm_Beast-Soul-new.json b/data/models/nxmwxm_Beast-Soul-new.json similarity index 100% rename from data/nxmwxm_Beast-Soul-new.json rename to data/models/nxmwxm_Beast-Soul-new.json diff --git a/data/occiglot_occiglot-7b-es-en-instruct.json b/data/models/occiglot_occiglot-7b-es-en-instruct.json similarity index 100% rename from data/occiglot_occiglot-7b-es-en-instruct.json rename to data/models/occiglot_occiglot-7b-es-en-instruct.json diff --git a/data/odyssey-labs_Astral-1-10B.json b/data/models/odyssey-labs_Astral-1-10B.json similarity index 100% rename from data/odyssey-labs_Astral-1-10B.json rename to data/models/odyssey-labs_Astral-1-10B.json diff --git a/data/olabs-ai_reflection_model.json b/data/models/olabs-ai_reflection_model.json similarity index 100% rename from data/olabs-ai_reflection_model.json rename to data/models/olabs-ai_reflection_model.json diff --git a/data/ontocord_Llama_3.2_1b-autoredteam_helpfulness-train.json b/data/models/ontocord_Llama_3.2_1b-autoredteam_helpfulness-train.json similarity index 100% rename from data/ontocord_Llama_3.2_1b-autoredteam_helpfulness-train.json rename to data/models/ontocord_Llama_3.2_1b-autoredteam_helpfulness-train.json diff --git a/data/ontocord_RedPajama-3B-v1-AutoRedteam-Harmless-only.json b/data/models/ontocord_RedPajama-3B-v1-AutoRedteam-Harmless-only.json similarity index 100% rename from data/ontocord_RedPajama-3B-v1-AutoRedteam-Harmless-only.json rename to data/models/ontocord_RedPajama-3B-v1-AutoRedteam-Harmless-only.json diff --git a/data/ontocord_RedPajama-3B-v1-AutoRedteam.json b/data/models/ontocord_RedPajama-3B-v1-AutoRedteam.json similarity index 100% rename from data/ontocord_RedPajama-3B-v1-AutoRedteam.json rename to data/models/ontocord_RedPajama-3B-v1-AutoRedteam.json diff --git a/data/ontocord_RedPajama3b_v1-autoredteam_helpfulness-train.json b/data/models/ontocord_RedPajama3b_v1-autoredteam_helpfulness-train.json similarity index 100% rename from data/ontocord_RedPajama3b_v1-autoredteam_helpfulness-train.json rename to data/models/ontocord_RedPajama3b_v1-autoredteam_helpfulness-train.json diff --git a/data/ontocord_merged_0.2_expert_0.8-stack_2x.json b/data/models/ontocord_merged_0.2_expert_0.8-stack_2x.json similarity index 100% rename from data/ontocord_merged_0.2_expert_0.8-stack_2x.json rename to data/models/ontocord_merged_0.2_expert_0.8-stack_2x.json diff --git a/data/ontocord_merged_0.2_expert_0.8.json b/data/models/ontocord_merged_0.2_expert_0.8.json similarity index 100% rename from data/ontocord_merged_0.2_expert_0.8.json rename to data/models/ontocord_merged_0.2_expert_0.8.json diff --git a/data/ontocord_merged_0.5_expert_0.5.json b/data/models/ontocord_merged_0.5_expert_0.5.json similarity index 100% rename from data/ontocord_merged_0.5_expert_0.5.json rename to data/models/ontocord_merged_0.5_expert_0.5.json diff --git a/data/ontocord_ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful.json b/data/models/ontocord_ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful.json similarity index 100% rename from data/ontocord_ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful.json rename to data/models/ontocord_ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful.json diff --git a/data/ontocord_ontocord_wide_7b-stacked-stage1-instruct.json b/data/models/ontocord_ontocord_wide_7b-stacked-stage1-instruct.json similarity index 100% rename from data/ontocord_ontocord_wide_7b-stacked-stage1-instruct.json rename to data/models/ontocord_ontocord_wide_7b-stacked-stage1-instruct.json diff --git a/data/ontocord_ontocord_wide_7b-stacked-stage1.json b/data/models/ontocord_ontocord_wide_7b-stacked-stage1.json similarity index 100% rename from data/ontocord_ontocord_wide_7b-stacked-stage1.json rename to data/models/ontocord_ontocord_wide_7b-stacked-stage1.json diff --git a/data/ontocord_starcoder2-29b-ls.json b/data/models/ontocord_starcoder2-29b-ls.json similarity index 100% rename from data/ontocord_starcoder2-29b-ls.json rename to data/models/ontocord_starcoder2-29b-ls.json diff --git a/data/ontocord_starcoder2_3b-AutoRedteam.json b/data/models/ontocord_starcoder2_3b-AutoRedteam.json similarity index 100% rename from data/ontocord_starcoder2_3b-AutoRedteam.json rename to data/models/ontocord_starcoder2_3b-AutoRedteam.json diff --git a/data/ontocord_wide_3b-merge_test.json b/data/models/ontocord_wide_3b-merge_test.json similarity index 100% rename from data/ontocord_wide_3b-merge_test.json rename to data/models/ontocord_wide_3b-merge_test.json diff --git a/data/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained.json b/data/models/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained.json similarity index 100% rename from data/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained.json rename to data/models/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained.json diff --git a/data/ontocord_wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge.json b/data/models/ontocord_wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge.json similarity index 100% rename from data/ontocord_wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge.json rename to data/models/ontocord_wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge.json diff --git a/data/ontocord_wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge.json b/data/models/ontocord_wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge.json similarity index 100% rename from data/ontocord_wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge.json rename to data/models/ontocord_wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge.json diff --git a/data/ontocord_wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue.json b/data/models/ontocord_wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue.json rename to data/models/ontocord_wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue.json diff --git a/data/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue.json b/data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue.json rename to data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue.json diff --git a/data/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue.json b/data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue.json rename to data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue.json diff --git a/data/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue.json b/data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue.json rename to data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue.json diff --git a/data/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue.json b/data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue.json rename to data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue.json diff --git a/data/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue.json b/data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue.json rename to data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue.json diff --git a/data/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue.json b/data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue.json rename to data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue.json diff --git a/data/ontocord_wide_3b_sft_stage1.1-ss1-with_math.no_issue.json b/data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_math.no_issue.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.1-ss1-with_math.no_issue.json rename to data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_math.no_issue.json diff --git a/data/ontocord_wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue.json b/data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue.json rename to data/models/ontocord_wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue.json diff --git a/data/ontocord_wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical.json b/data/models/ontocord_wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical.json rename to data/models/ontocord_wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical.json diff --git a/data/ontocord_wide_3b_sft_stage1.2-ss1-expert_formatted_text.json b/data/models/ontocord_wide_3b_sft_stage1.2-ss1-expert_formatted_text.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.2-ss1-expert_formatted_text.json rename to data/models/ontocord_wide_3b_sft_stage1.2-ss1-expert_formatted_text.json diff --git a/data/ontocord_wide_3b_sft_stage1.2-ss1-expert_how-to.json b/data/models/ontocord_wide_3b_sft_stage1.2-ss1-expert_how-to.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.2-ss1-expert_how-to.json rename to data/models/ontocord_wide_3b_sft_stage1.2-ss1-expert_how-to.json diff --git a/data/ontocord_wide_3b_sft_stage1.2-ss1-expert_math.json b/data/models/ontocord_wide_3b_sft_stage1.2-ss1-expert_math.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.2-ss1-expert_math.json rename to data/models/ontocord_wide_3b_sft_stage1.2-ss1-expert_math.json diff --git a/data/ontocord_wide_3b_sft_stage1.2-ss1-expert_news.json b/data/models/ontocord_wide_3b_sft_stage1.2-ss1-expert_news.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.2-ss1-expert_news.json rename to data/models/ontocord_wide_3b_sft_stage1.2-ss1-expert_news.json diff --git a/data/ontocord_wide_3b_sft_stage1.2-ss1-expert_software.json b/data/models/ontocord_wide_3b_sft_stage1.2-ss1-expert_software.json similarity index 100% rename from data/ontocord_wide_3b_sft_stage1.2-ss1-expert_software.json rename to data/models/ontocord_wide_3b_sft_stage1.2-ss1-expert_software.json diff --git a/data/ontocord_wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked.json b/data/models/ontocord_wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked.json similarity index 100% rename from data/ontocord_wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked.json rename to data/models/ontocord_wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked.json diff --git a/data/oobabooga_CodeBooga-34B-v0.1.json b/data/models/oobabooga_CodeBooga-34B-v0.1.json similarity index 100% rename from data/oobabooga_CodeBooga-34B-v0.1.json rename to data/models/oobabooga_CodeBooga-34B-v0.1.json diff --git a/data/oopere_Llama-FinSent-S.json b/data/models/oopere_Llama-FinSent-S.json similarity index 99% rename from data/oopere_Llama-FinSent-S.json rename to data/models/oopere_Llama-FinSent-S.json index 547a7ef41580d39906f8398271f37052ca4bd621..4304bcd53168ed4fd51adf4213bcd9660265d629 100644 --- a/data/oopere_Llama-FinSent-S.json +++ b/data/models/oopere_Llama-FinSent-S.json @@ -5,7 +5,7 @@ "developer": "oopere", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": "0.914" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2164 + "score": 0.2119 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3169 + "score": 0.3156 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0128 + "score": 0.0181 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2584 + "score": 0.2567 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1134 + "score": 0.113 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2119 + "score": 0.2164 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3156 + "score": 0.3169 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0181 + "score": 0.0128 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2567 + "score": 0.2584 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.113 + "score": 0.1134 } } ], diff --git a/data/oopere_pruned10-llama-3.2-3B.json b/data/models/oopere_pruned10-llama-3.2-3B.json similarity index 100% rename from data/oopere_pruned10-llama-3.2-3B.json rename to data/models/oopere_pruned10-llama-3.2-3B.json diff --git a/data/oopere_pruned20-llama-1b.json b/data/models/oopere_pruned20-llama-1b.json similarity index 100% rename from data/oopere_pruned20-llama-1b.json rename to data/models/oopere_pruned20-llama-1b.json diff --git a/data/oopere_pruned20-llama-3.2-3b.json b/data/models/oopere_pruned20-llama-3.2-3b.json similarity index 100% rename from data/oopere_pruned20-llama-3.2-3b.json rename to data/models/oopere_pruned20-llama-3.2-3b.json diff --git a/data/oopere_pruned40-llama-1b.json b/data/models/oopere_pruned40-llama-1b.json similarity index 100% rename from data/oopere_pruned40-llama-1b.json rename to data/models/oopere_pruned40-llama-1b.json diff --git a/data/oopere_pruned40-llama-3.2-1B.json b/data/models/oopere_pruned40-llama-3.2-1B.json similarity index 100% rename from data/oopere_pruned40-llama-3.2-1B.json rename to data/models/oopere_pruned40-llama-3.2-1B.json diff --git a/data/oopere_pruned40-llama-3.2-3b.json b/data/models/oopere_pruned40-llama-3.2-3b.json similarity index 100% rename from data/oopere_pruned40-llama-3.2-3b.json rename to data/models/oopere_pruned40-llama-3.2-3b.json diff --git a/data/oopere_pruned60-llama-1b.json b/data/models/oopere_pruned60-llama-1b.json similarity index 100% rename from data/oopere_pruned60-llama-1b.json rename to data/models/oopere_pruned60-llama-1b.json diff --git a/data/oopere_pruned60-llama-3.2-3b.json b/data/models/oopere_pruned60-llama-3.2-3b.json similarity index 100% rename from data/oopere_pruned60-llama-3.2-3b.json rename to data/models/oopere_pruned60-llama-3.2-3b.json diff --git a/data/open-atlas_Atlas-Flash-1.5B-Preview.json b/data/models/open-atlas_Atlas-Flash-1.5B-Preview.json similarity index 100% rename from data/open-atlas_Atlas-Flash-1.5B-Preview.json rename to data/models/open-atlas_Atlas-Flash-1.5B-Preview.json diff --git a/data/open-atlas_Atlas-Flash-7B-Preview.json b/data/models/open-atlas_Atlas-Flash-7B-Preview.json similarity index 100% rename from data/open-atlas_Atlas-Flash-7B-Preview.json rename to data/models/open-atlas_Atlas-Flash-7B-Preview.json diff --git a/data/open-neo_Kyro-n1-3B.json b/data/models/open-neo_Kyro-n1-3B.json similarity index 100% rename from data/open-neo_Kyro-n1-3B.json rename to data/models/open-neo_Kyro-n1-3B.json diff --git a/data/open-neo_Kyro-n1-7B.json b/data/models/open-neo_Kyro-n1-7B.json similarity index 100% rename from data/open-neo_Kyro-n1-7B.json rename to data/models/open-neo_Kyro-n1-7B.json diff --git a/data/open-thoughts_OpenThinker-7B.json b/data/models/open-thoughts_OpenThinker-7B.json similarity index 100% rename from data/open-thoughts_OpenThinker-7B.json rename to data/models/open-thoughts_OpenThinker-7B.json diff --git a/data/openai-community_gpt2-large.json b/data/models/openai-community_gpt2-large.json similarity index 100% rename from data/openai-community_gpt2-large.json rename to data/models/openai-community_gpt2-large.json diff --git a/data/openai-community_gpt2-medium.json b/data/models/openai-community_gpt2-medium.json similarity index 100% rename from data/openai-community_gpt2-medium.json rename to data/models/openai-community_gpt2-medium.json diff --git a/data/openai-community_gpt2-xl.json b/data/models/openai-community_gpt2-xl.json similarity index 100% rename from data/openai-community_gpt2-xl.json rename to data/models/openai-community_gpt2-xl.json diff --git a/data/openai-community_gpt2.json b/data/models/openai-community_gpt2.json similarity index 100% rename from data/openai-community_gpt2.json rename to data/models/openai-community_gpt2.json diff --git a/data/openai_GPT-J-6B.json b/data/models/openai_GPT-J-6B.json similarity index 100% rename from data/openai_GPT-J-6B.json rename to data/models/openai_GPT-J-6B.json diff --git a/data/openai_GPT-NeoX-20B.json b/data/models/openai_GPT-NeoX-20B.json similarity index 100% rename from data/openai_GPT-NeoX-20B.json rename to data/models/openai_GPT-NeoX-20B.json diff --git a/data/openai_GPT_4o.json b/data/models/openai_GPT_4o.json similarity index 100% rename from data/openai_GPT_4o.json rename to data/models/openai_GPT_4o.json diff --git a/data/openai_GPT_5.1.json b/data/models/openai_GPT_5.1.json similarity index 100% rename from data/openai_GPT_5.1.json rename to data/models/openai_GPT_5.1.json diff --git a/data/openai_GPT_5.1_Codex.json b/data/models/openai_GPT_5.1_Codex.json similarity index 100% rename from data/openai_GPT_5.1_Codex.json rename to data/models/openai_GPT_5.1_Codex.json diff --git a/data/openai_GPT_5.2.json b/data/models/openai_GPT_5.2.json similarity index 100% rename from data/openai_GPT_5.2.json rename to data/models/openai_GPT_5.2.json index 5a2ac71802a9135f5f37d79c1079a3fa037b207c..83ffcb369f492b7dfcde2134248ccb2e5606ee2d 100644 --- a/data/openai_GPT_5.2.json +++ b/data/models/openai_GPT_5.2.json @@ -7,10 +7,10 @@ }, "evaluations": [ { - "evaluation_id": "apex-agents/openai_gpt-5.2/1773260200", + "evaluation_id": "ace/openai_gpt-5.2/1773260200", "retrieved_timestamp": "1773260200", "source_metadata": { - "source_name": "Mercor APEX-Agents Leaderboard", + "source_name": "Mercor ACE Leaderboard", "source_type": "evaluation_run", "source_organization_name": "Mercor", "source_organization_url": "https://www.mercor.com", @@ -20,24 +20,24 @@ "name": "archipelago", "version": "1.0.0" }, - "benchmark": "apex-agents", + "benchmark": "ace", "evaluation_results": [ { - "evaluation_name": "Overall Pass@1", + "evaluation_name": "Overall Score", "source_data": { - "dataset_name": "apex-agents", + "dataset_name": "ace", "source_type": "hf_dataset", - "hf_repo": "mercor/apex-agents" + "hf_repo": "Mercor/ACE" }, "metric_config": { - "evaluation_description": "Overall Pass@1 (dataset card / paper snapshot).", + "evaluation_description": "Overall ACE score across all consumer-task domains.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.23, + "score": 0.515, "uncertainty": { "confidence_interval": { "lower": -0.032, @@ -53,28 +53,21 @@ } }, { - "evaluation_name": "Overall Pass@8", + "evaluation_name": "Food Score", "source_data": { - "dataset_name": "apex-agents", + "dataset_name": "ace", "source_type": "hf_dataset", - "hf_repo": "mercor/apex-agents" + "hf_repo": "Mercor/ACE" }, "metric_config": { - "evaluation_description": "Overall Pass@8 (dataset card / paper snapshot).", + "evaluation_description": "Food domain score.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.4, - "uncertainty": { - "confidence_interval": { - "lower": -0.044, - "upper": 0.044, - "method": "bootstrap" - } - } + "score": 0.65 }, "generation_config": { "additional_details": { @@ -83,44 +76,75 @@ } }, { - "evaluation_name": "Overall Mean Score", + "evaluation_name": "Gaming Score", "source_data": { - "dataset_name": "apex-agents", + "dataset_name": "ace", "source_type": "hf_dataset", - "hf_repo": "mercor/apex-agents" + "hf_repo": "Mercor/ACE" }, "metric_config": { - "evaluation_description": "Overall mean rubric score.", + "evaluation_description": "Gaming domain score.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.387 + "score": 0.578 }, "generation_config": { "additional_details": { "run_setting": "High" } } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": { + "run_setting": "High" + } + } + }, + { + "evaluation_id": "apex-agents/openai_gpt-5.2/1773260200", + "retrieved_timestamp": "1773260200", + "source_metadata": { + "source_name": "Mercor APEX-Agents Leaderboard", + "source_type": "evaluation_run", + "source_organization_name": "Mercor", + "source_organization_url": "https://www.mercor.com", + "evaluator_relationship": "first_party" + }, + "eval_library": { + "name": "archipelago", + "version": "1.0.0" + }, + "benchmark": "apex-agents", + "evaluation_results": [ { - "evaluation_name": "Investment Banking Pass@1", + "evaluation_name": "Overall Pass@1", "source_data": { "dataset_name": "apex-agents", "source_type": "hf_dataset", "hf_repo": "mercor/apex-agents" }, "metric_config": { - "evaluation_description": "Investment banking world Pass@1.", + "evaluation_description": "Overall Pass@1 (dataset card / paper snapshot).", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.273 + "score": 0.23, + "uncertainty": { + "confidence_interval": { + "lower": -0.032, + "upper": 0.032, + "method": "bootstrap" + } + } }, "generation_config": { "additional_details": { @@ -129,21 +153,28 @@ } }, { - "evaluation_name": "Management Consulting Pass@1", + "evaluation_name": "Overall Pass@8", "source_data": { "dataset_name": "apex-agents", "source_type": "hf_dataset", "hf_repo": "mercor/apex-agents" }, "metric_config": { - "evaluation_description": "Management consulting world Pass@1.", + "evaluation_description": "Overall Pass@8 (dataset card / paper snapshot).", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.227 + "score": 0.4, + "uncertainty": { + "confidence_interval": { + "lower": -0.044, + "upper": 0.044, + "method": "bootstrap" + } + } }, "generation_config": { "additional_details": { @@ -152,21 +183,21 @@ } }, { - "evaluation_name": "Corporate Law Pass@1", + "evaluation_name": "Overall Mean Score", "source_data": { "dataset_name": "apex-agents", "source_type": "hf_dataset", "hf_repo": "mercor/apex-agents" }, "metric_config": { - "evaluation_description": "Corporate law world Pass@1.", + "evaluation_description": "Overall mean rubric score.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.189 + "score": 0.387 }, "generation_config": { "additional_details": { @@ -175,75 +206,44 @@ } }, { - "evaluation_name": "Corporate Lawyer Mean Score", + "evaluation_name": "Investment Banking Pass@1", "source_data": { "dataset_name": "apex-agents", "source_type": "hf_dataset", "hf_repo": "mercor/apex-agents" }, "metric_config": { - "evaluation_description": "Corporate lawyer world mean score.", + "evaluation_description": "Investment banking world Pass@1.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.443 + "score": 0.273 }, "generation_config": { "additional_details": { "run_setting": "High" } } - } - ], - "detailed_evaluation_results": null, - "generation_config": { - "additional_details": { - "run_setting": "High" - } - } - }, - { - "evaluation_id": "ace/openai_gpt-5.2/1773260200", - "retrieved_timestamp": "1773260200", - "source_metadata": { - "source_name": "Mercor ACE Leaderboard", - "source_type": "evaluation_run", - "source_organization_name": "Mercor", - "source_organization_url": "https://www.mercor.com", - "evaluator_relationship": "first_party" - }, - "eval_library": { - "name": "archipelago", - "version": "1.0.0" - }, - "benchmark": "ace", - "evaluation_results": [ + }, { - "evaluation_name": "Overall Score", + "evaluation_name": "Management Consulting Pass@1", "source_data": { - "dataset_name": "ace", + "dataset_name": "apex-agents", "source_type": "hf_dataset", - "hf_repo": "Mercor/ACE" + "hf_repo": "mercor/apex-agents" }, "metric_config": { - "evaluation_description": "Overall ACE score across all consumer-task domains.", + "evaluation_description": "Management consulting world Pass@1.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.515, - "uncertainty": { - "confidence_interval": { - "lower": -0.032, - "upper": 0.032, - "method": "bootstrap" - } - } + "score": 0.227 }, "generation_config": { "additional_details": { @@ -252,21 +252,21 @@ } }, { - "evaluation_name": "Food Score", + "evaluation_name": "Corporate Law Pass@1", "source_data": { - "dataset_name": "ace", + "dataset_name": "apex-agents", "source_type": "hf_dataset", - "hf_repo": "Mercor/ACE" + "hf_repo": "mercor/apex-agents" }, "metric_config": { - "evaluation_description": "Food domain score.", + "evaluation_description": "Corporate law world Pass@1.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.65 + "score": 0.189 }, "generation_config": { "additional_details": { @@ -275,21 +275,21 @@ } }, { - "evaluation_name": "Gaming Score", + "evaluation_name": "Corporate Lawyer Mean Score", "source_data": { - "dataset_name": "ace", + "dataset_name": "apex-agents", "source_type": "hf_dataset", - "hf_repo": "Mercor/ACE" + "hf_repo": "mercor/apex-agents" }, "metric_config": { - "evaluation_description": "Gaming domain score.", + "evaluation_description": "Corporate lawyer world mean score.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.578 + "score": 0.443 }, "generation_config": { "additional_details": { diff --git a/data/openai_GPT_5.2_Codex.json b/data/models/openai_GPT_5.2_Codex.json similarity index 100% rename from data/openai_GPT_5.2_Codex.json rename to data/models/openai_GPT_5.2_Codex.json diff --git a/data/openai_GPT_5.2_Pro.json b/data/models/openai_GPT_5.2_Pro.json similarity index 100% rename from data/openai_GPT_5.2_Pro.json rename to data/models/openai_GPT_5.2_Pro.json diff --git a/data/openai_GPT_5.3_Codex.json b/data/models/openai_GPT_5.3_Codex.json similarity index 100% rename from data/openai_GPT_5.3_Codex.json rename to data/models/openai_GPT_5.3_Codex.json diff --git a/data/openai_GPT_5.json b/data/models/openai_GPT_5.json similarity index 100% rename from data/openai_GPT_5.json rename to data/models/openai_GPT_5.json diff --git a/data/openai_GPT_5_Codex.json b/data/models/openai_GPT_5_Codex.json similarity index 100% rename from data/openai_GPT_5_Codex.json rename to data/models/openai_GPT_5_Codex.json diff --git a/data/openai_GPT_OSS_120B.json b/data/models/openai_GPT_OSS_120B.json similarity index 100% rename from data/openai_GPT_OSS_120B.json rename to data/models/openai_GPT_OSS_120B.json diff --git a/data/openai_ada-350M.json b/data/models/openai_ada-350M.json similarity index 100% rename from data/openai_ada-350M.json rename to data/models/openai_ada-350M.json diff --git a/data/openai_babbage-1.3B.json b/data/models/openai_babbage-1.3B.json similarity index 100% rename from data/openai_babbage-1.3B.json rename to data/models/openai_babbage-1.3B.json diff --git a/data/openai_curie-6.7B.json b/data/models/openai_curie-6.7B.json similarity index 100% rename from data/openai_curie-6.7B.json rename to data/models/openai_curie-6.7B.json diff --git a/data/openai_davinci-175B.json b/data/models/openai_davinci-175B.json similarity index 100% rename from data/openai_davinci-175B.json rename to data/models/openai_davinci-175B.json diff --git a/data/openai_gpt-3.5-turbo-0125.json b/data/models/openai_gpt-3.5-turbo-0125.json similarity index 100% rename from data/openai_gpt-3.5-turbo-0125.json rename to data/models/openai_gpt-3.5-turbo-0125.json diff --git a/data/openai_gpt-3.5-turbo-0301.json b/data/models/openai_gpt-3.5-turbo-0301.json similarity index 100% rename from data/openai_gpt-3.5-turbo-0301.json rename to data/models/openai_gpt-3.5-turbo-0301.json diff --git a/data/openai_gpt-3.5-turbo-0613.json b/data/models/openai_gpt-3.5-turbo-0613.json similarity index 100% rename from data/openai_gpt-3.5-turbo-0613.json rename to data/models/openai_gpt-3.5-turbo-0613.json diff --git a/data/openai_gpt-4-0125-preview.json b/data/models/openai_gpt-4-0125-preview.json similarity index 100% rename from data/openai_gpt-4-0125-preview.json rename to data/models/openai_gpt-4-0125-preview.json diff --git a/data/openai_gpt-4-0314.json b/data/models/openai_gpt-4-0314.json similarity index 100% rename from data/openai_gpt-4-0314.json rename to data/models/openai_gpt-4-0314.json diff --git a/data/openai_gpt-4-0613.json b/data/models/openai_gpt-4-0613.json similarity index 100% rename from data/openai_gpt-4-0613.json rename to data/models/openai_gpt-4-0613.json diff --git a/data/openai_gpt-4-1106-preview.json b/data/models/openai_gpt-4-1106-preview.json similarity index 100% rename from data/openai_gpt-4-1106-preview.json rename to data/models/openai_gpt-4-1106-preview.json diff --git a/data/openai_gpt-4-turbo-2024-04-09.json b/data/models/openai_gpt-4-turbo-2024-04-09.json similarity index 100% rename from data/openai_gpt-4-turbo-2024-04-09.json rename to data/models/openai_gpt-4-turbo-2024-04-09.json diff --git a/data/openai_gpt-4.1-2025-04-14.json b/data/models/openai_gpt-4.1-2025-04-14.json similarity index 100% rename from data/openai_gpt-4.1-2025-04-14.json rename to data/models/openai_gpt-4.1-2025-04-14.json diff --git a/data/openai_gpt-4.1-mini-2025-04-14.json b/data/models/openai_gpt-4.1-mini-2025-04-14.json similarity index 100% rename from data/openai_gpt-4.1-mini-2025-04-14.json rename to data/models/openai_gpt-4.1-mini-2025-04-14.json diff --git a/data/openai_gpt-4.1-nano-2025-04-14.json b/data/models/openai_gpt-4.1-nano-2025-04-14.json similarity index 100% rename from data/openai_gpt-4.1-nano-2025-04-14.json rename to data/models/openai_gpt-4.1-nano-2025-04-14.json diff --git a/data/openai_gpt-4.1.json b/data/models/openai_gpt-4.1.json similarity index 100% rename from data/openai_gpt-4.1.json rename to data/models/openai_gpt-4.1.json diff --git a/data/openai_gpt-4o-2024-05-13.json b/data/models/openai_gpt-4o-2024-05-13.json similarity index 100% rename from data/openai_gpt-4o-2024-05-13.json rename to data/models/openai_gpt-4o-2024-05-13.json diff --git a/data/openai_gpt-4o-2024-08-06.json b/data/models/openai_gpt-4o-2024-08-06.json similarity index 99% rename from data/openai_gpt-4o-2024-08-06.json rename to data/models/openai_gpt-4o-2024-08-06.json index 3ef11c2d4ebf98f03db374da877ec5e1c4d9ad92..aef7d6b9b1622e64c26adef8ed5d5e5e1e76f7f4 100644 --- a/data/openai_gpt-4o-2024-08-06.json +++ b/data/models/openai_gpt-4o-2024-08-06.json @@ -4,7 +4,7 @@ "id": "openai/gpt-4o-2024-08-06", "developer": "openai", "additional_details": { - "model_type": "Generative RM" + "model_type": "Generative" } }, "evaluations": [ @@ -1902,10 +1902,10 @@ } }, { - "evaluation_id": "reward-bench/openai_gpt-4o-2024-08-06/1766412838.146816", + "evaluation_id": "reward-bench-2/openai_gpt-4o-2024-08-06/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -1924,128 +1924,104 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8673 + "score": 0.6493 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9609 + "score": 0.5684 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.761 + "score": 0.3312 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8811 + "score": 0.623 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8661 + "score": 0.8619 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench-2/openai_gpt-4o-2024-08-06/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6493 + "score": 0.7293 }, "source_data": { "dataset_name": "RewardBench 2", @@ -2054,111 +2030,135 @@ } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5684 + "score": 0.7819 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench/openai_gpt-4o-2024-08-06/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Precise IF", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3312 + "score": 0.8673 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.623 + "score": 0.9609 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8619 + "score": 0.761 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7293 + "score": 0.8811 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7819 + "score": 0.8661 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/openai_gpt-4o-2024-11-20.json b/data/models/openai_gpt-4o-2024-11-20.json similarity index 100% rename from data/openai_gpt-4o-2024-11-20.json rename to data/models/openai_gpt-4o-2024-11-20.json diff --git a/data/openai_gpt-4o-mini-2024-07-18.json b/data/models/openai_gpt-4o-mini-2024-07-18.json similarity index 100% rename from data/openai_gpt-4o-mini-2024-07-18.json rename to data/models/openai_gpt-4o-mini-2024-07-18.json diff --git a/data/openai_gpt-5-2025-08-07.json b/data/models/openai_gpt-5-2025-08-07.json similarity index 100% rename from data/openai_gpt-5-2025-08-07.json rename to data/models/openai_gpt-5-2025-08-07.json index 724f752582fce96f498cf28c110d297d1f461c38..73a59bb6a004f262dd12f38906a946bc9718abbf 100644 --- a/data/openai_gpt-5-2025-08-07.json +++ b/data/models/openai_gpt-5-2025-08-07.json @@ -7,8 +7,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/openai_gpt-5-2025-08-07/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/openai_gpt-5-2025-08-07/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -522,8 +522,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/openai_gpt-5-2025-08-07/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/openai_gpt-5-2025-08-07/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/openai_gpt-5-codex.json b/data/models/openai_gpt-5-codex.json similarity index 98% rename from data/openai_gpt-5-codex.json rename to data/models/openai_gpt-5-codex.json index 7a7fa2e4d1463477e785ff37a367109a1873a362..7d8f6f31247cc1eb826ee9d21a2caaf35f07f0e2 100644 --- a/data/openai_gpt-5-codex.json +++ b/data/models/openai_gpt-5-codex.json @@ -4,13 +4,13 @@ "id": "openai/gpt-5-codex", "developer": "OpenAI", "additional_details": { - "agent_name": "Codex CLI", - "agent_organization": "OpenAI" + "agent_name": "Mini-SWE-Agent", + "agent_organization": "Princeton" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-5-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 41.3, + "score": 43.4, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 43.4, + "score": 44.3, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-5-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 44.3, + "score": 41.3, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/openai_gpt-5-mini-2025-08-07.json b/data/models/openai_gpt-5-mini-2025-08-07.json similarity index 100% rename from data/openai_gpt-5-mini-2025-08-07.json rename to data/models/openai_gpt-5-mini-2025-08-07.json diff --git a/data/openai_gpt-5-mini.json b/data/models/openai_gpt-5-mini.json similarity index 99% rename from data/openai_gpt-5-mini.json rename to data/models/openai_gpt-5-mini.json index f0e94373e60799f3f0d3ef9f31a403952c33f6bd..b1410fe13d2501fc3f89237d9e985bed25d655cf 100644 --- a/data/openai_gpt-5-mini.json +++ b/data/models/openai_gpt-5-mini.json @@ -4,13 +4,13 @@ "id": "openai/gpt-5-mini", "developer": "OpenAI", "additional_details": { - "agent_name": "Terminus 2", - "agent_organization": "Terminal Bench" + "agent_name": "Codex CLI", + "agent_organization": "OpenAI" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-5-mini/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__gpt-5-mini/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 22.2, + "score": 29.2, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5-mini/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-5-mini/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 31.9, + "score": 22.2, "uncertainty": { "standard_error": { - "value": 3.0 + "value": 2.6 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/openhands__gpt-5-mini/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5-mini/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 29.2, + "score": 24.0, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -306,7 +306,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5-mini/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5-mini/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -330,7 +330,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -339,17 +339,17 @@ "max_score": 100.0 }, "score_details": { - "score": 24.0, + "score": 31.9, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 3.0 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -366,7 +366,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/openai_gpt-5-nano-2025-08-07.json b/data/models/openai_gpt-5-nano-2025-08-07.json similarity index 100% rename from data/openai_gpt-5-nano-2025-08-07.json rename to data/models/openai_gpt-5-nano-2025-08-07.json diff --git a/data/openai_gpt-5-nano.json b/data/models/openai_gpt-5-nano.json similarity index 99% rename from data/openai_gpt-5-nano.json rename to data/models/openai_gpt-5-nano.json index 62d02982c756eb4551cf5f2644dd7bde125852d7..e4e8ae16cf51db0ba1405657d05afdf54cd350ed 100644 --- a/data/openai_gpt-5-nano.json +++ b/data/models/openai_gpt-5-nano.json @@ -4,13 +4,13 @@ "id": "openai/gpt-5-nano", "developer": "OpenAI", "additional_details": { - "agent_name": "Codex CLI", - "agent_organization": "OpenAI" + "agent_name": "Mini-SWE-Agent", + "agent_organization": "Princeton" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-5-nano/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__gpt-5-nano/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 7.0, + "score": 9.9, "uncertainty": { "standard_error": { - "value": 1.9 + "value": 2.1 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Nano\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Nano\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Nano\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Nano\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5-nano/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5-nano/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 7.9, + "score": 11.5, "uncertainty": { "standard_error": { - "value": 1.9 + "value": 2.3 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Nano\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Nano\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Nano\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Nano\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/openhands__gpt-5-nano/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5-nano/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 9.9, + "score": 7.9, "uncertainty": { "standard_error": { - "value": 2.1 + "value": 1.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Nano\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Nano\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Nano\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Nano\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5-nano/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-5-nano/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +265,17 @@ "max_score": 100.0 }, "score_details": { - "score": 11.5, + "score": 7.0, "uncertainty": { "standard_error": { - "value": 2.3 + "value": 1.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Nano\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Nano\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Nano\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Nano\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/openai_gpt-5.1-codex-max.json b/data/models/openai_gpt-5.1-codex-max.json similarity index 100% rename from data/openai_gpt-5.1-codex-max.json rename to data/models/openai_gpt-5.1-codex-max.json diff --git a/data/openai_gpt-5.1-codex-mini.json b/data/models/openai_gpt-5.1-codex-mini.json similarity index 100% rename from data/openai_gpt-5.1-codex-mini.json rename to data/models/openai_gpt-5.1-codex-mini.json diff --git a/data/openai_gpt-5.1-codex.json b/data/models/openai_gpt-5.1-codex.json similarity index 100% rename from data/openai_gpt-5.1-codex.json rename to data/models/openai_gpt-5.1-codex.json diff --git a/data/openai_gpt-5.1.json b/data/models/openai_gpt-5.1.json similarity index 100% rename from data/openai_gpt-5.1.json rename to data/models/openai_gpt-5.1.json diff --git a/data/openai_gpt-5.2-2025-12-11.json b/data/models/openai_gpt-5.2-2025-12-11.json similarity index 99% rename from data/openai_gpt-5.2-2025-12-11.json rename to data/models/openai_gpt-5.2-2025-12-11.json index d7196904a0e6018880378114998a60584ef43c14..0e67f7333253f410afc4237865ce72da2dc4610f 100644 --- a/data/openai_gpt-5.2-2025-12-11.json +++ b/data/models/openai_gpt-5.2-2025-12-11.json @@ -4,13 +4,13 @@ "id": "openai/gpt-5.2-2025-12-11", "developer": "OpenAI", "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } }, "evaluations": [ { - "evaluation_id": "appworld/test_normal/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "appworld/test_normal/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -42,23 +42,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.071, + "score": 0.0, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.55", - "total_run_cost": "55.03", - "average_steps": "51.59", - "percent_finished": "0.61" + "average_agent_cost": "0.0", + "total_run_cost": "0.0", + "average_steps": "0.0", + "percent_finished": "0.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -70,15 +70,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "appworld/test_normal/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "appworld/test_normal/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -125,8 +125,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -138,15 +138,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "appworld/test_normal/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "appworld/test_normal/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -178,23 +178,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0, + "score": 0.22, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.0", - "total_run_cost": "0.0", - "average_steps": "0.0", - "percent_finished": "0.0" + "average_agent_cost": "0.36", + "total_run_cost": "36.37", + "average_steps": "10.05", + "percent_finished": "1.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -206,15 +206,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "appworld/test_normal/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "appworld/test_normal/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -246,23 +246,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.22, + "score": 0.0, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.36", - "total_run_cost": "36.37", - "average_steps": "10.05", - "percent_finished": "1.0" + "average_agent_cost": "0.0", + "total_run_cost": "0.0", + "average_steps": "0.0", + "percent_finished": "0.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -274,15 +274,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "browsecompplus/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "appworld/test_normal/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -295,42 +295,42 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "browsecompplus", + "benchmark": "appworld_test_normal", "evaluation_results": [ { - "evaluation_name": "browsecompplus", + "evaluation_name": "appworld/test_normal", "source_data": { - "dataset_name": "browsecompplus", + "dataset_name": "appworld/test_normal", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "BrowseCompPlus benchmark evaluation", + "evaluation_description": "AppWorld benchmark evaluation (test_normal subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.46, + "score": 0.071, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.3", - "total_run_cost": "29.78", - "average_steps": "8.14", - "percent_finished": "0.99" + "average_agent_cost": "0.55", + "total_run_cost": "55.03", + "average_steps": "51.59", + "percent_finished": "0.61" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -342,15 +342,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "appworld/test_normal/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "browsecompplus/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -363,42 +363,42 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "appworld_test_normal", + "benchmark": "browsecompplus", "evaluation_results": [ { - "evaluation_name": "appworld/test_normal", + "evaluation_name": "browsecompplus", "source_data": { - "dataset_name": "appworld/test_normal", + "dataset_name": "browsecompplus", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "AppWorld benchmark evaluation (test_normal subset)", + "evaluation_description": "BrowseCompPlus benchmark evaluation", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0, + "score": 0.46, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.0", - "total_run_cost": "0.0", - "average_steps": "0.0", - "percent_finished": "0.0" + "average_agent_cost": "0.3", + "total_run_cost": "29.78", + "average_steps": "8.14", + "percent_finished": "0.99" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -410,15 +410,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "browsecompplus/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "browsecompplus/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -450,23 +450,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.46, + "score": 0.48, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.3", - "total_run_cost": "29.78", - "average_steps": "8.14", - "percent_finished": "0.99" + "average_agent_cost": "0.38", + "total_run_cost": "38.21", + "average_steps": "14.27", + "percent_finished": "1.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -478,15 +478,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "browsecompplus/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "browsecompplus/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -518,23 +518,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.26, + "score": 0.43, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.17", - "total_run_cost": "17.31", - "average_steps": "6.57", - "percent_finished": "0.99" + "average_agent_cost": "0.43", + "total_run_cost": "43.11", + "average_steps": "8.97", + "percent_finished": "1.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -546,15 +546,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "browsecompplus/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "browsecompplus/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -586,23 +586,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.43, + "score": 0.26, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.43", - "total_run_cost": "43.11", - "average_steps": "8.97", - "percent_finished": "1.0" + "average_agent_cost": "0.17", + "total_run_cost": "17.31", + "average_steps": "6.57", + "percent_finished": "0.99" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -614,15 +614,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "browsecompplus/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "browsecompplus/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -654,23 +654,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.48, + "score": 0.46, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.38", - "total_run_cost": "38.21", - "average_steps": "14.27", - "percent_finished": "1.0" + "average_agent_cost": "0.3", + "total_run_cost": "29.78", + "average_steps": "8.14", + "percent_finished": "0.99" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -682,8 +682,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -769,7 +769,7 @@ "generation_config": null }, { - "evaluation_id": "swe-bench/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "swe-bench/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -801,14 +801,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5455, + "score": 0.57, "uncertainty": { - "num_samples": 99 + "num_samples": 100 }, "details": { - "average_agent_cost": "0.26", - "total_run_cost": "25.64", - "average_steps": "20.44", + "average_agent_cost": "0.25", + "total_run_cost": "24.76", + "average_steps": "20.47", "percent_finished": "1.0" } }, @@ -816,8 +816,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -829,15 +829,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "swe-bench/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "swe-bench/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -869,14 +869,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.57, + "score": 0.58, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.25", - "total_run_cost": "24.76", - "average_steps": "20.47", + "average_agent_cost": "0.94", + "total_run_cost": "93.98", + "average_steps": "23.99", "percent_finished": "1.0" } }, @@ -884,8 +884,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -897,15 +897,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "swe-bench/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "swe-bench/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -937,14 +937,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5253, + "score": 0.57, "uncertainty": { - "num_samples": 99 + "num_samples": 100 }, "details": { - "average_agent_cost": "0.45", - "total_run_cost": "44.58", - "average_steps": "19.98", + "average_agent_cost": "0.25", + "total_run_cost": "24.76", + "average_steps": "20.47", "percent_finished": "1.0" } }, @@ -952,8 +952,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -965,15 +965,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "swe-bench/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "swe-bench/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1005,14 +1005,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.58, + "score": 0.5455, "uncertainty": { - "num_samples": 100 + "num_samples": 99 }, "details": { - "average_agent_cost": "0.94", - "total_run_cost": "93.98", - "average_steps": "23.99", + "average_agent_cost": "0.26", + "total_run_cost": "25.64", + "average_steps": "20.44", "percent_finished": "1.0" } }, @@ -1020,8 +1020,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -1033,15 +1033,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "swe-bench/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1054,33 +1054,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "swe-bench", + "benchmark": "tau-bench-2_airline", "evaluation_results": [ { - "evaluation_name": "swe-bench", + "evaluation_name": "tau-bench-2/airline", "source_data": { - "dataset_name": "swe-bench", + "dataset_name": "tau-bench-2/airline", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "SWE-bench benchmark evaluation", + "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.57, + "score": 0.54, "uncertainty": { - "num_samples": 100 + "num_samples": 50 }, "details": { - "average_agent_cost": "0.25", - "total_run_cost": "24.76", - "average_steps": "20.47", + "average_agent_cost": "0.13", + "total_run_cost": "6.96", + "average_steps": "11.22", "percent_finished": "1.0" } }, @@ -1088,8 +1088,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -1101,15 +1101,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "tau-bench-2/airline/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1141,14 +1141,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.54, + "score": 0.5, "uncertainty": { "num_samples": 50 }, "details": { - "average_agent_cost": "0.13", - "total_run_cost": "6.96", - "average_steps": "11.22", + "average_agent_cost": "0.11", + "total_run_cost": "5.77", + "average_steps": "11.4", "percent_finished": "1.0" } }, @@ -1156,8 +1156,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -1169,15 +1169,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "tau-bench-2/airline/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1209,14 +1209,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6, + "score": 0.48, "uncertainty": { "num_samples": 50 }, "details": { - "average_agent_cost": "0.29", - "total_run_cost": "15.28", - "average_steps": "10.68", + "average_agent_cost": "0.21", + "total_run_cost": "11.23", + "average_steps": "10.18", "percent_finished": "1.0" } }, @@ -1224,8 +1224,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -1237,15 +1237,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "tau-bench-2/retail/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1258,33 +1258,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_retail", + "benchmark": "tau-bench-2_airline", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/retail", + "evaluation_name": "tau-bench-2/airline", "source_data": { - "dataset_name": "tau-bench-2/retail", + "dataset_name": "tau-bench-2/airline", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)", + "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.73, + "score": 0.6, "uncertainty": { - "num_samples": 100 + "num_samples": 50 }, "details": { - "average_agent_cost": "0.11", - "total_run_cost": "12.27", - "average_steps": "10.33", + "average_agent_cost": "0.29", + "total_run_cost": "15.28", + "average_steps": "10.68", "percent_finished": "1.0" } }, @@ -1292,8 +1292,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1305,15 +1305,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "tau-bench-2/retail/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1326,33 +1326,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_retail", + "benchmark": "tau-bench-2_airline", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/retail", + "evaluation_name": "tau-bench-2/airline", "source_data": { - "dataset_name": "tau-bench-2/retail", + "dataset_name": "tau-bench-2/airline", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)", + "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.73, + "score": 0.54, "uncertainty": { - "num_samples": 100 + "num_samples": 50 }, "details": { - "average_agent_cost": "0.11", - "total_run_cost": "12.27", - "average_steps": "10.33", + "average_agent_cost": "0.13", + "total_run_cost": "6.96", + "average_steps": "11.22", "percent_finished": "1.0" } }, @@ -1360,8 +1360,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -1373,15 +1373,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "tau-bench-2/airline/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1394,33 +1394,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_airline", + "benchmark": "tau-bench-2_retail", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/airline", + "evaluation_name": "tau-bench-2/retail", "source_data": { - "dataset_name": "tau-bench-2/airline", + "dataset_name": "tau-bench-2/retail", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", + "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5, + "score": 0.68, "uncertainty": { - "num_samples": 50 + "num_samples": 100 }, "details": { - "average_agent_cost": "0.11", - "total_run_cost": "5.77", - "average_steps": "11.4", + "average_agent_cost": "0.25", + "total_run_cost": "26.27", + "average_steps": "11.08", "percent_finished": "1.0" } }, @@ -1428,8 +1428,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1441,15 +1441,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "tau-bench-2/retail/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1481,23 +1481,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5354, + "score": 0.51, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.11", - "total_run_cost": "11.54", - "average_steps": "9.55", - "percent_finished": "0.99" + "average_agent_cost": "0.12", + "total_run_cost": "12.63", + "average_steps": "9.92", + "percent_finished": "0.98" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -1509,15 +1509,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "tau-bench-2/retail/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1549,14 +1549,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.68, + "score": 0.73, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.25", - "total_run_cost": "26.27", - "average_steps": "11.08", + "average_agent_cost": "0.11", + "total_run_cost": "12.27", + "average_steps": "10.33", "percent_finished": "1.0" } }, @@ -1564,8 +1564,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -1577,15 +1577,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "swe-bench/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1598,33 +1598,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_airline", + "benchmark": "swe-bench", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/airline", + "evaluation_name": "swe-bench", "source_data": { - "dataset_name": "tau-bench-2/airline", + "dataset_name": "swe-bench", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", + "evaluation_description": "SWE-bench benchmark evaluation", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.54, + "score": 0.5253, "uncertainty": { - "num_samples": 50 + "num_samples": 99 }, "details": { - "average_agent_cost": "0.13", - "total_run_cost": "6.96", - "average_steps": "11.22", + "average_agent_cost": "0.45", + "total_run_cost": "44.58", + "average_steps": "19.98", "percent_finished": "1.0" } }, @@ -1632,8 +1632,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1645,15 +1645,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "tau-bench-2/airline/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1666,42 +1666,42 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_airline", + "benchmark": "tau-bench-2_telecom", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/airline", + "evaluation_name": "tau-bench-2/telecom", "source_data": { - "dataset_name": "tau-bench-2/airline", + "dataset_name": "tau-bench-2/telecom", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", + "evaluation_description": "Tau Bench 2 benchmark evaluation (telecom subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.48, + "score": 0.5354, "uncertainty": { - "num_samples": 50 + "num_samples": 100 }, "details": { - "average_agent_cost": "0.21", - "total_run_cost": "11.23", + "average_agent_cost": "0.14", + "total_run_cost": "19.92", "average_steps": "10.18", - "percent_finished": "1.0" + "percent_finished": "0.99" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -1713,15 +1713,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "tau-bench-2/retail/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1753,23 +1753,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.51, + "score": 0.5354, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.12", - "total_run_cost": "12.63", - "average_steps": "9.92", - "percent_finished": "0.98" + "average_agent_cost": "0.11", + "total_run_cost": "11.54", + "average_steps": "9.55", + "percent_finished": "0.99" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -1781,15 +1781,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1821,14 +1821,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.53, + "score": 0.55, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.15", - "total_run_cost": "18.88", - "average_steps": "9.92", + "average_agent_cost": "0.1", + "total_run_cost": "15.15", + "average_steps": "9.36", "percent_finished": "1.0" } }, @@ -1836,8 +1836,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -1849,15 +1849,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1889,23 +1889,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5354, + "score": 0.71, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.14", - "total_run_cost": "19.92", - "average_steps": "10.18", - "percent_finished": "0.99" + "average_agent_cost": "0.3", + "total_run_cost": "35.31", + "average_steps": "10.11", + "percent_finished": "1.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1917,15 +1917,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1957,23 +1957,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5354, + "score": 0.53, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.14", - "total_run_cost": "19.92", - "average_steps": "10.18", - "percent_finished": "0.99" + "average_agent_cost": "0.15", + "total_run_cost": "18.88", + "average_steps": "9.92", + "percent_finished": "1.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -1985,15 +1985,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2006,33 +2006,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_telecom", + "benchmark": "tau-bench-2_retail", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/telecom", + "evaluation_name": "tau-bench-2/retail", "source_data": { - "dataset_name": "tau-bench-2/telecom", + "dataset_name": "tau-bench-2/retail", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (telecom subset)", + "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.71, + "score": 0.73, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.3", - "total_run_cost": "35.31", - "average_steps": "10.11", + "average_agent_cost": "0.11", + "total_run_cost": "12.27", + "average_steps": "10.33", "percent_finished": "1.0" } }, @@ -2040,8 +2040,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -2053,15 +2053,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2093,23 +2093,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.55, + "score": 0.5354, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.1", - "total_run_cost": "15.15", - "average_steps": "9.36", - "percent_finished": "1.0" + "average_agent_cost": "0.14", + "total_run_cost": "19.92", + "average_steps": "10.18", + "percent_finished": "0.99" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -2121,8 +2121,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } diff --git a/data/openai_gpt-5.2-codex.json b/data/models/openai_gpt-5.2-codex.json similarity index 100% rename from data/openai_gpt-5.2-codex.json rename to data/models/openai_gpt-5.2-codex.json diff --git a/data/openai_gpt-5.2.json b/data/models/openai_gpt-5.2.json similarity index 100% rename from data/openai_gpt-5.2.json rename to data/models/openai_gpt-5.2.json index daede8fe351161d9885216fb068ff2a471a0b737..264a902cf7567e7522c491bb5946e45236c20f82 100644 --- a/data/openai_gpt-5.2.json +++ b/data/models/openai_gpt-5.2.json @@ -10,7 +10,7 @@ }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5.2/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5.2/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-18", + "evaluation_timestamp": "2025-12-12", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 62.9, + "score": 54.0, "uncertainty": { "standard_error": { - "value": 3.0 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5.2\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.2\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5.2\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.2\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5.2/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5.2/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-12", + "evaluation_timestamp": "2025-12-18", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 54.0, + "score": 62.9, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 3.0 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.2\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5.2\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.2\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5.2\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/openai_gpt-5.3-codex.json b/data/models/openai_gpt-5.3-codex.json similarity index 100% rename from data/openai_gpt-5.3-codex.json rename to data/models/openai_gpt-5.3-codex.json index 444a5fc7180518516b46001f221f93c5d16bcd79..c2e0e3c5c76650f4f61bd284a560a52ddba90f4a 100644 --- a/data/openai_gpt-5.3-codex.json +++ b/data/models/openai_gpt-5.3-codex.json @@ -10,7 +10,7 @@ }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/simple-codex__gpt-5.3-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/codebrain-1__gpt-5.3-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-06", + "evaluation_timestamp": "2026-02-10", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 75.1, + "score": 70.3, "uncertainty": { "standard_error": { - "value": 2.4 + "value": 2.6 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Simple Codex\" -m \"GPT-5.3-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"GPT-5.3-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Simple Codex\" -m \"GPT-5.3-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"GPT-5.3-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/codebrain-1__gpt-5.3-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/simple-codex__gpt-5.3-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-10", + "evaluation_timestamp": "2026-02-06", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +265,17 @@ "max_score": 100.0 }, "score_details": { - "score": 70.3, + "score": 75.1, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 2.4 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"GPT-5.3-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Simple Codex\" -m \"GPT-5.3-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"GPT-5.3-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Simple Codex\" -m \"GPT-5.3-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/openai_gpt-5.json b/data/models/openai_gpt-5.json similarity index 100% rename from data/openai_gpt-5.json rename to data/models/openai_gpt-5.json diff --git a/data/openai_gpt-oss-120b.json b/data/models/openai_gpt-oss-120b.json similarity index 100% rename from data/openai_gpt-oss-120b.json rename to data/models/openai_gpt-oss-120b.json diff --git a/data/openai_gpt-oss-20b.json b/data/models/openai_gpt-oss-20b.json similarity index 100% rename from data/openai_gpt-oss-20b.json rename to data/models/openai_gpt-oss-20b.json diff --git a/data/openai_o3-2025-04-16.json b/data/models/openai_o3-2025-04-16.json similarity index 100% rename from data/openai_o3-2025-04-16.json rename to data/models/openai_o3-2025-04-16.json diff --git a/data/openai_o3-mini-2025-01-31.json b/data/models/openai_o3-mini-2025-01-31.json similarity index 100% rename from data/openai_o3-mini-2025-01-31.json rename to data/models/openai_o3-mini-2025-01-31.json diff --git a/data/openai_o3.json b/data/models/openai_o3.json similarity index 100% rename from data/openai_o3.json rename to data/models/openai_o3.json diff --git a/data/openai_o3_Pro.json b/data/models/openai_o3_Pro.json similarity index 100% rename from data/openai_o3_Pro.json rename to data/models/openai_o3_Pro.json diff --git a/data/openai_o4-mini-2025-04-16.json b/data/models/openai_o4-mini-2025-04-16.json similarity index 99% rename from data/openai_o4-mini-2025-04-16.json rename to data/models/openai_o4-mini-2025-04-16.json index 89affac90b976d3ab841c13f750b966ab7f3be0b..6d7680cf32b045c46e6d407e1e671e786eaccdaf 100644 --- a/data/openai_o4-mini-2025-04-16.json +++ b/data/models/openai_o4-mini-2025-04-16.json @@ -1,9 +1,9 @@ { "model_info": { "name": "o4-mini-2025-04-16", - "developer": "OpenAI", - "inference_platform": "openai", - "id": "openai/o4-mini-2025-04-16" + "id": "openai/o4-mini-2025-04-16", + "developer": "openai", + "inference_platform": "openai" }, "evaluations": [ { @@ -746,13 +746,13 @@ } }, { - "evaluation_id": "livecodebenchpro/o4-mini-2025-04-16/1770683238.099205", - "retrieved_timestamp": "1770683238.099205", + "evaluation_id": "livecodebenchpro/o4-mini-2025-04-16/1760492095.8105888", + "retrieved_timestamp": "1760492095.8105888", "source_metadata": { + "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", + "evaluator_relationship": "third_party", "source_name": "Live Code Bench Pro", - "source_type": "documentation", - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party" + "source_type": "documentation" }, "eval_library": { "name": "unknown", @@ -762,62 +762,62 @@ "evaluation_results": [ { "evaluation_name": "Hard Problems", + "metric_config": { + "evaluation_description": "Pass@1 on Hard Problems", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0, + "max_score": 1 + }, + "score_details": { + "score": 0.014084507042253521 + }, "source_data": { "dataset_name": "Hard Problems", "source_type": "url", "url": [ "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" ] - }, + } + }, + { + "evaluation_name": "Medium Problems", "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", + "evaluation_description": "Pass@1 on Medium Problems", "lower_is_better": false, "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 + "min_score": 0, + "max_score": 1 }, "score_details": { - "score": 0.0143 - } - }, - { - "evaluation_name": "Medium Problems", + "score": 0.30985915492957744 + }, "source_data": { "dataset_name": "Medium Problems", "source_type": "url", "url": [ "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" ] - }, + } + }, + { + "evaluation_name": "Easy Problems", "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", + "evaluation_description": "Pass@1 on Easy Problems", "lower_is_better": false, "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 + "min_score": 0, + "max_score": 1 }, "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "Easy Problems", + "score": 0.8873239436619719 + }, "source_data": { "dataset_name": "Easy Problems", "source_type": "url", "url": [ "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8571 } } ], @@ -825,13 +825,13 @@ "generation_config": null }, { - "evaluation_id": "livecodebenchpro/o4-mini-2025-04-16/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", + "evaluation_id": "livecodebenchpro/o4-mini-2025-04-16/1770683238.099205", + "retrieved_timestamp": "1770683238.099205", "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", "source_name": "Live Code Bench Pro", - "source_type": "documentation" + "source_type": "documentation", + "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", + "evaluator_relationship": "third_party" }, "eval_library": { "name": "unknown", @@ -841,62 +841,62 @@ "evaluation_results": [ { "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.014084507042253521 - }, "source_data": { "dataset_name": "Hard Problems", "source_type": "url", "url": [ "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" ] - } - }, - { - "evaluation_name": "Medium Problems", + }, "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", + "evaluation_description": "Pass@1 on Hard Problems", "lower_is_better": false, "score_type": "continuous", - "min_score": 0, - "max_score": 1 + "min_score": 0.0, + "max_score": 1.0 }, "score_details": { - "score": 0.30985915492957744 - }, + "score": 0.0143 + } + }, + { + "evaluation_name": "Medium Problems", "source_data": { "dataset_name": "Medium Problems", "source_type": "url", "url": [ "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" ] - } - }, - { - "evaluation_name": "Easy Problems", + }, "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", + "evaluation_description": "Pass@1 on Medium Problems", "lower_is_better": false, "score_type": "continuous", - "min_score": 0, - "max_score": 1 + "min_score": 0.0, + "max_score": 1.0 }, "score_details": { - "score": 0.8873239436619719 - }, + "score": 0.2923 + } + }, + { + "evaluation_name": "Easy Problems", "source_data": { "dataset_name": "Easy Problems", "source_type": "url", "url": [ "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" ] + }, + "metric_config": { + "evaluation_description": "Pass@1 on Easy Problems", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.8571 } } ], diff --git a/data/openai_text-ada-001.json b/data/models/openai_text-ada-001.json similarity index 100% rename from data/openai_text-ada-001.json rename to data/models/openai_text-ada-001.json diff --git a/data/openai_text-babbage-001.json b/data/models/openai_text-babbage-001.json similarity index 100% rename from data/openai_text-babbage-001.json rename to data/models/openai_text-babbage-001.json diff --git a/data/openai_text-curie-001.json b/data/models/openai_text-curie-001.json similarity index 100% rename from data/openai_text-curie-001.json rename to data/models/openai_text-curie-001.json diff --git a/data/openai_text-davinci-002.json b/data/models/openai_text-davinci-002.json similarity index 100% rename from data/openai_text-davinci-002.json rename to data/models/openai_text-davinci-002.json diff --git a/data/openai_text-davinci-003.json b/data/models/openai_text-davinci-003.json similarity index 100% rename from data/openai_text-davinci-003.json rename to data/models/openai_text-davinci-003.json diff --git a/data/openbmb_Eurus-7b-kto.json b/data/models/openbmb_Eurus-7b-kto.json similarity index 100% rename from data/openbmb_Eurus-7b-kto.json rename to data/models/openbmb_Eurus-7b-kto.json diff --git a/data/openbmb_Eurus-RM-7b.json b/data/models/openbmb_Eurus-RM-7b.json similarity index 100% rename from data/openbmb_Eurus-RM-7b.json rename to data/models/openbmb_Eurus-RM-7b.json diff --git a/data/openbmb_MiniCPM-2B-dpo-fp32.json b/data/models/openbmb_MiniCPM-2B-dpo-fp32.json similarity index 100% rename from data/openbmb_MiniCPM-2B-dpo-fp32.json rename to data/models/openbmb_MiniCPM-2B-dpo-fp32.json diff --git a/data/openbmb_MiniCPM-S-1B-sft-llama-format.json b/data/models/openbmb_MiniCPM-S-1B-sft-llama-format.json similarity index 100% rename from data/openbmb_MiniCPM-S-1B-sft-llama-format.json rename to data/models/openbmb_MiniCPM-S-1B-sft-llama-format.json diff --git a/data/openbmb_UltraRM-13b.json b/data/models/openbmb_UltraRM-13b.json similarity index 100% rename from data/openbmb_UltraRM-13b.json rename to data/models/openbmb_UltraRM-13b.json index 84bdd483e26f91976f0250093c6ea14b4f0ff97c..c52a509adb327ccd9798d5a844c78601940ebb17 100644 --- a/data/openbmb_UltraRM-13b.json +++ b/data/models/openbmb_UltraRM-13b.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/openbmb_UltraRM-13b/1766412838.146816", + "evaluation_id": "reward-bench-2/openbmb_UltraRM-13b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6903 + "score": 0.4683 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9637 + "score": 0.5063 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5548 + "score": 0.3312 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5519 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5986 + "score": 0.5089 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6244 + "score": 0.6081 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7294 + "score": 0.3036 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/openbmb_UltraRM-13b/1766412838.146816", + "evaluation_id": "reward-bench/openbmb_UltraRM-13b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5063 + "score": 0.6903 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3312 + "score": 0.9637 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5519 + "score": 0.5548 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5089 + "score": 0.5986 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6081 + "score": 0.6244 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3036 + "score": 0.7294 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/openchat_openchat-3.5-0106.json b/data/models/openchat_openchat-3.5-0106.json similarity index 100% rename from data/openchat_openchat-3.5-0106.json rename to data/models/openchat_openchat-3.5-0106.json diff --git a/data/openchat_openchat-3.5-1210.json b/data/models/openchat_openchat-3.5-1210.json similarity index 100% rename from data/openchat_openchat-3.5-1210.json rename to data/models/openchat_openchat-3.5-1210.json diff --git a/data/openchat_openchat-3.6-8b-20240522.json b/data/models/openchat_openchat-3.6-8b-20240522.json similarity index 100% rename from data/openchat_openchat-3.6-8b-20240522.json rename to data/models/openchat_openchat-3.6-8b-20240522.json diff --git a/data/openchat_openchat_3.5.json b/data/models/openchat_openchat_3.5.json similarity index 100% rename from data/openchat_openchat_3.5.json rename to data/models/openchat_openchat_3.5.json diff --git a/data/openchat_openchat_v3.2.json b/data/models/openchat_openchat_v3.2.json similarity index 100% rename from data/openchat_openchat_v3.2.json rename to data/models/openchat_openchat_v3.2.json diff --git a/data/openchat_openchat_v3.2_super.json b/data/models/openchat_openchat_v3.2_super.json similarity index 100% rename from data/openchat_openchat_v3.2_super.json rename to data/models/openchat_openchat_v3.2_super.json diff --git a/data/opencompass_CompassJudger-1-1.5B-Instruct.json b/data/models/opencompass_CompassJudger-1-1.5B-Instruct.json similarity index 100% rename from data/opencompass_CompassJudger-1-1.5B-Instruct.json rename to data/models/opencompass_CompassJudger-1-1.5B-Instruct.json diff --git a/data/opencompass_CompassJudger-1-14B-Instruct.json b/data/models/opencompass_CompassJudger-1-14B-Instruct.json similarity index 100% rename from data/opencompass_CompassJudger-1-14B-Instruct.json rename to data/models/opencompass_CompassJudger-1-14B-Instruct.json diff --git a/data/opencompass_CompassJudger-1-32B-Instruct.json b/data/models/opencompass_CompassJudger-1-32B-Instruct.json similarity index 100% rename from data/opencompass_CompassJudger-1-32B-Instruct.json rename to data/models/opencompass_CompassJudger-1-32B-Instruct.json diff --git a/data/opencompass_CompassJudger-1-7B-Instruct.json b/data/models/opencompass_CompassJudger-1-7B-Instruct.json similarity index 100% rename from data/opencompass_CompassJudger-1-7B-Instruct.json rename to data/models/opencompass_CompassJudger-1-7B-Instruct.json diff --git a/data/orai-nlp_Llama-eus-8B.json b/data/models/orai-nlp_Llama-eus-8B.json similarity index 100% rename from data/orai-nlp_Llama-eus-8B.json rename to data/models/orai-nlp_Llama-eus-8B.json diff --git a/data/oxyapi_oxy-1-small.json b/data/models/oxyapi_oxy-1-small.json similarity index 100% rename from data/oxyapi_oxy-1-small.json rename to data/models/oxyapi_oxy-1-small.json diff --git a/data/ozone-ai_0x-lite.json b/data/models/ozone-ai_0x-lite.json similarity index 100% rename from data/ozone-ai_0x-lite.json rename to data/models/ozone-ai_0x-lite.json diff --git a/data/ozone-research_Chirp-01.json b/data/models/ozone-research_Chirp-01.json similarity index 100% rename from data/ozone-research_Chirp-01.json rename to data/models/ozone-research_Chirp-01.json diff --git a/data/paloalma_ECE-TW3-JRGL-V1.json b/data/models/paloalma_ECE-TW3-JRGL-V1.json similarity index 100% rename from data/paloalma_ECE-TW3-JRGL-V1.json rename to data/models/paloalma_ECE-TW3-JRGL-V1.json diff --git a/data/paloalma_ECE-TW3-JRGL-V2.json b/data/models/paloalma_ECE-TW3-JRGL-V2.json similarity index 100% rename from data/paloalma_ECE-TW3-JRGL-V2.json rename to data/models/paloalma_ECE-TW3-JRGL-V2.json diff --git a/data/paloalma_ECE-TW3-JRGL-V5.json b/data/models/paloalma_ECE-TW3-JRGL-V5.json similarity index 100% rename from data/paloalma_ECE-TW3-JRGL-V5.json rename to data/models/paloalma_ECE-TW3-JRGL-V5.json diff --git a/data/paloalma_Le_Triomphant-ECE-TW3.json b/data/models/paloalma_Le_Triomphant-ECE-TW3.json similarity index 100% rename from data/paloalma_Le_Triomphant-ECE-TW3.json rename to data/models/paloalma_Le_Triomphant-ECE-TW3.json diff --git a/data/paloalma_TW3-JRGL-v2.json b/data/models/paloalma_TW3-JRGL-v2.json similarity index 100% rename from data/paloalma_TW3-JRGL-v2.json rename to data/models/paloalma_TW3-JRGL-v2.json diff --git a/data/pankajmathur_Al_Dente_v1_8b.json b/data/models/pankajmathur_Al_Dente_v1_8b.json similarity index 100% rename from data/pankajmathur_Al_Dente_v1_8b.json rename to data/models/pankajmathur_Al_Dente_v1_8b.json diff --git a/data/pankajmathur_model_007_13b_v2.json b/data/models/pankajmathur_model_007_13b_v2.json similarity index 100% rename from data/pankajmathur_model_007_13b_v2.json rename to data/models/pankajmathur_model_007_13b_v2.json diff --git a/data/pankajmathur_orca_mini_3b.json b/data/models/pankajmathur_orca_mini_3b.json similarity index 100% rename from data/pankajmathur_orca_mini_3b.json rename to data/models/pankajmathur_orca_mini_3b.json diff --git a/data/pankajmathur_orca_mini_7b.json b/data/models/pankajmathur_orca_mini_7b.json similarity index 100% rename from data/pankajmathur_orca_mini_7b.json rename to data/models/pankajmathur_orca_mini_7b.json diff --git a/data/pankajmathur_orca_mini_phi-4.json b/data/models/pankajmathur_orca_mini_phi-4.json similarity index 100% rename from data/pankajmathur_orca_mini_phi-4.json rename to data/models/pankajmathur_orca_mini_phi-4.json diff --git a/data/pankajmathur_orca_mini_v2_7b.json b/data/models/pankajmathur_orca_mini_v2_7b.json similarity index 100% rename from data/pankajmathur_orca_mini_v2_7b.json rename to data/models/pankajmathur_orca_mini_v2_7b.json diff --git a/data/pankajmathur_orca_mini_v3_13b.json b/data/models/pankajmathur_orca_mini_v3_13b.json similarity index 100% rename from data/pankajmathur_orca_mini_v3_13b.json rename to data/models/pankajmathur_orca_mini_v3_13b.json diff --git a/data/pankajmathur_orca_mini_v3_70b.json b/data/models/pankajmathur_orca_mini_v3_70b.json similarity index 100% rename from data/pankajmathur_orca_mini_v3_70b.json rename to data/models/pankajmathur_orca_mini_v3_70b.json diff --git a/data/pankajmathur_orca_mini_v3_7b.json b/data/models/pankajmathur_orca_mini_v3_7b.json similarity index 100% rename from data/pankajmathur_orca_mini_v3_7b.json rename to data/models/pankajmathur_orca_mini_v3_7b.json diff --git a/data/pankajmathur_orca_mini_v5_8b.json b/data/models/pankajmathur_orca_mini_v5_8b.json similarity index 100% rename from data/pankajmathur_orca_mini_v5_8b.json rename to data/models/pankajmathur_orca_mini_v5_8b.json diff --git a/data/pankajmathur_orca_mini_v5_8b_dpo.json b/data/models/pankajmathur_orca_mini_v5_8b_dpo.json similarity index 100% rename from data/pankajmathur_orca_mini_v5_8b_dpo.json rename to data/models/pankajmathur_orca_mini_v5_8b_dpo.json diff --git a/data/pankajmathur_orca_mini_v5_8b_orpo.json b/data/models/pankajmathur_orca_mini_v5_8b_orpo.json similarity index 100% rename from data/pankajmathur_orca_mini_v5_8b_orpo.json rename to data/models/pankajmathur_orca_mini_v5_8b_orpo.json diff --git a/data/pankajmathur_orca_mini_v6_8b.json b/data/models/pankajmathur_orca_mini_v6_8b.json similarity index 100% rename from data/pankajmathur_orca_mini_v6_8b.json rename to data/models/pankajmathur_orca_mini_v6_8b.json diff --git a/data/pankajmathur_orca_mini_v6_8b_dpo.json b/data/models/pankajmathur_orca_mini_v6_8b_dpo.json similarity index 100% rename from data/pankajmathur_orca_mini_v6_8b_dpo.json rename to data/models/pankajmathur_orca_mini_v6_8b_dpo.json diff --git a/data/pankajmathur_orca_mini_v7_72b.json b/data/models/pankajmathur_orca_mini_v7_72b.json similarity index 100% rename from data/pankajmathur_orca_mini_v7_72b.json rename to data/models/pankajmathur_orca_mini_v7_72b.json diff --git a/data/pankajmathur_orca_mini_v7_7b.json b/data/models/pankajmathur_orca_mini_v7_7b.json similarity index 100% rename from data/pankajmathur_orca_mini_v7_7b.json rename to data/models/pankajmathur_orca_mini_v7_7b.json diff --git a/data/pankajmathur_orca_mini_v8_1_70b.json b/data/models/pankajmathur_orca_mini_v8_1_70b.json similarity index 100% rename from data/pankajmathur_orca_mini_v8_1_70b.json rename to data/models/pankajmathur_orca_mini_v8_1_70b.json diff --git a/data/pankajmathur_orca_mini_v9_0_3B-Instruct.json b/data/models/pankajmathur_orca_mini_v9_0_3B-Instruct.json similarity index 100% rename from data/pankajmathur_orca_mini_v9_0_3B-Instruct.json rename to data/models/pankajmathur_orca_mini_v9_0_3B-Instruct.json diff --git a/data/pankajmathur_orca_mini_v9_1_1B-Instruct.json b/data/models/pankajmathur_orca_mini_v9_1_1B-Instruct.json similarity index 100% rename from data/pankajmathur_orca_mini_v9_1_1B-Instruct.json rename to data/models/pankajmathur_orca_mini_v9_1_1B-Instruct.json diff --git a/data/pankajmathur_orca_mini_v9_2_14B.json b/data/models/pankajmathur_orca_mini_v9_2_14B.json similarity index 100% rename from data/pankajmathur_orca_mini_v9_2_14B.json rename to data/models/pankajmathur_orca_mini_v9_2_14B.json diff --git a/data/pankajmathur_orca_mini_v9_2_70b.json b/data/models/pankajmathur_orca_mini_v9_2_70b.json similarity index 100% rename from data/pankajmathur_orca_mini_v9_2_70b.json rename to data/models/pankajmathur_orca_mini_v9_2_70b.json diff --git a/data/pankajmathur_orca_mini_v9_4_70B.json b/data/models/pankajmathur_orca_mini_v9_4_70B.json similarity index 100% rename from data/pankajmathur_orca_mini_v9_4_70B.json rename to data/models/pankajmathur_orca_mini_v9_4_70B.json diff --git a/data/pankajmathur_orca_mini_v9_5_1B-Instruct.json b/data/models/pankajmathur_orca_mini_v9_5_1B-Instruct.json similarity index 100% rename from data/pankajmathur_orca_mini_v9_5_1B-Instruct.json rename to data/models/pankajmathur_orca_mini_v9_5_1B-Instruct.json diff --git a/data/pankajmathur_orca_mini_v9_5_1B-Instruct_preview.json b/data/models/pankajmathur_orca_mini_v9_5_1B-Instruct_preview.json similarity index 100% rename from data/pankajmathur_orca_mini_v9_5_1B-Instruct_preview.json rename to data/models/pankajmathur_orca_mini_v9_5_1B-Instruct_preview.json diff --git a/data/pankajmathur_orca_mini_v9_5_3B-Instruct.json b/data/models/pankajmathur_orca_mini_v9_5_3B-Instruct.json similarity index 100% rename from data/pankajmathur_orca_mini_v9_5_3B-Instruct.json rename to data/models/pankajmathur_orca_mini_v9_5_3B-Instruct.json diff --git a/data/pankajmathur_orca_mini_v9_6_1B-Instruct.json b/data/models/pankajmathur_orca_mini_v9_6_1B-Instruct.json similarity index 100% rename from data/pankajmathur_orca_mini_v9_6_1B-Instruct.json rename to data/models/pankajmathur_orca_mini_v9_6_1B-Instruct.json diff --git a/data/pankajmathur_orca_mini_v9_6_3B-Instruct.json b/data/models/pankajmathur_orca_mini_v9_6_3B-Instruct.json similarity index 100% rename from data/pankajmathur_orca_mini_v9_6_3B-Instruct.json rename to data/models/pankajmathur_orca_mini_v9_6_3B-Instruct.json diff --git a/data/pankajmathur_orca_mini_v9_7_1B-Instruct.json b/data/models/pankajmathur_orca_mini_v9_7_1B-Instruct.json similarity index 100% rename from data/pankajmathur_orca_mini_v9_7_1B-Instruct.json rename to data/models/pankajmathur_orca_mini_v9_7_1B-Instruct.json diff --git a/data/pankajmathur_orca_mini_v9_7_3B-Instruct.json b/data/models/pankajmathur_orca_mini_v9_7_3B-Instruct.json similarity index 100% rename from data/pankajmathur_orca_mini_v9_7_3B-Instruct.json rename to data/models/pankajmathur_orca_mini_v9_7_3B-Instruct.json diff --git a/data/paulml_ECE-ILAB-Q1.json b/data/models/paulml_ECE-ILAB-Q1.json similarity index 100% rename from data/paulml_ECE-ILAB-Q1.json rename to data/models/paulml_ECE-ILAB-Q1.json diff --git a/data/pints-ai_1.5-Pints-16K-v0.1.json b/data/models/pints-ai_1.5-Pints-16K-v0.1.json similarity index 100% rename from data/pints-ai_1.5-Pints-16K-v0.1.json rename to data/models/pints-ai_1.5-Pints-16K-v0.1.json diff --git a/data/pints-ai_1.5-Pints-2K-v0.1.json b/data/models/pints-ai_1.5-Pints-2K-v0.1.json similarity index 100% rename from data/pints-ai_1.5-Pints-2K-v0.1.json rename to data/models/pints-ai_1.5-Pints-2K-v0.1.json diff --git a/data/piotr25691_thea-3b-25r.json b/data/models/piotr25691_thea-3b-25r.json similarity index 100% rename from data/piotr25691_thea-3b-25r.json rename to data/models/piotr25691_thea-3b-25r.json diff --git a/data/piotr25691_thea-c-3b-25r.json b/data/models/piotr25691_thea-c-3b-25r.json similarity index 100% rename from data/piotr25691_thea-c-3b-25r.json rename to data/models/piotr25691_thea-c-3b-25r.json diff --git a/data/piotr25691_thea-rp-3b-25r.json b/data/models/piotr25691_thea-rp-3b-25r.json similarity index 100% rename from data/piotr25691_thea-rp-3b-25r.json rename to data/models/piotr25691_thea-rp-3b-25r.json diff --git a/data/postbot_gpt2-medium-emailgen.json b/data/models/postbot_gpt2-medium-emailgen.json similarity index 100% rename from data/postbot_gpt2-medium-emailgen.json rename to data/models/postbot_gpt2-medium-emailgen.json diff --git a/data/prince-canuma_Ministral-8B-Instruct-2410-HF.json b/data/models/prince-canuma_Ministral-8B-Instruct-2410-HF.json similarity index 100% rename from data/prince-canuma_Ministral-8B-Instruct-2410-HF.json rename to data/models/prince-canuma_Ministral-8B-Instruct-2410-HF.json diff --git a/data/princeton-nlp_Llama-3-8B-ProLong-512k-Base.json b/data/models/princeton-nlp_Llama-3-8B-ProLong-512k-Base.json similarity index 100% rename from data/princeton-nlp_Llama-3-8B-ProLong-512k-Base.json rename to data/models/princeton-nlp_Llama-3-8B-ProLong-512k-Base.json diff --git a/data/princeton-nlp_Llama-3-8B-ProLong-512k-Instruct.json b/data/models/princeton-nlp_Llama-3-8B-ProLong-512k-Instruct.json similarity index 100% rename from data/princeton-nlp_Llama-3-8B-ProLong-512k-Instruct.json rename to data/models/princeton-nlp_Llama-3-8B-ProLong-512k-Instruct.json diff --git a/data/princeton-nlp_Llama-3-8B-ProLong-64k-Base.json b/data/models/princeton-nlp_Llama-3-8B-ProLong-64k-Base.json similarity index 100% rename from data/princeton-nlp_Llama-3-8B-ProLong-64k-Base.json rename to data/models/princeton-nlp_Llama-3-8B-ProLong-64k-Base.json diff --git a/data/princeton-nlp_Llama-3-8B-ProLong-64k-Instruct.json b/data/models/princeton-nlp_Llama-3-8B-ProLong-64k-Instruct.json similarity index 100% rename from data/princeton-nlp_Llama-3-8B-ProLong-64k-Instruct.json rename to data/models/princeton-nlp_Llama-3-8B-ProLong-64k-Instruct.json diff --git a/data/princeton-nlp_Llama-3-Base-8B-SFT-CPO.json b/data/models/princeton-nlp_Llama-3-Base-8B-SFT-CPO.json similarity index 100% rename from data/princeton-nlp_Llama-3-Base-8B-SFT-CPO.json rename to data/models/princeton-nlp_Llama-3-Base-8B-SFT-CPO.json diff --git a/data/princeton-nlp_Llama-3-Base-8B-SFT-DPO.json b/data/models/princeton-nlp_Llama-3-Base-8B-SFT-DPO.json similarity index 100% rename from data/princeton-nlp_Llama-3-Base-8B-SFT-DPO.json rename to data/models/princeton-nlp_Llama-3-Base-8B-SFT-DPO.json diff --git a/data/princeton-nlp_Llama-3-Base-8B-SFT-IPO.json b/data/models/princeton-nlp_Llama-3-Base-8B-SFT-IPO.json similarity index 100% rename from data/princeton-nlp_Llama-3-Base-8B-SFT-IPO.json rename to data/models/princeton-nlp_Llama-3-Base-8B-SFT-IPO.json diff --git a/data/princeton-nlp_Llama-3-Base-8B-SFT-KTO.json b/data/models/princeton-nlp_Llama-3-Base-8B-SFT-KTO.json similarity index 100% rename from data/princeton-nlp_Llama-3-Base-8B-SFT-KTO.json rename to data/models/princeton-nlp_Llama-3-Base-8B-SFT-KTO.json diff --git a/data/princeton-nlp_Llama-3-Base-8B-SFT-ORPO.json b/data/models/princeton-nlp_Llama-3-Base-8B-SFT-ORPO.json similarity index 100% rename from data/princeton-nlp_Llama-3-Base-8B-SFT-ORPO.json rename to data/models/princeton-nlp_Llama-3-Base-8B-SFT-ORPO.json diff --git a/data/princeton-nlp_Llama-3-Base-8B-SFT-RDPO.json b/data/models/princeton-nlp_Llama-3-Base-8B-SFT-RDPO.json similarity index 100% rename from data/princeton-nlp_Llama-3-Base-8B-SFT-RDPO.json rename to data/models/princeton-nlp_Llama-3-Base-8B-SFT-RDPO.json diff --git a/data/princeton-nlp_Llama-3-Base-8B-SFT-RRHF.json b/data/models/princeton-nlp_Llama-3-Base-8B-SFT-RRHF.json similarity index 100% rename from data/princeton-nlp_Llama-3-Base-8B-SFT-RRHF.json rename to data/models/princeton-nlp_Llama-3-Base-8B-SFT-RRHF.json diff --git a/data/princeton-nlp_Llama-3-Base-8B-SFT-SLiC-HF.json b/data/models/princeton-nlp_Llama-3-Base-8B-SFT-SLiC-HF.json similarity index 100% rename from data/princeton-nlp_Llama-3-Base-8B-SFT-SLiC-HF.json rename to data/models/princeton-nlp_Llama-3-Base-8B-SFT-SLiC-HF.json diff --git a/data/princeton-nlp_Llama-3-Base-8B-SFT-SimPO.json b/data/models/princeton-nlp_Llama-3-Base-8B-SFT-SimPO.json similarity index 100% rename from data/princeton-nlp_Llama-3-Base-8B-SFT-SimPO.json rename to data/models/princeton-nlp_Llama-3-Base-8B-SFT-SimPO.json diff --git a/data/princeton-nlp_Llama-3-Base-8B-SFT.json b/data/models/princeton-nlp_Llama-3-Base-8B-SFT.json similarity index 100% rename from data/princeton-nlp_Llama-3-Base-8B-SFT.json rename to data/models/princeton-nlp_Llama-3-Base-8B-SFT.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-CPO-v0.2.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-CPO-v0.2.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-CPO-v0.2.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-CPO-v0.2.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-CPO.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-CPO.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-CPO.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-CPO.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-DPO-v0.2.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-DPO-v0.2.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-DPO-v0.2.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-DPO-v0.2.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-DPO.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-DPO.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-DPO.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-DPO.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-KTO-v0.2.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-KTO-v0.2.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-KTO-v0.2.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-KTO-v0.2.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-KTO.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-KTO.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-KTO.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-KTO.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-ORPO-v0.2.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-ORPO-v0.2.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-ORPO-v0.2.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-ORPO-v0.2.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-ORPO.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-ORPO.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-ORPO.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-ORPO.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-RDPO-v0.2.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-RDPO-v0.2.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-RDPO-v0.2.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-RDPO-v0.2.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-RDPO.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-RDPO.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-RDPO.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-RDPO.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-RRHF-v0.2.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-RRHF-v0.2.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-RRHF-v0.2.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-RRHF-v0.2.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-RRHF.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-RRHF.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-RRHF.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-RRHF.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-SLiC-HF-v0.2.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-SLiC-HF-v0.2.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-SLiC-HF-v0.2.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-SLiC-HF-v0.2.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-SLiC-HF.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-SLiC-HF.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-SLiC-HF.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-SLiC-HF.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-SimPO-v0.2.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-SimPO-v0.2.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-SimPO-v0.2.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-SimPO-v0.2.json diff --git a/data/princeton-nlp_Llama-3-Instruct-8B-SimPO.json b/data/models/princeton-nlp_Llama-3-Instruct-8B-SimPO.json similarity index 100% rename from data/princeton-nlp_Llama-3-Instruct-8B-SimPO.json rename to data/models/princeton-nlp_Llama-3-Instruct-8B-SimPO.json diff --git a/data/princeton-nlp_Mistral-7B-Base-SFT-CPO.json b/data/models/princeton-nlp_Mistral-7B-Base-SFT-CPO.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Base-SFT-CPO.json rename to data/models/princeton-nlp_Mistral-7B-Base-SFT-CPO.json diff --git a/data/princeton-nlp_Mistral-7B-Base-SFT-DPO.json b/data/models/princeton-nlp_Mistral-7B-Base-SFT-DPO.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Base-SFT-DPO.json rename to data/models/princeton-nlp_Mistral-7B-Base-SFT-DPO.json diff --git a/data/princeton-nlp_Mistral-7B-Base-SFT-IPO.json b/data/models/princeton-nlp_Mistral-7B-Base-SFT-IPO.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Base-SFT-IPO.json rename to data/models/princeton-nlp_Mistral-7B-Base-SFT-IPO.json diff --git a/data/princeton-nlp_Mistral-7B-Base-SFT-KTO.json b/data/models/princeton-nlp_Mistral-7B-Base-SFT-KTO.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Base-SFT-KTO.json rename to data/models/princeton-nlp_Mistral-7B-Base-SFT-KTO.json diff --git a/data/princeton-nlp_Mistral-7B-Base-SFT-RDPO.json b/data/models/princeton-nlp_Mistral-7B-Base-SFT-RDPO.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Base-SFT-RDPO.json rename to data/models/princeton-nlp_Mistral-7B-Base-SFT-RDPO.json diff --git a/data/princeton-nlp_Mistral-7B-Base-SFT-RRHF.json b/data/models/princeton-nlp_Mistral-7B-Base-SFT-RRHF.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Base-SFT-RRHF.json rename to data/models/princeton-nlp_Mistral-7B-Base-SFT-RRHF.json diff --git a/data/princeton-nlp_Mistral-7B-Base-SFT-SLiC-HF.json b/data/models/princeton-nlp_Mistral-7B-Base-SFT-SLiC-HF.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Base-SFT-SLiC-HF.json rename to data/models/princeton-nlp_Mistral-7B-Base-SFT-SLiC-HF.json diff --git a/data/princeton-nlp_Mistral-7B-Base-SFT-SimPO.json b/data/models/princeton-nlp_Mistral-7B-Base-SFT-SimPO.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Base-SFT-SimPO.json rename to data/models/princeton-nlp_Mistral-7B-Base-SFT-SimPO.json diff --git a/data/princeton-nlp_Mistral-7B-Instruct-CPO.json b/data/models/princeton-nlp_Mistral-7B-Instruct-CPO.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Instruct-CPO.json rename to data/models/princeton-nlp_Mistral-7B-Instruct-CPO.json diff --git a/data/princeton-nlp_Mistral-7B-Instruct-DPO.json b/data/models/princeton-nlp_Mistral-7B-Instruct-DPO.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Instruct-DPO.json rename to data/models/princeton-nlp_Mistral-7B-Instruct-DPO.json diff --git a/data/princeton-nlp_Mistral-7B-Instruct-IPO.json b/data/models/princeton-nlp_Mistral-7B-Instruct-IPO.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Instruct-IPO.json rename to data/models/princeton-nlp_Mistral-7B-Instruct-IPO.json diff --git a/data/princeton-nlp_Mistral-7B-Instruct-KTO.json b/data/models/princeton-nlp_Mistral-7B-Instruct-KTO.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Instruct-KTO.json rename to data/models/princeton-nlp_Mistral-7B-Instruct-KTO.json diff --git a/data/princeton-nlp_Mistral-7B-Instruct-ORPO.json b/data/models/princeton-nlp_Mistral-7B-Instruct-ORPO.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Instruct-ORPO.json rename to data/models/princeton-nlp_Mistral-7B-Instruct-ORPO.json diff --git a/data/princeton-nlp_Mistral-7B-Instruct-RDPO.json b/data/models/princeton-nlp_Mistral-7B-Instruct-RDPO.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Instruct-RDPO.json rename to data/models/princeton-nlp_Mistral-7B-Instruct-RDPO.json diff --git a/data/princeton-nlp_Mistral-7B-Instruct-RRHF.json b/data/models/princeton-nlp_Mistral-7B-Instruct-RRHF.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Instruct-RRHF.json rename to data/models/princeton-nlp_Mistral-7B-Instruct-RRHF.json diff --git a/data/princeton-nlp_Mistral-7B-Instruct-SLiC-HF.json b/data/models/princeton-nlp_Mistral-7B-Instruct-SLiC-HF.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Instruct-SLiC-HF.json rename to data/models/princeton-nlp_Mistral-7B-Instruct-SLiC-HF.json diff --git a/data/princeton-nlp_Mistral-7B-Instruct-SimPO.json b/data/models/princeton-nlp_Mistral-7B-Instruct-SimPO.json similarity index 100% rename from data/princeton-nlp_Mistral-7B-Instruct-SimPO.json rename to data/models/princeton-nlp_Mistral-7B-Instruct-SimPO.json diff --git a/data/princeton-nlp_Sheared-LLaMA-1.3B.json b/data/models/princeton-nlp_Sheared-LLaMA-1.3B.json similarity index 100% rename from data/princeton-nlp_Sheared-LLaMA-1.3B.json rename to data/models/princeton-nlp_Sheared-LLaMA-1.3B.json diff --git a/data/princeton-nlp_Sheared-LLaMA-2.7B.json b/data/models/princeton-nlp_Sheared-LLaMA-2.7B.json similarity index 100% rename from data/princeton-nlp_Sheared-LLaMA-2.7B.json rename to data/models/princeton-nlp_Sheared-LLaMA-2.7B.json diff --git a/data/princeton-nlp_gemma-2-9b-it-DPO.json b/data/models/princeton-nlp_gemma-2-9b-it-DPO.json similarity index 100% rename from data/princeton-nlp_gemma-2-9b-it-DPO.json rename to data/models/princeton-nlp_gemma-2-9b-it-DPO.json diff --git a/data/princeton-nlp_gemma-2-9b-it-SimPO.json b/data/models/princeton-nlp_gemma-2-9b-it-SimPO.json similarity index 100% rename from data/princeton-nlp_gemma-2-9b-it-SimPO.json rename to data/models/princeton-nlp_gemma-2-9b-it-SimPO.json diff --git a/data/prithivMLmods_Bellatrix-1.5B-xElite.json b/data/models/prithivMLmods_Bellatrix-1.5B-xElite.json similarity index 100% rename from data/prithivMLmods_Bellatrix-1.5B-xElite.json rename to data/models/prithivMLmods_Bellatrix-1.5B-xElite.json diff --git a/data/prithivMLmods_Bellatrix-Tiny-1.5B-R1.json b/data/models/prithivMLmods_Bellatrix-Tiny-1.5B-R1.json similarity index 100% rename from data/prithivMLmods_Bellatrix-Tiny-1.5B-R1.json rename to data/models/prithivMLmods_Bellatrix-Tiny-1.5B-R1.json diff --git a/data/prithivMLmods_Bellatrix-Tiny-1B-v2.json b/data/models/prithivMLmods_Bellatrix-Tiny-1B-v2.json similarity index 100% rename from data/prithivMLmods_Bellatrix-Tiny-1B-v2.json rename to data/models/prithivMLmods_Bellatrix-Tiny-1B-v2.json diff --git a/data/prithivMLmods_Blaze-14B-xElite.json b/data/models/prithivMLmods_Blaze-14B-xElite.json similarity index 100% rename from data/prithivMLmods_Blaze-14B-xElite.json rename to data/models/prithivMLmods_Blaze-14B-xElite.json diff --git a/data/prithivMLmods_COCO-7B-Instruct-1M.json b/data/models/prithivMLmods_COCO-7B-Instruct-1M.json similarity index 100% rename from data/prithivMLmods_COCO-7B-Instruct-1M.json rename to data/models/prithivMLmods_COCO-7B-Instruct-1M.json diff --git a/data/prithivMLmods_Calcium-Opus-14B-Elite-1M.json b/data/models/prithivMLmods_Calcium-Opus-14B-Elite-1M.json similarity index 100% rename from data/prithivMLmods_Calcium-Opus-14B-Elite-1M.json rename to data/models/prithivMLmods_Calcium-Opus-14B-Elite-1M.json diff --git a/data/prithivMLmods_Calcium-Opus-14B-Elite-Stock.json b/data/models/prithivMLmods_Calcium-Opus-14B-Elite-Stock.json similarity index 100% rename from data/prithivMLmods_Calcium-Opus-14B-Elite-Stock.json rename to data/models/prithivMLmods_Calcium-Opus-14B-Elite-Stock.json diff --git a/data/prithivMLmods_Calcium-Opus-14B-Elite.json b/data/models/prithivMLmods_Calcium-Opus-14B-Elite.json similarity index 99% rename from data/prithivMLmods_Calcium-Opus-14B-Elite.json rename to data/models/prithivMLmods_Calcium-Opus-14B-Elite.json index 577276dffe757d54f1377466592f92dd9fbcda81..d55e27a544ce26538462d7d22563b8808934095f 100644 --- a/data/prithivMLmods_Calcium-Opus-14B-Elite.json +++ b/data/models/prithivMLmods_Calcium-Opus-14B-Elite.json @@ -5,7 +5,7 @@ "developer": "prithivMLmods", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "Qwen2ForCausalLM", "params_billions": "14.766" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6064 + "score": 0.6052 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6296 + "score": 0.6317 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3708 + "score": 0.4789 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3733 + "score": 0.3742 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4873 + "score": 0.486 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5307 + "score": 0.5302 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6052 + "score": 0.6064 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6317 + "score": 0.6296 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4789 + "score": 0.3708 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3742 + "score": 0.3733 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.486 + "score": 0.4873 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5302 + "score": 0.5307 } } ], diff --git a/data/prithivMLmods_Calcium-Opus-14B-Elite2-R1.json b/data/models/prithivMLmods_Calcium-Opus-14B-Elite2-R1.json similarity index 100% rename from data/prithivMLmods_Calcium-Opus-14B-Elite2-R1.json rename to data/models/prithivMLmods_Calcium-Opus-14B-Elite2-R1.json diff --git a/data/prithivMLmods_Calcium-Opus-14B-Elite2.json b/data/models/prithivMLmods_Calcium-Opus-14B-Elite2.json similarity index 100% rename from data/prithivMLmods_Calcium-Opus-14B-Elite2.json rename to data/models/prithivMLmods_Calcium-Opus-14B-Elite2.json diff --git a/data/prithivMLmods_Calcium-Opus-14B-Elite3.json b/data/models/prithivMLmods_Calcium-Opus-14B-Elite3.json similarity index 100% rename from data/prithivMLmods_Calcium-Opus-14B-Elite3.json rename to data/models/prithivMLmods_Calcium-Opus-14B-Elite3.json diff --git a/data/prithivMLmods_Calcium-Opus-14B-Elite4.json b/data/models/prithivMLmods_Calcium-Opus-14B-Elite4.json similarity index 100% rename from data/prithivMLmods_Calcium-Opus-14B-Elite4.json rename to data/models/prithivMLmods_Calcium-Opus-14B-Elite4.json diff --git a/data/prithivMLmods_Calcium-Opus-14B-Merge.json b/data/models/prithivMLmods_Calcium-Opus-14B-Merge.json similarity index 100% rename from data/prithivMLmods_Calcium-Opus-14B-Merge.json rename to data/models/prithivMLmods_Calcium-Opus-14B-Merge.json diff --git a/data/prithivMLmods_Calcium-Opus-20B-v1.json b/data/models/prithivMLmods_Calcium-Opus-20B-v1.json similarity index 100% rename from data/prithivMLmods_Calcium-Opus-20B-v1.json rename to data/models/prithivMLmods_Calcium-Opus-20B-v1.json diff --git a/data/prithivMLmods_Codepy-Deepthink-3B.json b/data/models/prithivMLmods_Codepy-Deepthink-3B.json similarity index 100% rename from data/prithivMLmods_Codepy-Deepthink-3B.json rename to data/models/prithivMLmods_Codepy-Deepthink-3B.json diff --git a/data/prithivMLmods_Coma-II-14B.json b/data/models/prithivMLmods_Coma-II-14B.json similarity index 100% rename from data/prithivMLmods_Coma-II-14B.json rename to data/models/prithivMLmods_Coma-II-14B.json diff --git a/data/prithivMLmods_Condor-Opus-14B-Exp.json b/data/models/prithivMLmods_Condor-Opus-14B-Exp.json similarity index 100% rename from data/prithivMLmods_Condor-Opus-14B-Exp.json rename to data/models/prithivMLmods_Condor-Opus-14B-Exp.json diff --git a/data/prithivMLmods_Cygnus-II-14B.json b/data/models/prithivMLmods_Cygnus-II-14B.json similarity index 100% rename from data/prithivMLmods_Cygnus-II-14B.json rename to data/models/prithivMLmods_Cygnus-II-14B.json diff --git a/data/prithivMLmods_Deepthink-Llama-3-8B-Preview.json b/data/models/prithivMLmods_Deepthink-Llama-3-8B-Preview.json similarity index 100% rename from data/prithivMLmods_Deepthink-Llama-3-8B-Preview.json rename to data/models/prithivMLmods_Deepthink-Llama-3-8B-Preview.json diff --git a/data/prithivMLmods_Deepthink-Reasoning-14B.json b/data/models/prithivMLmods_Deepthink-Reasoning-14B.json similarity index 100% rename from data/prithivMLmods_Deepthink-Reasoning-14B.json rename to data/models/prithivMLmods_Deepthink-Reasoning-14B.json diff --git a/data/prithivMLmods_Deepthink-Reasoning-7B.json b/data/models/prithivMLmods_Deepthink-Reasoning-7B.json similarity index 100% rename from data/prithivMLmods_Deepthink-Reasoning-7B.json rename to data/models/prithivMLmods_Deepthink-Reasoning-7B.json diff --git a/data/prithivMLmods_Dinobot-Opus-14B-Exp.json b/data/models/prithivMLmods_Dinobot-Opus-14B-Exp.json similarity index 100% rename from data/prithivMLmods_Dinobot-Opus-14B-Exp.json rename to data/models/prithivMLmods_Dinobot-Opus-14B-Exp.json diff --git a/data/prithivMLmods_Elita-0.1-Distilled-R1-abliterated.json b/data/models/prithivMLmods_Elita-0.1-Distilled-R1-abliterated.json similarity index 100% rename from data/prithivMLmods_Elita-0.1-Distilled-R1-abliterated.json rename to data/models/prithivMLmods_Elita-0.1-Distilled-R1-abliterated.json diff --git a/data/prithivMLmods_Elita-1.json b/data/models/prithivMLmods_Elita-1.json similarity index 100% rename from data/prithivMLmods_Elita-1.json rename to data/models/prithivMLmods_Elita-1.json diff --git a/data/prithivMLmods_Epimetheus-14B-Axo.json b/data/models/prithivMLmods_Epimetheus-14B-Axo.json similarity index 100% rename from data/prithivMLmods_Epimetheus-14B-Axo.json rename to data/models/prithivMLmods_Epimetheus-14B-Axo.json diff --git a/data/prithivMLmods_Equuleus-Opus-14B-Exp.json b/data/models/prithivMLmods_Equuleus-Opus-14B-Exp.json similarity index 100% rename from data/prithivMLmods_Equuleus-Opus-14B-Exp.json rename to data/models/prithivMLmods_Equuleus-Opus-14B-Exp.json diff --git a/data/prithivMLmods_Eridanus-Opus-14B-r999.json b/data/models/prithivMLmods_Eridanus-Opus-14B-r999.json similarity index 100% rename from data/prithivMLmods_Eridanus-Opus-14B-r999.json rename to data/models/prithivMLmods_Eridanus-Opus-14B-r999.json diff --git a/data/prithivMLmods_Evac-Opus-14B-Exp.json b/data/models/prithivMLmods_Evac-Opus-14B-Exp.json similarity index 100% rename from data/prithivMLmods_Evac-Opus-14B-Exp.json rename to data/models/prithivMLmods_Evac-Opus-14B-Exp.json diff --git a/data/prithivMLmods_FastThink-0.5B-Tiny.json b/data/models/prithivMLmods_FastThink-0.5B-Tiny.json similarity index 100% rename from data/prithivMLmods_FastThink-0.5B-Tiny.json rename to data/models/prithivMLmods_FastThink-0.5B-Tiny.json diff --git a/data/prithivMLmods_GWQ-9B-Preview.json b/data/models/prithivMLmods_GWQ-9B-Preview.json similarity index 100% rename from data/prithivMLmods_GWQ-9B-Preview.json rename to data/models/prithivMLmods_GWQ-9B-Preview.json diff --git a/data/prithivMLmods_GWQ-9B-Preview2.json b/data/models/prithivMLmods_GWQ-9B-Preview2.json similarity index 100% rename from data/prithivMLmods_GWQ-9B-Preview2.json rename to data/models/prithivMLmods_GWQ-9B-Preview2.json diff --git a/data/prithivMLmods_GWQ2b.json b/data/models/prithivMLmods_GWQ2b.json similarity index 100% rename from data/prithivMLmods_GWQ2b.json rename to data/models/prithivMLmods_GWQ2b.json diff --git a/data/prithivMLmods_Gaea-Opus-14B-Exp.json b/data/models/prithivMLmods_Gaea-Opus-14B-Exp.json similarity index 100% rename from data/prithivMLmods_Gaea-Opus-14B-Exp.json rename to data/models/prithivMLmods_Gaea-Opus-14B-Exp.json diff --git a/data/prithivMLmods_Galactic-Qwen-14B-Exp1.json b/data/models/prithivMLmods_Galactic-Qwen-14B-Exp1.json similarity index 100% rename from data/prithivMLmods_Galactic-Qwen-14B-Exp1.json rename to data/models/prithivMLmods_Galactic-Qwen-14B-Exp1.json diff --git a/data/prithivMLmods_Galactic-Qwen-14B-Exp2.json b/data/models/prithivMLmods_Galactic-Qwen-14B-Exp2.json similarity index 100% rename from data/prithivMLmods_Galactic-Qwen-14B-Exp2.json rename to data/models/prithivMLmods_Galactic-Qwen-14B-Exp2.json diff --git a/data/prithivMLmods_Gauss-Opus-14B-R999.json b/data/models/prithivMLmods_Gauss-Opus-14B-R999.json similarity index 100% rename from data/prithivMLmods_Gauss-Opus-14B-R999.json rename to data/models/prithivMLmods_Gauss-Opus-14B-R999.json diff --git a/data/prithivMLmods_Jolt-v0.1.json b/data/models/prithivMLmods_Jolt-v0.1.json similarity index 100% rename from data/prithivMLmods_Jolt-v0.1.json rename to data/models/prithivMLmods_Jolt-v0.1.json diff --git a/data/prithivMLmods_Lacerta-Opus-14B-Elite8.json b/data/models/prithivMLmods_Lacerta-Opus-14B-Elite8.json similarity index 100% rename from data/prithivMLmods_Lacerta-Opus-14B-Elite8.json rename to data/models/prithivMLmods_Lacerta-Opus-14B-Elite8.json diff --git a/data/prithivMLmods_Llama-3.1-5B-Instruct.json b/data/models/prithivMLmods_Llama-3.1-5B-Instruct.json similarity index 100% rename from data/prithivMLmods_Llama-3.1-5B-Instruct.json rename to data/models/prithivMLmods_Llama-3.1-5B-Instruct.json diff --git a/data/prithivMLmods_Llama-3.1-8B-Open-SFT.json b/data/models/prithivMLmods_Llama-3.1-8B-Open-SFT.json similarity index 100% rename from data/prithivMLmods_Llama-3.1-8B-Open-SFT.json rename to data/models/prithivMLmods_Llama-3.1-8B-Open-SFT.json diff --git a/data/prithivMLmods_Llama-3.2-3B-Math-Oct.json b/data/models/prithivMLmods_Llama-3.2-3B-Math-Oct.json similarity index 100% rename from data/prithivMLmods_Llama-3.2-3B-Math-Oct.json rename to data/models/prithivMLmods_Llama-3.2-3B-Math-Oct.json diff --git a/data/prithivMLmods_Llama-3.2-6B-AlgoCode.json b/data/models/prithivMLmods_Llama-3.2-6B-AlgoCode.json similarity index 100% rename from data/prithivMLmods_Llama-3.2-6B-AlgoCode.json rename to data/models/prithivMLmods_Llama-3.2-6B-AlgoCode.json diff --git a/data/prithivMLmods_Llama-8B-Distill-CoT.json b/data/models/prithivMLmods_Llama-8B-Distill-CoT.json similarity index 100% rename from data/prithivMLmods_Llama-8B-Distill-CoT.json rename to data/models/prithivMLmods_Llama-8B-Distill-CoT.json diff --git a/data/prithivMLmods_Llama-Deepsync-1B.json b/data/models/prithivMLmods_Llama-Deepsync-1B.json similarity index 100% rename from data/prithivMLmods_Llama-Deepsync-1B.json rename to data/models/prithivMLmods_Llama-Deepsync-1B.json diff --git a/data/prithivMLmods_Llama-Deepsync-3B.json b/data/models/prithivMLmods_Llama-Deepsync-3B.json similarity index 100% rename from data/prithivMLmods_Llama-Deepsync-3B.json rename to data/models/prithivMLmods_Llama-Deepsync-3B.json diff --git a/data/prithivMLmods_Llama-Express.1-Math.json b/data/models/prithivMLmods_Llama-Express.1-Math.json similarity index 100% rename from data/prithivMLmods_Llama-Express.1-Math.json rename to data/models/prithivMLmods_Llama-Express.1-Math.json diff --git a/data/prithivMLmods_LwQ-10B-Instruct.json b/data/models/prithivMLmods_LwQ-10B-Instruct.json similarity index 100% rename from data/prithivMLmods_LwQ-10B-Instruct.json rename to data/models/prithivMLmods_LwQ-10B-Instruct.json diff --git a/data/prithivMLmods_LwQ-Reasoner-10B.json b/data/models/prithivMLmods_LwQ-Reasoner-10B.json similarity index 100% rename from data/prithivMLmods_LwQ-Reasoner-10B.json rename to data/models/prithivMLmods_LwQ-Reasoner-10B.json diff --git a/data/prithivMLmods_Magellanic-Opus-14B-Exp.json b/data/models/prithivMLmods_Magellanic-Opus-14B-Exp.json similarity index 100% rename from data/prithivMLmods_Magellanic-Opus-14B-Exp.json rename to data/models/prithivMLmods_Magellanic-Opus-14B-Exp.json diff --git a/data/prithivMLmods_Magellanic-Qwen-25B-R999.json b/data/models/prithivMLmods_Magellanic-Qwen-25B-R999.json similarity index 100% rename from data/prithivMLmods_Magellanic-Qwen-25B-R999.json rename to data/models/prithivMLmods_Magellanic-Qwen-25B-R999.json diff --git a/data/prithivMLmods_Megatron-Corpus-14B-Exp.json b/data/models/prithivMLmods_Megatron-Corpus-14B-Exp.json similarity index 100% rename from data/prithivMLmods_Megatron-Corpus-14B-Exp.json rename to data/models/prithivMLmods_Megatron-Corpus-14B-Exp.json diff --git a/data/prithivMLmods_Megatron-Corpus-14B-Exp.v2.json b/data/models/prithivMLmods_Megatron-Corpus-14B-Exp.v2.json similarity index 100% rename from data/prithivMLmods_Megatron-Corpus-14B-Exp.v2.json rename to data/models/prithivMLmods_Megatron-Corpus-14B-Exp.v2.json diff --git a/data/prithivMLmods_Megatron-Opus-14B-2.0.json b/data/models/prithivMLmods_Megatron-Opus-14B-2.0.json similarity index 100% rename from data/prithivMLmods_Megatron-Opus-14B-2.0.json rename to data/models/prithivMLmods_Megatron-Opus-14B-2.0.json diff --git a/data/prithivMLmods_Megatron-Opus-14B-2.1.json b/data/models/prithivMLmods_Megatron-Opus-14B-2.1.json similarity index 100% rename from data/prithivMLmods_Megatron-Opus-14B-2.1.json rename to data/models/prithivMLmods_Megatron-Opus-14B-2.1.json diff --git a/data/prithivMLmods_Megatron-Opus-14B-Exp.json b/data/models/prithivMLmods_Megatron-Opus-14B-Exp.json similarity index 100% rename from data/prithivMLmods_Megatron-Opus-14B-Exp.json rename to data/models/prithivMLmods_Megatron-Opus-14B-Exp.json diff --git a/data/prithivMLmods_Megatron-Opus-14B-Stock.json b/data/models/prithivMLmods_Megatron-Opus-14B-Stock.json similarity index 100% rename from data/prithivMLmods_Megatron-Opus-14B-Stock.json rename to data/models/prithivMLmods_Megatron-Opus-14B-Stock.json diff --git a/data/prithivMLmods_Megatron-Opus-7B-Exp.json b/data/models/prithivMLmods_Megatron-Opus-7B-Exp.json similarity index 100% rename from data/prithivMLmods_Megatron-Opus-7B-Exp.json rename to data/models/prithivMLmods_Megatron-Opus-7B-Exp.json diff --git a/data/prithivMLmods_Messier-Opus-14B-Elite7.json b/data/models/prithivMLmods_Messier-Opus-14B-Elite7.json similarity index 100% rename from data/prithivMLmods_Messier-Opus-14B-Elite7.json rename to data/models/prithivMLmods_Messier-Opus-14B-Elite7.json diff --git a/data/prithivMLmods_Omni-Reasoner-Merged.json b/data/models/prithivMLmods_Omni-Reasoner-Merged.json similarity index 100% rename from data/prithivMLmods_Omni-Reasoner-Merged.json rename to data/models/prithivMLmods_Omni-Reasoner-Merged.json diff --git a/data/prithivMLmods_Omni-Reasoner3-Merged.json b/data/models/prithivMLmods_Omni-Reasoner3-Merged.json similarity index 100% rename from data/prithivMLmods_Omni-Reasoner3-Merged.json rename to data/models/prithivMLmods_Omni-Reasoner3-Merged.json diff --git a/data/prithivMLmods_Pegasus-Opus-14B-Exp.json b/data/models/prithivMLmods_Pegasus-Opus-14B-Exp.json similarity index 100% rename from data/prithivMLmods_Pegasus-Opus-14B-Exp.json rename to data/models/prithivMLmods_Pegasus-Opus-14B-Exp.json diff --git a/data/prithivMLmods_Phi-4-Empathetic.json b/data/models/prithivMLmods_Phi-4-Empathetic.json similarity index 100% rename from data/prithivMLmods_Phi-4-Empathetic.json rename to data/models/prithivMLmods_Phi-4-Empathetic.json diff --git a/data/prithivMLmods_Phi-4-Math-IO.json b/data/models/prithivMLmods_Phi-4-Math-IO.json similarity index 100% rename from data/prithivMLmods_Phi-4-Math-IO.json rename to data/models/prithivMLmods_Phi-4-Math-IO.json diff --git a/data/prithivMLmods_Phi-4-QwQ.json b/data/models/prithivMLmods_Phi-4-QwQ.json similarity index 100% rename from data/prithivMLmods_Phi-4-QwQ.json rename to data/models/prithivMLmods_Phi-4-QwQ.json diff --git a/data/prithivMLmods_Phi-4-Super-1.json b/data/models/prithivMLmods_Phi-4-Super-1.json similarity index 100% rename from data/prithivMLmods_Phi-4-Super-1.json rename to data/models/prithivMLmods_Phi-4-Super-1.json diff --git a/data/prithivMLmods_Phi-4-Super-o1.json b/data/models/prithivMLmods_Phi-4-Super-o1.json similarity index 100% rename from data/prithivMLmods_Phi-4-Super-o1.json rename to data/models/prithivMLmods_Phi-4-Super-o1.json diff --git a/data/prithivMLmods_Phi-4-Super.json b/data/models/prithivMLmods_Phi-4-Super.json similarity index 100% rename from data/prithivMLmods_Phi-4-Super.json rename to data/models/prithivMLmods_Phi-4-Super.json diff --git a/data/prithivMLmods_Phi-4-o1.json b/data/models/prithivMLmods_Phi-4-o1.json similarity index 100% rename from data/prithivMLmods_Phi-4-o1.json rename to data/models/prithivMLmods_Phi-4-o1.json diff --git a/data/prithivMLmods_Phi4-Super.json b/data/models/prithivMLmods_Phi4-Super.json similarity index 100% rename from data/prithivMLmods_Phi4-Super.json rename to data/models/prithivMLmods_Phi4-Super.json diff --git a/data/prithivMLmods_Porpoise-Opus-14B-Exp.json b/data/models/prithivMLmods_Porpoise-Opus-14B-Exp.json similarity index 100% rename from data/prithivMLmods_Porpoise-Opus-14B-Exp.json rename to data/models/prithivMLmods_Porpoise-Opus-14B-Exp.json diff --git a/data/prithivMLmods_Primal-Opus-14B-Optimus-v1.json b/data/models/prithivMLmods_Primal-Opus-14B-Optimus-v1.json similarity index 100% rename from data/prithivMLmods_Primal-Opus-14B-Optimus-v1.json rename to data/models/prithivMLmods_Primal-Opus-14B-Optimus-v1.json diff --git a/data/prithivMLmods_Primal-Opus-14B-Optimus-v2.json b/data/models/prithivMLmods_Primal-Opus-14B-Optimus-v2.json similarity index 100% rename from data/prithivMLmods_Primal-Opus-14B-Optimus-v2.json rename to data/models/prithivMLmods_Primal-Opus-14B-Optimus-v2.json diff --git a/data/prithivMLmods_QwQ-LCoT-14B-Conversational.json b/data/models/prithivMLmods_QwQ-LCoT-14B-Conversational.json similarity index 100% rename from data/prithivMLmods_QwQ-LCoT-14B-Conversational.json rename to data/models/prithivMLmods_QwQ-LCoT-14B-Conversational.json diff --git a/data/prithivMLmods_QwQ-LCoT-3B-Instruct.json b/data/models/prithivMLmods_QwQ-LCoT-3B-Instruct.json similarity index 100% rename from data/prithivMLmods_QwQ-LCoT-3B-Instruct.json rename to data/models/prithivMLmods_QwQ-LCoT-3B-Instruct.json diff --git a/data/prithivMLmods_QwQ-LCoT-7B-Instruct.json b/data/models/prithivMLmods_QwQ-LCoT-7B-Instruct.json similarity index 100% rename from data/prithivMLmods_QwQ-LCoT-7B-Instruct.json rename to data/models/prithivMLmods_QwQ-LCoT-7B-Instruct.json diff --git a/data/prithivMLmods_QwQ-LCoT1-Merged.json b/data/models/prithivMLmods_QwQ-LCoT1-Merged.json similarity index 100% rename from data/prithivMLmods_QwQ-LCoT1-Merged.json rename to data/models/prithivMLmods_QwQ-LCoT1-Merged.json diff --git a/data/prithivMLmods_QwQ-LCoT2-7B-Instruct.json b/data/models/prithivMLmods_QwQ-LCoT2-7B-Instruct.json similarity index 100% rename from data/prithivMLmods_QwQ-LCoT2-7B-Instruct.json rename to data/models/prithivMLmods_QwQ-LCoT2-7B-Instruct.json diff --git a/data/prithivMLmods_QwQ-MathOct-7B.json b/data/models/prithivMLmods_QwQ-MathOct-7B.json similarity index 100% rename from data/prithivMLmods_QwQ-MathOct-7B.json rename to data/models/prithivMLmods_QwQ-MathOct-7B.json diff --git a/data/prithivMLmods_QwQ-R1-Distill-1.5B-CoT.json b/data/models/prithivMLmods_QwQ-R1-Distill-1.5B-CoT.json similarity index 100% rename from data/prithivMLmods_QwQ-R1-Distill-1.5B-CoT.json rename to data/models/prithivMLmods_QwQ-R1-Distill-1.5B-CoT.json diff --git a/data/prithivMLmods_QwQ-R1-Distill-7B-CoT.json b/data/models/prithivMLmods_QwQ-R1-Distill-7B-CoT.json similarity index 100% rename from data/prithivMLmods_QwQ-R1-Distill-7B-CoT.json rename to data/models/prithivMLmods_QwQ-R1-Distill-7B-CoT.json diff --git a/data/prithivMLmods_Qwen-7B-Distill-Reasoner.json b/data/models/prithivMLmods_Qwen-7B-Distill-Reasoner.json similarity index 100% rename from data/prithivMLmods_Qwen-7B-Distill-Reasoner.json rename to data/models/prithivMLmods_Qwen-7B-Distill-Reasoner.json diff --git a/data/prithivMLmods_Qwen2.5-1.5B-DeepSeek-R1-Instruct.json b/data/models/prithivMLmods_Qwen2.5-1.5B-DeepSeek-R1-Instruct.json similarity index 100% rename from data/prithivMLmods_Qwen2.5-1.5B-DeepSeek-R1-Instruct.json rename to data/models/prithivMLmods_Qwen2.5-1.5B-DeepSeek-R1-Instruct.json diff --git a/data/prithivMLmods_Qwen2.5-14B-DeepSeek-R1-1M.json b/data/models/prithivMLmods_Qwen2.5-14B-DeepSeek-R1-1M.json similarity index 100% rename from data/prithivMLmods_Qwen2.5-14B-DeepSeek-R1-1M.json rename to data/models/prithivMLmods_Qwen2.5-14B-DeepSeek-R1-1M.json diff --git a/data/prithivMLmods_Qwen2.5-7B-DeepSeek-R1-1M.json b/data/models/prithivMLmods_Qwen2.5-7B-DeepSeek-R1-1M.json similarity index 100% rename from data/prithivMLmods_Qwen2.5-7B-DeepSeek-R1-1M.json rename to data/models/prithivMLmods_Qwen2.5-7B-DeepSeek-R1-1M.json diff --git a/data/prithivMLmods_SmolLM2-CoT-360M.json b/data/models/prithivMLmods_SmolLM2-CoT-360M.json similarity index 100% rename from data/prithivMLmods_SmolLM2-CoT-360M.json rename to data/models/prithivMLmods_SmolLM2-CoT-360M.json diff --git a/data/prithivMLmods_Sombrero-Opus-14B-Elite5.json b/data/models/prithivMLmods_Sombrero-Opus-14B-Elite5.json similarity index 100% rename from data/prithivMLmods_Sombrero-Opus-14B-Elite5.json rename to data/models/prithivMLmods_Sombrero-Opus-14B-Elite5.json diff --git a/data/prithivMLmods_Sombrero-Opus-14B-Elite6.json b/data/models/prithivMLmods_Sombrero-Opus-14B-Elite6.json similarity index 100% rename from data/prithivMLmods_Sombrero-Opus-14B-Elite6.json rename to data/models/prithivMLmods_Sombrero-Opus-14B-Elite6.json diff --git a/data/prithivMLmods_Sombrero-Opus-14B-Sm1.json b/data/models/prithivMLmods_Sombrero-Opus-14B-Sm1.json similarity index 100% rename from data/prithivMLmods_Sombrero-Opus-14B-Sm1.json rename to data/models/prithivMLmods_Sombrero-Opus-14B-Sm1.json diff --git a/data/prithivMLmods_Sombrero-Opus-14B-Sm2.json b/data/models/prithivMLmods_Sombrero-Opus-14B-Sm2.json similarity index 100% rename from data/prithivMLmods_Sombrero-Opus-14B-Sm2.json rename to data/models/prithivMLmods_Sombrero-Opus-14B-Sm2.json diff --git a/data/prithivMLmods_Sombrero-Opus-14B-Sm4.json b/data/models/prithivMLmods_Sombrero-Opus-14B-Sm4.json similarity index 100% rename from data/prithivMLmods_Sombrero-Opus-14B-Sm4.json rename to data/models/prithivMLmods_Sombrero-Opus-14B-Sm4.json diff --git a/data/prithivMLmods_Sombrero-Opus-14B-Sm5.json b/data/models/prithivMLmods_Sombrero-Opus-14B-Sm5.json similarity index 100% rename from data/prithivMLmods_Sombrero-Opus-14B-Sm5.json rename to data/models/prithivMLmods_Sombrero-Opus-14B-Sm5.json diff --git a/data/prithivMLmods_Sqweeks-7B-Instruct.json b/data/models/prithivMLmods_Sqweeks-7B-Instruct.json similarity index 100% rename from data/prithivMLmods_Sqweeks-7B-Instruct.json rename to data/models/prithivMLmods_Sqweeks-7B-Instruct.json diff --git a/data/prithivMLmods_Tadpole-Opus-14B-Exp.json b/data/models/prithivMLmods_Tadpole-Opus-14B-Exp.json similarity index 100% rename from data/prithivMLmods_Tadpole-Opus-14B-Exp.json rename to data/models/prithivMLmods_Tadpole-Opus-14B-Exp.json diff --git a/data/prithivMLmods_Taurus-Opus-7B.json b/data/models/prithivMLmods_Taurus-Opus-7B.json similarity index 100% rename from data/prithivMLmods_Taurus-Opus-7B.json rename to data/models/prithivMLmods_Taurus-Opus-7B.json diff --git a/data/prithivMLmods_Triangulum-10B.json b/data/models/prithivMLmods_Triangulum-10B.json similarity index 100% rename from data/prithivMLmods_Triangulum-10B.json rename to data/models/prithivMLmods_Triangulum-10B.json diff --git a/data/prithivMLmods_Triangulum-5B.json b/data/models/prithivMLmods_Triangulum-5B.json similarity index 100% rename from data/prithivMLmods_Triangulum-5B.json rename to data/models/prithivMLmods_Triangulum-5B.json diff --git a/data/prithivMLmods_Triangulum-v2-10B.json b/data/models/prithivMLmods_Triangulum-v2-10B.json similarity index 100% rename from data/prithivMLmods_Triangulum-v2-10B.json rename to data/models/prithivMLmods_Triangulum-v2-10B.json diff --git a/data/prithivMLmods_Tucana-Opus-14B-r999.json b/data/models/prithivMLmods_Tucana-Opus-14B-r999.json similarity index 100% rename from data/prithivMLmods_Tucana-Opus-14B-r999.json rename to data/models/prithivMLmods_Tucana-Opus-14B-r999.json diff --git a/data/prithivMLmods_Tulu-MathLingo-8B.json b/data/models/prithivMLmods_Tulu-MathLingo-8B.json similarity index 100% rename from data/prithivMLmods_Tulu-MathLingo-8B.json rename to data/models/prithivMLmods_Tulu-MathLingo-8B.json diff --git a/data/prithivMLmods_Viper-Coder-7B-Elite14.json b/data/models/prithivMLmods_Viper-Coder-7B-Elite14.json similarity index 100% rename from data/prithivMLmods_Viper-Coder-7B-Elite14.json rename to data/models/prithivMLmods_Viper-Coder-7B-Elite14.json diff --git a/data/prithivMLmods_Viper-Coder-Hybrid-v1.2.json b/data/models/prithivMLmods_Viper-Coder-Hybrid-v1.2.json similarity index 100% rename from data/prithivMLmods_Viper-Coder-Hybrid-v1.2.json rename to data/models/prithivMLmods_Viper-Coder-Hybrid-v1.2.json diff --git a/data/prithivMLmods_Viper-Coder-Hybrid-v1.3.json b/data/models/prithivMLmods_Viper-Coder-Hybrid-v1.3.json similarity index 100% rename from data/prithivMLmods_Viper-Coder-Hybrid-v1.3.json rename to data/models/prithivMLmods_Viper-Coder-Hybrid-v1.3.json diff --git a/data/prithivMLmods_Viper-Coder-HybridMini-v1.3.json b/data/models/prithivMLmods_Viper-Coder-HybridMini-v1.3.json similarity index 100% rename from data/prithivMLmods_Viper-Coder-HybridMini-v1.3.json rename to data/models/prithivMLmods_Viper-Coder-HybridMini-v1.3.json diff --git a/data/prithivMLmods_Viper-Coder-v0.1.json b/data/models/prithivMLmods_Viper-Coder-v0.1.json similarity index 100% rename from data/prithivMLmods_Viper-Coder-v0.1.json rename to data/models/prithivMLmods_Viper-Coder-v0.1.json diff --git a/data/prithivMLmods_Viper-Coder-v1.1.json b/data/models/prithivMLmods_Viper-Coder-v1.1.json similarity index 100% rename from data/prithivMLmods_Viper-Coder-v1.1.json rename to data/models/prithivMLmods_Viper-Coder-v1.1.json diff --git a/data/prithivMLmods_Viper-Coder-v1.6-r999.json b/data/models/prithivMLmods_Viper-Coder-v1.6-r999.json similarity index 100% rename from data/prithivMLmods_Viper-Coder-v1.6-r999.json rename to data/models/prithivMLmods_Viper-Coder-v1.6-r999.json diff --git a/data/prithivMLmods_Viper-Coder-v1.7-Vsm6.json b/data/models/prithivMLmods_Viper-Coder-v1.7-Vsm6.json similarity index 100% rename from data/prithivMLmods_Viper-Coder-v1.7-Vsm6.json rename to data/models/prithivMLmods_Viper-Coder-v1.7-Vsm6.json diff --git a/data/prithivMLmods_Viper-OneCoder-UIGEN.json b/data/models/prithivMLmods_Viper-OneCoder-UIGEN.json similarity index 100% rename from data/prithivMLmods_Viper-OneCoder-UIGEN.json rename to data/models/prithivMLmods_Viper-OneCoder-UIGEN.json diff --git a/data/prithivMLmods_Volans-Opus-14B-Exp.json b/data/models/prithivMLmods_Volans-Opus-14B-Exp.json similarity index 100% rename from data/prithivMLmods_Volans-Opus-14B-Exp.json rename to data/models/prithivMLmods_Volans-Opus-14B-Exp.json diff --git a/data/prithivMLmods_WebMind-7B-v0.1.json b/data/models/prithivMLmods_WebMind-7B-v0.1.json similarity index 100% rename from data/prithivMLmods_WebMind-7B-v0.1.json rename to data/models/prithivMLmods_WebMind-7B-v0.1.json diff --git a/data/prometheus-eval_prometheus-7b-v2.0.json b/data/models/prometheus-eval_prometheus-7b-v2.0.json similarity index 100% rename from data/prometheus-eval_prometheus-7b-v2.0.json rename to data/models/prometheus-eval_prometheus-7b-v2.0.json diff --git a/data/prometheus-eval_prometheus-8x7b-v2.0.json b/data/models/prometheus-eval_prometheus-8x7b-v2.0.json similarity index 100% rename from data/prometheus-eval_prometheus-8x7b-v2.0.json rename to data/models/prometheus-eval_prometheus-8x7b-v2.0.json diff --git a/data/pszemraj_Llama-3-6.3b-v0.1.json b/data/models/pszemraj_Llama-3-6.3b-v0.1.json similarity index 100% rename from data/pszemraj_Llama-3-6.3b-v0.1.json rename to data/models/pszemraj_Llama-3-6.3b-v0.1.json diff --git a/data/pszemraj_Mistral-v0.3-6B.json b/data/models/pszemraj_Mistral-v0.3-6B.json similarity index 100% rename from data/pszemraj_Mistral-v0.3-6B.json rename to data/models/pszemraj_Mistral-v0.3-6B.json diff --git a/data/qingy2019_LLaMa_3.2_3B_Catalysts.json b/data/models/qingy2019_LLaMa_3.2_3B_Catalysts.json similarity index 100% rename from data/qingy2019_LLaMa_3.2_3B_Catalysts.json rename to data/models/qingy2019_LLaMa_3.2_3B_Catalysts.json diff --git a/data/qingy2019_OpenMath2-Llama3.1-8B.json b/data/models/qingy2019_OpenMath2-Llama3.1-8B.json similarity index 100% rename from data/qingy2019_OpenMath2-Llama3.1-8B.json rename to data/models/qingy2019_OpenMath2-Llama3.1-8B.json diff --git a/data/qingy2019_Oracle-14B.json b/data/models/qingy2019_Oracle-14B.json similarity index 100% rename from data/qingy2019_Oracle-14B.json rename to data/models/qingy2019_Oracle-14B.json diff --git a/data/qingy2019_Qwen2.5-Math-14B-Instruct-Alpha.json b/data/models/qingy2019_Qwen2.5-Math-14B-Instruct-Alpha.json similarity index 100% rename from data/qingy2019_Qwen2.5-Math-14B-Instruct-Alpha.json rename to data/models/qingy2019_Qwen2.5-Math-14B-Instruct-Alpha.json diff --git a/data/qingy2019_Qwen2.5-Math-14B-Instruct-Pro.json b/data/models/qingy2019_Qwen2.5-Math-14B-Instruct-Pro.json similarity index 100% rename from data/qingy2019_Qwen2.5-Math-14B-Instruct-Pro.json rename to data/models/qingy2019_Qwen2.5-Math-14B-Instruct-Pro.json diff --git a/data/qingy2019_Qwen2.5-Math-14B-Instruct.json b/data/models/qingy2019_Qwen2.5-Math-14B-Instruct.json similarity index 100% rename from data/qingy2019_Qwen2.5-Math-14B-Instruct.json rename to data/models/qingy2019_Qwen2.5-Math-14B-Instruct.json diff --git a/data/qingy2019_Qwen2.5-Ultimate-14B-Instruct.json b/data/models/qingy2019_Qwen2.5-Ultimate-14B-Instruct.json similarity index 100% rename from data/qingy2019_Qwen2.5-Ultimate-14B-Instruct.json rename to data/models/qingy2019_Qwen2.5-Ultimate-14B-Instruct.json diff --git a/data/qingy2024_Benchmaxx-Llama-3.2-1B-Instruct.json b/data/models/qingy2024_Benchmaxx-Llama-3.2-1B-Instruct.json similarity index 100% rename from data/qingy2024_Benchmaxx-Llama-3.2-1B-Instruct.json rename to data/models/qingy2024_Benchmaxx-Llama-3.2-1B-Instruct.json diff --git a/data/qingy2024_Eyas-17B-Instruct.json b/data/models/qingy2024_Eyas-17B-Instruct.json similarity index 100% rename from data/qingy2024_Eyas-17B-Instruct.json rename to data/models/qingy2024_Eyas-17B-Instruct.json diff --git a/data/qingy2024_Falcon3-2x10B-MoE-Instruct.json b/data/models/qingy2024_Falcon3-2x10B-MoE-Instruct.json similarity index 100% rename from data/qingy2024_Falcon3-2x10B-MoE-Instruct.json rename to data/models/qingy2024_Falcon3-2x10B-MoE-Instruct.json diff --git a/data/qingy2024_Fusion-14B-Instruct.json b/data/models/qingy2024_Fusion-14B-Instruct.json similarity index 100% rename from data/qingy2024_Fusion-14B-Instruct.json rename to data/models/qingy2024_Fusion-14B-Instruct.json diff --git a/data/qingy2024_Fusion2-14B-Instruct.json b/data/models/qingy2024_Fusion2-14B-Instruct.json similarity index 100% rename from data/qingy2024_Fusion2-14B-Instruct.json rename to data/models/qingy2024_Fusion2-14B-Instruct.json diff --git a/data/qingy2024_Fusion4-14B-Instruct.json b/data/models/qingy2024_Fusion4-14B-Instruct.json similarity index 100% rename from data/qingy2024_Fusion4-14B-Instruct.json rename to data/models/qingy2024_Fusion4-14B-Instruct.json diff --git a/data/qingy2024_OwO-14B-Instruct.json b/data/models/qingy2024_OwO-14B-Instruct.json similarity index 100% rename from data/qingy2024_OwO-14B-Instruct.json rename to data/models/qingy2024_OwO-14B-Instruct.json diff --git a/data/qingy2024_QwEnlarge-16B-Instruct.json b/data/models/qingy2024_QwEnlarge-16B-Instruct.json similarity index 100% rename from data/qingy2024_QwEnlarge-16B-Instruct.json rename to data/models/qingy2024_QwEnlarge-16B-Instruct.json diff --git a/data/qingy2024_QwQ-14B-Math-v0.2.json b/data/models/qingy2024_QwQ-14B-Math-v0.2.json similarity index 100% rename from data/qingy2024_QwQ-14B-Math-v0.2.json rename to data/models/qingy2024_QwQ-14B-Math-v0.2.json diff --git a/data/qingy2024_Qwarkstar-4B-Instruct-Preview.json b/data/models/qingy2024_Qwarkstar-4B-Instruct-Preview.json similarity index 100% rename from data/qingy2024_Qwarkstar-4B-Instruct-Preview.json rename to data/models/qingy2024_Qwarkstar-4B-Instruct-Preview.json diff --git a/data/qingy2024_Qwarkstar-4B.json b/data/models/qingy2024_Qwarkstar-4B.json similarity index 100% rename from data/qingy2024_Qwarkstar-4B.json rename to data/models/qingy2024_Qwarkstar-4B.json diff --git a/data/qingy2024_Qwen2.5-4B.json b/data/models/qingy2024_Qwen2.5-4B.json similarity index 100% rename from data/qingy2024_Qwen2.5-4B.json rename to data/models/qingy2024_Qwen2.5-4B.json diff --git a/data/qingy2024_Qwen2.5-Coder-Draft-1.5B-Instruct.json b/data/models/qingy2024_Qwen2.5-Coder-Draft-1.5B-Instruct.json similarity index 100% rename from data/qingy2024_Qwen2.5-Coder-Draft-1.5B-Instruct.json rename to data/models/qingy2024_Qwen2.5-Coder-Draft-1.5B-Instruct.json diff --git a/data/qingy2024_Qwen2.5-Math-14B-Instruct-Alpha.json b/data/models/qingy2024_Qwen2.5-Math-14B-Instruct-Alpha.json similarity index 100% rename from data/qingy2024_Qwen2.5-Math-14B-Instruct-Alpha.json rename to data/models/qingy2024_Qwen2.5-Math-14B-Instruct-Alpha.json diff --git a/data/qingy2024_Qwen2.5-Math-14B-Instruct-Preview.json b/data/models/qingy2024_Qwen2.5-Math-14B-Instruct-Preview.json similarity index 100% rename from data/qingy2024_Qwen2.5-Math-14B-Instruct-Preview.json rename to data/models/qingy2024_Qwen2.5-Math-14B-Instruct-Preview.json diff --git a/data/qingy2024_Qwen2.6-14B-Instruct.json b/data/models/qingy2024_Qwen2.6-14B-Instruct.json similarity index 100% rename from data/qingy2024_Qwen2.6-14B-Instruct.json rename to data/models/qingy2024_Qwen2.6-14B-Instruct.json diff --git a/data/qingy2024_Qwen2.6-Math-14B-Instruct.json b/data/models/qingy2024_Qwen2.6-Math-14B-Instruct.json similarity index 100% rename from data/qingy2024_Qwen2.6-Math-14B-Instruct.json rename to data/models/qingy2024_Qwen2.6-Math-14B-Instruct.json diff --git a/data/qq8933_OpenLongCoT-Base-Gemma2-2B.json b/data/models/qq8933_OpenLongCoT-Base-Gemma2-2B.json similarity index 100% rename from data/qq8933_OpenLongCoT-Base-Gemma2-2B.json rename to data/models/qq8933_OpenLongCoT-Base-Gemma2-2B.json diff --git a/data/qwen_qwen1.5-110b-chat.json b/data/models/qwen_qwen1.5-110b-chat.json similarity index 100% rename from data/qwen_qwen1.5-110b-chat.json rename to data/models/qwen_qwen1.5-110b-chat.json diff --git a/data/qwen_qwen1.5-14b.json b/data/models/qwen_qwen1.5-14b.json similarity index 100% rename from data/qwen_qwen1.5-14b.json rename to data/models/qwen_qwen1.5-14b.json diff --git a/data/qwen_qwen1.5-32b.json b/data/models/qwen_qwen1.5-32b.json similarity index 100% rename from data/qwen_qwen1.5-32b.json rename to data/models/qwen_qwen1.5-32b.json diff --git a/data/qwen_qwen1.5-72b.json b/data/models/qwen_qwen1.5-72b.json similarity index 100% rename from data/qwen_qwen1.5-72b.json rename to data/models/qwen_qwen1.5-72b.json diff --git a/data/qwen_qwen1.5-7b.json b/data/models/qwen_qwen1.5-7b.json similarity index 100% rename from data/qwen_qwen1.5-7b.json rename to data/models/qwen_qwen1.5-7b.json diff --git a/data/qwen_qwen2-72b-instruct.json b/data/models/qwen_qwen2-72b-instruct.json similarity index 100% rename from data/qwen_qwen2-72b-instruct.json rename to data/models/qwen_qwen2-72b-instruct.json diff --git a/data/qwen_qwen2.5-72b-instruct-turbo.json b/data/models/qwen_qwen2.5-72b-instruct-turbo.json similarity index 100% rename from data/qwen_qwen2.5-72b-instruct-turbo.json rename to data/models/qwen_qwen2.5-72b-instruct-turbo.json diff --git a/data/qwen_qwen2.5-7b-instruct-turbo.json b/data/models/qwen_qwen2.5-7b-instruct-turbo.json similarity index 100% rename from data/qwen_qwen2.5-7b-instruct-turbo.json rename to data/models/qwen_qwen2.5-7b-instruct-turbo.json diff --git a/data/qwen_qwen3-235b-a22b-fp8-tput.json b/data/models/qwen_qwen3-235b-a22b-fp8-tput.json similarity index 100% rename from data/qwen_qwen3-235b-a22b-fp8-tput.json rename to data/models/qwen_qwen3-235b-a22b-fp8-tput.json diff --git a/data/qwen_qwen3-235b-a22b-instruct-2507-fp8.json b/data/models/qwen_qwen3-235b-a22b-instruct-2507-fp8.json similarity index 100% rename from data/qwen_qwen3-235b-a22b-instruct-2507-fp8.json rename to data/models/qwen_qwen3-235b-a22b-instruct-2507-fp8.json diff --git a/data/raphgg_test-2.5-72B.json b/data/models/raphgg_test-2.5-72B.json similarity index 100% rename from data/raphgg_test-2.5-72B.json rename to data/models/raphgg_test-2.5-72B.json diff --git a/data/rasyosef_Mistral-NeMo-Minitron-8B-Chat.json b/data/models/rasyosef_Mistral-NeMo-Minitron-8B-Chat.json similarity index 100% rename from data/rasyosef_Mistral-NeMo-Minitron-8B-Chat.json rename to data/models/rasyosef_Mistral-NeMo-Minitron-8B-Chat.json diff --git a/data/rasyosef_Phi-1_5-Instruct-v0.1.json b/data/models/rasyosef_Phi-1_5-Instruct-v0.1.json similarity index 100% rename from data/rasyosef_Phi-1_5-Instruct-v0.1.json rename to data/models/rasyosef_Phi-1_5-Instruct-v0.1.json diff --git a/data/rasyosef_phi-2-instruct-apo.json b/data/models/rasyosef_phi-2-instruct-apo.json similarity index 100% rename from data/rasyosef_phi-2-instruct-apo.json rename to data/models/rasyosef_phi-2-instruct-apo.json diff --git a/data/rasyosef_phi-2-instruct-v0.1.json b/data/models/rasyosef_phi-2-instruct-v0.1.json similarity index 100% rename from data/rasyosef_phi-2-instruct-v0.1.json rename to data/models/rasyosef_phi-2-instruct-v0.1.json diff --git a/data/realtreetune_rho-1b-sft-MATH.json b/data/models/realtreetune_rho-1b-sft-MATH.json similarity index 100% rename from data/realtreetune_rho-1b-sft-MATH.json rename to data/models/realtreetune_rho-1b-sft-MATH.json diff --git a/data/recoilme_Gemma-2-Ataraxy-Gemmasutra-9B-slerp.json b/data/models/recoilme_Gemma-2-Ataraxy-Gemmasutra-9B-slerp.json similarity index 100% rename from data/recoilme_Gemma-2-Ataraxy-Gemmasutra-9B-slerp.json rename to data/models/recoilme_Gemma-2-Ataraxy-Gemmasutra-9B-slerp.json diff --git a/data/recoilme_recoilme-gemma-2-9B-v0.1.json b/data/models/recoilme_recoilme-gemma-2-9B-v0.1.json similarity index 100% rename from data/recoilme_recoilme-gemma-2-9B-v0.1.json rename to data/models/recoilme_recoilme-gemma-2-9B-v0.1.json diff --git a/data/recoilme_recoilme-gemma-2-9B-v0.2.json b/data/models/recoilme_recoilme-gemma-2-9B-v0.2.json similarity index 100% rename from data/recoilme_recoilme-gemma-2-9B-v0.2.json rename to data/models/recoilme_recoilme-gemma-2-9B-v0.2.json diff --git a/data/recoilme_recoilme-gemma-2-9B-v0.3.json b/data/models/recoilme_recoilme-gemma-2-9B-v0.3.json similarity index 100% rename from data/recoilme_recoilme-gemma-2-9B-v0.3.json rename to data/models/recoilme_recoilme-gemma-2-9B-v0.3.json diff --git a/data/recoilme_recoilme-gemma-2-9B-v0.4.json b/data/models/recoilme_recoilme-gemma-2-9B-v0.4.json similarity index 100% rename from data/recoilme_recoilme-gemma-2-9B-v0.4.json rename to data/models/recoilme_recoilme-gemma-2-9B-v0.4.json diff --git a/data/recoilme_recoilme-gemma-2-9B-v0.5.json b/data/models/recoilme_recoilme-gemma-2-9B-v0.5.json similarity index 100% rename from data/recoilme_recoilme-gemma-2-9B-v0.5.json rename to data/models/recoilme_recoilme-gemma-2-9B-v0.5.json diff --git a/data/redrix_AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS.json b/data/models/redrix_AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS.json similarity index 100% rename from data/redrix_AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS.json rename to data/models/redrix_AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS.json diff --git a/data/redrix_patricide-12B-Unslop-Mell.json b/data/models/redrix_patricide-12B-Unslop-Mell.json similarity index 100% rename from data/redrix_patricide-12B-Unslop-Mell.json rename to data/models/redrix_patricide-12B-Unslop-Mell.json diff --git a/data/refuelai_Llama-3-Refueled.json b/data/models/refuelai_Llama-3-Refueled.json similarity index 100% rename from data/refuelai_Llama-3-Refueled.json rename to data/models/refuelai_Llama-3-Refueled.json diff --git a/data/rhplus0831_maid-yuzu-v7.json b/data/models/rhplus0831_maid-yuzu-v7.json similarity index 100% rename from data/rhplus0831_maid-yuzu-v7.json rename to data/models/rhplus0831_maid-yuzu-v7.json diff --git a/data/rhymes-ai_Aria.json b/data/models/rhymes-ai_Aria.json similarity index 100% rename from data/rhymes-ai_Aria.json rename to data/models/rhymes-ai_Aria.json diff --git a/data/rhysjones_phi-2-orange-v2.json b/data/models/rhysjones_phi-2-orange-v2.json similarity index 100% rename from data/rhysjones_phi-2-orange-v2.json rename to data/models/rhysjones_phi-2-orange-v2.json diff --git a/data/riaz_FineLlama-3.1-8B.json b/data/models/riaz_FineLlama-3.1-8B.json similarity index 99% rename from data/riaz_FineLlama-3.1-8B.json rename to data/models/riaz_FineLlama-3.1-8B.json index dd0f132220d8945c30a840020a3e2bc32f7b5066..6a38524505fecaf8ed46620c262741c031a757ea 100644 --- a/data/riaz_FineLlama-3.1-8B.json +++ b/data/models/riaz_FineLlama-3.1-8B.json @@ -5,7 +5,7 @@ "developer": "riaz", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "LlamaForCausalLM", "params_billions": "8.03" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4137 + "score": 0.4373 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4565 + "score": 0.4586 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0453 + "score": 0.0514 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.276 + "score": 0.2752 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3776 + "score": 0.3763 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2978 + "score": 0.2964 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4373 + "score": 0.4137 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4586 + "score": 0.4565 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0514 + "score": 0.0453 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2752 + "score": 0.276 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3763 + "score": 0.3776 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2964 + "score": 0.2978 } } ], diff --git a/data/rmdhirr_Gluon-8B.json b/data/models/rmdhirr_Gluon-8B.json similarity index 100% rename from data/rmdhirr_Gluon-8B.json rename to data/models/rmdhirr_Gluon-8B.json diff --git a/data/rombodawg_Rombos-Coder-V2.5-Qwen-14b.json b/data/models/rombodawg_Rombos-Coder-V2.5-Qwen-14b.json similarity index 100% rename from data/rombodawg_Rombos-Coder-V2.5-Qwen-14b.json rename to data/models/rombodawg_Rombos-Coder-V2.5-Qwen-14b.json diff --git a/data/rombodawg_Rombos-Coder-V2.5-Qwen-7b.json b/data/models/rombodawg_Rombos-Coder-V2.5-Qwen-7b.json similarity index 100% rename from data/rombodawg_Rombos-Coder-V2.5-Qwen-7b.json rename to data/models/rombodawg_Rombos-Coder-V2.5-Qwen-7b.json diff --git a/data/rombodawg_Rombos-LLM-V2.5-Qwen-0.5b.json b/data/models/rombodawg_Rombos-LLM-V2.5-Qwen-0.5b.json similarity index 100% rename from data/rombodawg_Rombos-LLM-V2.5-Qwen-0.5b.json rename to data/models/rombodawg_Rombos-LLM-V2.5-Qwen-0.5b.json diff --git a/data/rombodawg_Rombos-LLM-V2.5-Qwen-1.5b.json b/data/models/rombodawg_Rombos-LLM-V2.5-Qwen-1.5b.json similarity index 100% rename from data/rombodawg_Rombos-LLM-V2.5-Qwen-1.5b.json rename to data/models/rombodawg_Rombos-LLM-V2.5-Qwen-1.5b.json diff --git a/data/rombodawg_Rombos-LLM-V2.5-Qwen-14b.json b/data/models/rombodawg_Rombos-LLM-V2.5-Qwen-14b.json similarity index 100% rename from data/rombodawg_Rombos-LLM-V2.5-Qwen-14b.json rename to data/models/rombodawg_Rombos-LLM-V2.5-Qwen-14b.json diff --git a/data/rombodawg_Rombos-LLM-V2.5-Qwen-32b.json b/data/models/rombodawg_Rombos-LLM-V2.5-Qwen-32b.json similarity index 100% rename from data/rombodawg_Rombos-LLM-V2.5-Qwen-32b.json rename to data/models/rombodawg_Rombos-LLM-V2.5-Qwen-32b.json diff --git a/data/rombodawg_Rombos-LLM-V2.5-Qwen-3b.json b/data/models/rombodawg_Rombos-LLM-V2.5-Qwen-3b.json similarity index 100% rename from data/rombodawg_Rombos-LLM-V2.5-Qwen-3b.json rename to data/models/rombodawg_Rombos-LLM-V2.5-Qwen-3b.json diff --git a/data/rombodawg_Rombos-LLM-V2.5-Qwen-72b.json b/data/models/rombodawg_Rombos-LLM-V2.5-Qwen-72b.json similarity index 100% rename from data/rombodawg_Rombos-LLM-V2.5-Qwen-72b.json rename to data/models/rombodawg_Rombos-LLM-V2.5-Qwen-72b.json diff --git a/data/rombodawg_Rombos-LLM-V2.5-Qwen-7b.json b/data/models/rombodawg_Rombos-LLM-V2.5-Qwen-7b.json similarity index 100% rename from data/rombodawg_Rombos-LLM-V2.5-Qwen-7b.json rename to data/models/rombodawg_Rombos-LLM-V2.5-Qwen-7b.json diff --git a/data/rombodawg_Rombos-LLM-V2.5.1-Qwen-3b.json b/data/models/rombodawg_Rombos-LLM-V2.5.1-Qwen-3b.json similarity index 100% rename from data/rombodawg_Rombos-LLM-V2.5.1-Qwen-3b.json rename to data/models/rombodawg_Rombos-LLM-V2.5.1-Qwen-3b.json diff --git a/data/rombodawg_Rombos-LLM-V2.6-Nemotron-70b.json b/data/models/rombodawg_Rombos-LLM-V2.6-Nemotron-70b.json similarity index 100% rename from data/rombodawg_Rombos-LLM-V2.6-Nemotron-70b.json rename to data/models/rombodawg_Rombos-LLM-V2.6-Nemotron-70b.json diff --git a/data/rombodawg_Rombos-LLM-V2.6-Qwen-14b.json b/data/models/rombodawg_Rombos-LLM-V2.6-Qwen-14b.json similarity index 100% rename from data/rombodawg_Rombos-LLM-V2.6-Qwen-14b.json rename to data/models/rombodawg_Rombos-LLM-V2.6-Qwen-14b.json diff --git a/data/rombodawg_rombos_Replete-Coder-Instruct-8b-Merged.json b/data/models/rombodawg_rombos_Replete-Coder-Instruct-8b-Merged.json similarity index 100% rename from data/rombodawg_rombos_Replete-Coder-Instruct-8b-Merged.json rename to data/models/rombodawg_rombos_Replete-Coder-Instruct-8b-Merged.json diff --git a/data/rombodawg_rombos_Replete-Coder-Llama3-8B.json b/data/models/rombodawg_rombos_Replete-Coder-Llama3-8B.json similarity index 100% rename from data/rombodawg_rombos_Replete-Coder-Llama3-8B.json rename to data/models/rombodawg_rombos_Replete-Coder-Llama3-8B.json diff --git a/data/rootxhacker_Apollo-70B.json b/data/models/rootxhacker_Apollo-70B.json similarity index 100% rename from data/rootxhacker_Apollo-70B.json rename to data/models/rootxhacker_Apollo-70B.json diff --git a/data/rootxhacker_Apollo_v2-32B.json b/data/models/rootxhacker_Apollo_v2-32B.json similarity index 100% rename from data/rootxhacker_Apollo_v2-32B.json rename to data/models/rootxhacker_Apollo_v2-32B.json diff --git a/data/rootxhacker_apollo-7B.json b/data/models/rootxhacker_apollo-7B.json similarity index 100% rename from data/rootxhacker_apollo-7B.json rename to data/models/rootxhacker_apollo-7B.json diff --git a/data/rsh345_mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B.json b/data/models/rsh345_mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B.json similarity index 100% rename from data/rsh345_mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B.json rename to data/models/rsh345_mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B.json diff --git a/data/rubenroy_Geneva-12B-GCv2-5m.json b/data/models/rubenroy_Geneva-12B-GCv2-5m.json similarity index 100% rename from data/rubenroy_Geneva-12B-GCv2-5m.json rename to data/models/rubenroy_Geneva-12B-GCv2-5m.json diff --git a/data/rubenroy_Gilgamesh-72B.json b/data/models/rubenroy_Gilgamesh-72B.json similarity index 100% rename from data/rubenroy_Gilgamesh-72B.json rename to data/models/rubenroy_Gilgamesh-72B.json diff --git a/data/rubenroy_Zurich-14B-GCv2-5m.json b/data/models/rubenroy_Zurich-14B-GCv2-5m.json similarity index 100% rename from data/rubenroy_Zurich-14B-GCv2-5m.json rename to data/models/rubenroy_Zurich-14B-GCv2-5m.json diff --git a/data/ruizhe1217_sft-s1-qwen-0.5b.json b/data/models/ruizhe1217_sft-s1-qwen-0.5b.json similarity index 100% rename from data/ruizhe1217_sft-s1-qwen-0.5b.json rename to data/models/ruizhe1217_sft-s1-qwen-0.5b.json diff --git a/data/rwitz_go-bruins-v2.json b/data/models/rwitz_go-bruins-v2.json similarity index 100% rename from data/rwitz_go-bruins-v2.json rename to data/models/rwitz_go-bruins-v2.json diff --git a/data/sabersaleh_Llama2-7B-CPO.json b/data/models/sabersaleh_Llama2-7B-CPO.json similarity index 100% rename from data/sabersaleh_Llama2-7B-CPO.json rename to data/models/sabersaleh_Llama2-7B-CPO.json diff --git a/data/sabersaleh_Llama2-7B-DPO.json b/data/models/sabersaleh_Llama2-7B-DPO.json similarity index 100% rename from data/sabersaleh_Llama2-7B-DPO.json rename to data/models/sabersaleh_Llama2-7B-DPO.json diff --git a/data/sabersaleh_Llama2-7B-IPO.json b/data/models/sabersaleh_Llama2-7B-IPO.json similarity index 100% rename from data/sabersaleh_Llama2-7B-IPO.json rename to data/models/sabersaleh_Llama2-7B-IPO.json diff --git a/data/sabersaleh_Llama2-7B-KTO.json b/data/models/sabersaleh_Llama2-7B-KTO.json similarity index 100% rename from data/sabersaleh_Llama2-7B-KTO.json rename to data/models/sabersaleh_Llama2-7B-KTO.json diff --git a/data/sabersaleh_Llama2-7B-SPO.json b/data/models/sabersaleh_Llama2-7B-SPO.json similarity index 100% rename from data/sabersaleh_Llama2-7B-SPO.json rename to data/models/sabersaleh_Llama2-7B-SPO.json diff --git a/data/sabersaleh_Llama2-7B-SimPO.json b/data/models/sabersaleh_Llama2-7B-SimPO.json similarity index 100% rename from data/sabersaleh_Llama2-7B-SimPO.json rename to data/models/sabersaleh_Llama2-7B-SimPO.json diff --git a/data/sabersaleh_Llama3.json b/data/models/sabersaleh_Llama3.json similarity index 100% rename from data/sabersaleh_Llama3.json rename to data/models/sabersaleh_Llama3.json diff --git a/data/sabersalehk_Llama3-001-300.json b/data/models/sabersalehk_Llama3-001-300.json similarity index 100% rename from data/sabersalehk_Llama3-001-300.json rename to data/models/sabersalehk_Llama3-001-300.json diff --git a/data/sabersalehk_Llama3-SimPO.json b/data/models/sabersalehk_Llama3-SimPO.json similarity index 100% rename from data/sabersalehk_Llama3-SimPO.json rename to data/models/sabersalehk_Llama3-SimPO.json diff --git a/data/sabersalehk_Llama3_001_200.json b/data/models/sabersalehk_Llama3_001_200.json similarity index 100% rename from data/sabersalehk_Llama3_001_200.json rename to data/models/sabersalehk_Llama3_001_200.json diff --git a/data/sabersalehk_Llama3_01_300.json b/data/models/sabersalehk_Llama3_01_300.json similarity index 100% rename from data/sabersalehk_Llama3_01_300.json rename to data/models/sabersalehk_Llama3_01_300.json diff --git a/data/saishf_Fimbulvetr-Kuro-Lotus-10.7B.json b/data/models/saishf_Fimbulvetr-Kuro-Lotus-10.7B.json similarity index 100% rename from data/saishf_Fimbulvetr-Kuro-Lotus-10.7B.json rename to data/models/saishf_Fimbulvetr-Kuro-Lotus-10.7B.json diff --git a/data/saishf_Neural-SOVLish-Devil-8B-L3.json b/data/models/saishf_Neural-SOVLish-Devil-8B-L3.json similarity index 100% rename from data/saishf_Neural-SOVLish-Devil-8B-L3.json rename to data/models/saishf_Neural-SOVLish-Devil-8B-L3.json diff --git a/data/saishshinde15_TethysAI_Base_Reasoning.json b/data/models/saishshinde15_TethysAI_Base_Reasoning.json similarity index 100% rename from data/saishshinde15_TethysAI_Base_Reasoning.json rename to data/models/saishshinde15_TethysAI_Base_Reasoning.json diff --git a/data/saishshinde15_TethysAI_Vortex.json b/data/models/saishshinde15_TethysAI_Vortex.json similarity index 100% rename from data/saishshinde15_TethysAI_Vortex.json rename to data/models/saishshinde15_TethysAI_Vortex.json diff --git a/data/saishshinde15_TethysAI_Vortex_Reasoning.json b/data/models/saishshinde15_TethysAI_Vortex_Reasoning.json similarity index 100% rename from data/saishshinde15_TethysAI_Vortex_Reasoning.json rename to data/models/saishshinde15_TethysAI_Vortex_Reasoning.json diff --git a/data/sakaltcommunity_novablast-preview.json b/data/models/sakaltcommunity_novablast-preview.json similarity index 100% rename from data/sakaltcommunity_novablast-preview.json rename to data/models/sakaltcommunity_novablast-preview.json diff --git a/data/sakaltcommunity_sakaltum-7b.json b/data/models/sakaltcommunity_sakaltum-7b.json similarity index 100% rename from data/sakaltcommunity_sakaltum-7b.json rename to data/models/sakaltcommunity_sakaltum-7b.json diff --git a/data/sakhan10_quantized_open_llama_3b_v2.json b/data/models/sakhan10_quantized_open_llama_3b_v2.json similarity index 100% rename from data/sakhan10_quantized_open_llama_3b_v2.json rename to data/models/sakhan10_quantized_open_llama_3b_v2.json diff --git a/data/saltlux_luxia-21.4b-alignment-v1.0.json b/data/models/saltlux_luxia-21.4b-alignment-v1.0.json similarity index 100% rename from data/saltlux_luxia-21.4b-alignment-v1.0.json rename to data/models/saltlux_luxia-21.4b-alignment-v1.0.json diff --git a/data/saltlux_luxia-21.4b-alignment-v1.2.json b/data/models/saltlux_luxia-21.4b-alignment-v1.2.json similarity index 100% rename from data/saltlux_luxia-21.4b-alignment-v1.2.json rename to data/models/saltlux_luxia-21.4b-alignment-v1.2.json diff --git a/data/sam-paech_Darkest-muse-v1.json b/data/models/sam-paech_Darkest-muse-v1.json similarity index 100% rename from data/sam-paech_Darkest-muse-v1.json rename to data/models/sam-paech_Darkest-muse-v1.json diff --git a/data/sam-paech_Delirium-v1.json b/data/models/sam-paech_Delirium-v1.json similarity index 100% rename from data/sam-paech_Delirium-v1.json rename to data/models/sam-paech_Delirium-v1.json diff --git a/data/sam-paech_Quill-v1.json b/data/models/sam-paech_Quill-v1.json similarity index 100% rename from data/sam-paech_Quill-v1.json rename to data/models/sam-paech_Quill-v1.json diff --git a/data/sarvamai_OpenHathi-7B-Hi-v0.1-Base.json b/data/models/sarvamai_OpenHathi-7B-Hi-v0.1-Base.json similarity index 100% rename from data/sarvamai_OpenHathi-7B-Hi-v0.1-Base.json rename to data/models/sarvamai_OpenHathi-7B-Hi-v0.1-Base.json diff --git a/data/schnapss_testmerge-7b.json b/data/models/schnapss_testmerge-7b.json similarity index 100% rename from data/schnapss_testmerge-7b.json rename to data/models/schnapss_testmerge-7b.json diff --git a/data/sci-m-wang_Mistral-7B-Instruct-sa-v0.1.json b/data/models/sci-m-wang_Mistral-7B-Instruct-sa-v0.1.json similarity index 100% rename from data/sci-m-wang_Mistral-7B-Instruct-sa-v0.1.json rename to data/models/sci-m-wang_Mistral-7B-Instruct-sa-v0.1.json diff --git a/data/sci-m-wang_Phi-3-mini-4k-instruct-sa-v0.1.json b/data/models/sci-m-wang_Phi-3-mini-4k-instruct-sa-v0.1.json similarity index 100% rename from data/sci-m-wang_Phi-3-mini-4k-instruct-sa-v0.1.json rename to data/models/sci-m-wang_Phi-3-mini-4k-instruct-sa-v0.1.json diff --git a/data/sci-m-wang_deepseek-llm-7b-chat-sa-v0.1.json b/data/models/sci-m-wang_deepseek-llm-7b-chat-sa-v0.1.json similarity index 100% rename from data/sci-m-wang_deepseek-llm-7b-chat-sa-v0.1.json rename to data/models/sci-m-wang_deepseek-llm-7b-chat-sa-v0.1.json diff --git a/data/securin_Securin-LLM-V2.5-Qwen-1.5B.json b/data/models/securin_Securin-LLM-V2.5-Qwen-1.5B.json similarity index 100% rename from data/securin_Securin-LLM-V2.5-Qwen-1.5B.json rename to data/models/securin_Securin-LLM-V2.5-Qwen-1.5B.json diff --git a/data/senseable_WestLake-7B-v2.json b/data/models/senseable_WestLake-7B-v2.json similarity index 100% rename from data/senseable_WestLake-7B-v2.json rename to data/models/senseable_WestLake-7B-v2.json diff --git a/data/sequelbox_Llama3.1-70B-PlumChat.json b/data/models/sequelbox_Llama3.1-70B-PlumChat.json similarity index 100% rename from data/sequelbox_Llama3.1-70B-PlumChat.json rename to data/models/sequelbox_Llama3.1-70B-PlumChat.json diff --git a/data/sequelbox_Llama3.1-8B-MOTH.json b/data/models/sequelbox_Llama3.1-8B-MOTH.json similarity index 100% rename from data/sequelbox_Llama3.1-8B-MOTH.json rename to data/models/sequelbox_Llama3.1-8B-MOTH.json diff --git a/data/sequelbox_Llama3.1-8B-PlumChat.json b/data/models/sequelbox_Llama3.1-8B-PlumChat.json similarity index 100% rename from data/sequelbox_Llama3.1-8B-PlumChat.json rename to data/models/sequelbox_Llama3.1-8B-PlumChat.json diff --git a/data/sequelbox_Llama3.1-8B-PlumCode.json b/data/models/sequelbox_Llama3.1-8B-PlumCode.json similarity index 100% rename from data/sequelbox_Llama3.1-8B-PlumCode.json rename to data/models/sequelbox_Llama3.1-8B-PlumCode.json diff --git a/data/sequelbox_Llama3.1-8B-PlumMath.json b/data/models/sequelbox_Llama3.1-8B-PlumMath.json similarity index 100% rename from data/sequelbox_Llama3.1-8B-PlumMath.json rename to data/models/sequelbox_Llama3.1-8B-PlumMath.json diff --git a/data/sequelbox_gemma-2-9B-MOTH.json b/data/models/sequelbox_gemma-2-9B-MOTH.json similarity index 100% rename from data/sequelbox_gemma-2-9B-MOTH.json rename to data/models/sequelbox_gemma-2-9B-MOTH.json diff --git a/data/sethuiyer_Llama-3.1-8B-Experimental-1206-Instruct.json b/data/models/sethuiyer_Llama-3.1-8B-Experimental-1206-Instruct.json similarity index 100% rename from data/sethuiyer_Llama-3.1-8B-Experimental-1206-Instruct.json rename to data/models/sethuiyer_Llama-3.1-8B-Experimental-1206-Instruct.json diff --git a/data/sethuiyer_Llama-3.1-8B-Experimental-1208-Instruct.json b/data/models/sethuiyer_Llama-3.1-8B-Experimental-1208-Instruct.json similarity index 100% rename from data/sethuiyer_Llama-3.1-8B-Experimental-1208-Instruct.json rename to data/models/sethuiyer_Llama-3.1-8B-Experimental-1208-Instruct.json diff --git a/data/sethuiyer_LlamaZero-3.1-8B-Experimental-1208.json b/data/models/sethuiyer_LlamaZero-3.1-8B-Experimental-1208.json similarity index 100% rename from data/sethuiyer_LlamaZero-3.1-8B-Experimental-1208.json rename to data/models/sethuiyer_LlamaZero-3.1-8B-Experimental-1208.json diff --git a/data/sethuiyer_Llamaverse-3.1-8B-Instruct.json b/data/models/sethuiyer_Llamaverse-3.1-8B-Instruct.json similarity index 100% rename from data/sethuiyer_Llamaverse-3.1-8B-Instruct.json rename to data/models/sethuiyer_Llamaverse-3.1-8B-Instruct.json diff --git a/data/sethuiyer_Llamazing-3.1-8B-Instruct.json b/data/models/sethuiyer_Llamazing-3.1-8B-Instruct.json similarity index 100% rename from data/sethuiyer_Llamazing-3.1-8B-Instruct.json rename to data/models/sethuiyer_Llamazing-3.1-8B-Instruct.json diff --git a/data/sethuiyer_Qwen2.5-7B-Anvita.json b/data/models/sethuiyer_Qwen2.5-7B-Anvita.json similarity index 100% rename from data/sethuiyer_Qwen2.5-7B-Anvita.json rename to data/models/sethuiyer_Qwen2.5-7B-Anvita.json diff --git a/data/sfairXC_FsfairX-LLaMA3-RM-v0.1.json b/data/models/sfairXC_FsfairX-LLaMA3-RM-v0.1.json similarity index 100% rename from data/sfairXC_FsfairX-LLaMA3-RM-v0.1.json rename to data/models/sfairXC_FsfairX-LLaMA3-RM-v0.1.json index 83ff30916d3f35c6333d5f652927c45d63836756..ecef2ed14b1e2210796bfe9e4943404f7ebd3b7c 100644 --- a/data/sfairXC_FsfairX-LLaMA3-RM-v0.1.json +++ b/data/models/sfairXC_FsfairX-LLaMA3-RM-v0.1.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816", + "evaluation_id": "reward-bench-2/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8338 + "score": 0.6292 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9944 + "score": 0.5916 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6513 + "score": 0.4188 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8676 + "score": 0.7667 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8644 + "score": 0.7051 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7492 + "score": 0.6647 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816", + "evaluation_id": "reward-bench/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6292 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5916 + "score": 0.8338 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4188 + "score": 0.9944 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6284 + "score": 0.6513 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7667 + "score": 0.8676 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7051 + "score": 0.8644 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6647 + "score": 0.7492 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/shadowml_BeagSake-7B.json b/data/models/shadowml_BeagSake-7B.json similarity index 100% rename from data/shadowml_BeagSake-7B.json rename to data/models/shadowml_BeagSake-7B.json diff --git a/data/shadowml_Mixolar-4x7b.json b/data/models/shadowml_Mixolar-4x7b.json similarity index 100% rename from data/shadowml_Mixolar-4x7b.json rename to data/models/shadowml_Mixolar-4x7b.json diff --git a/data/shastraai_Shastra-LLAMA2-Math-Commonsense-SFT.json b/data/models/shastraai_Shastra-LLAMA2-Math-Commonsense-SFT.json similarity index 100% rename from data/shastraai_Shastra-LLAMA2-Math-Commonsense-SFT.json rename to data/models/shastraai_Shastra-LLAMA2-Math-Commonsense-SFT.json diff --git a/data/shivam9980_NEPALI-LLM.json b/data/models/shivam9980_NEPALI-LLM.json similarity index 100% rename from data/shivam9980_NEPALI-LLM.json rename to data/models/shivam9980_NEPALI-LLM.json diff --git a/data/shivam9980_mistral-7b-news-cnn-merged.json b/data/models/shivam9980_mistral-7b-news-cnn-merged.json similarity index 100% rename from data/shivam9980_mistral-7b-news-cnn-merged.json rename to data/models/shivam9980_mistral-7b-news-cnn-merged.json diff --git a/data/shivank21_mistral_dpo_self.json b/data/models/shivank21_mistral_dpo_self.json similarity index 100% rename from data/shivank21_mistral_dpo_self.json rename to data/models/shivank21_mistral_dpo_self.json diff --git a/data/shuttleai_shuttle-3.json b/data/models/shuttleai_shuttle-3.json similarity index 100% rename from data/shuttleai_shuttle-3.json rename to data/models/shuttleai_shuttle-3.json diff --git a/data/shyamieee_Padma-v7.0.json b/data/models/shyamieee_Padma-v7.0.json similarity index 100% rename from data/shyamieee_Padma-v7.0.json rename to data/models/shyamieee_Padma-v7.0.json diff --git a/data/silma-ai_SILMA-9B-Instruct-v1.0.json b/data/models/silma-ai_SILMA-9B-Instruct-v1.0.json similarity index 100% rename from data/silma-ai_SILMA-9B-Instruct-v1.0.json rename to data/models/silma-ai_SILMA-9B-Instruct-v1.0.json diff --git a/data/silma-ai_SILMA-Kashif-2B-Instruct-v1.0.json b/data/models/silma-ai_SILMA-Kashif-2B-Instruct-v1.0.json similarity index 100% rename from data/silma-ai_SILMA-Kashif-2B-Instruct-v1.0.json rename to data/models/silma-ai_SILMA-Kashif-2B-Instruct-v1.0.json diff --git a/data/siqi00_Mistral-7B-DFT.json b/data/models/siqi00_Mistral-7B-DFT.json similarity index 100% rename from data/siqi00_Mistral-7B-DFT.json rename to data/models/siqi00_Mistral-7B-DFT.json diff --git a/data/siqi00_Mistral-7B-DFT2.json b/data/models/siqi00_Mistral-7B-DFT2.json similarity index 100% rename from data/siqi00_Mistral-7B-DFT2.json rename to data/models/siqi00_Mistral-7B-DFT2.json diff --git a/data/skumar9_Llama-medx_v2.json b/data/models/skumar9_Llama-medx_v2.json similarity index 100% rename from data/skumar9_Llama-medx_v2.json rename to data/models/skumar9_Llama-medx_v2.json diff --git a/data/skymizer_Llama2-7b-sft-chat-custom-template-dpo.json b/data/models/skymizer_Llama2-7b-sft-chat-custom-template-dpo.json similarity index 100% rename from data/skymizer_Llama2-7b-sft-chat-custom-template-dpo.json rename to data/models/skymizer_Llama2-7b-sft-chat-custom-template-dpo.json diff --git a/data/snowflake_snowflake-arctic-instruct.json b/data/models/snowflake_snowflake-arctic-instruct.json similarity index 100% rename from data/snowflake_snowflake-arctic-instruct.json rename to data/models/snowflake_snowflake-arctic-instruct.json diff --git a/data/someon98_qwen-CoMa-0.5b.json b/data/models/someon98_qwen-CoMa-0.5b.json similarity index 100% rename from data/someon98_qwen-CoMa-0.5b.json rename to data/models/someon98_qwen-CoMa-0.5b.json diff --git a/data/sometimesanotion_ChocoTrio-14B-v1.json b/data/models/sometimesanotion_ChocoTrio-14B-v1.json similarity index 100% rename from data/sometimesanotion_ChocoTrio-14B-v1.json rename to data/models/sometimesanotion_ChocoTrio-14B-v1.json diff --git a/data/sometimesanotion_IF-reasoning-experiment-40.json b/data/models/sometimesanotion_IF-reasoning-experiment-40.json similarity index 100% rename from data/sometimesanotion_IF-reasoning-experiment-40.json rename to data/models/sometimesanotion_IF-reasoning-experiment-40.json diff --git a/data/sometimesanotion_IF-reasoning-experiment-80.json b/data/models/sometimesanotion_IF-reasoning-experiment-80.json similarity index 100% rename from data/sometimesanotion_IF-reasoning-experiment-80.json rename to data/models/sometimesanotion_IF-reasoning-experiment-80.json diff --git a/data/sometimesanotion_KytheraMix-7B-v0.2.json b/data/models/sometimesanotion_KytheraMix-7B-v0.2.json similarity index 100% rename from data/sometimesanotion_KytheraMix-7B-v0.2.json rename to data/models/sometimesanotion_KytheraMix-7B-v0.2.json diff --git a/data/sometimesanotion_Lamarck-14B-v0.1-experimental.json b/data/models/sometimesanotion_Lamarck-14B-v0.1-experimental.json similarity index 100% rename from data/sometimesanotion_Lamarck-14B-v0.1-experimental.json rename to data/models/sometimesanotion_Lamarck-14B-v0.1-experimental.json diff --git a/data/sometimesanotion_Lamarck-14B-v0.3.json b/data/models/sometimesanotion_Lamarck-14B-v0.3.json similarity index 100% rename from data/sometimesanotion_Lamarck-14B-v0.3.json rename to data/models/sometimesanotion_Lamarck-14B-v0.3.json diff --git a/data/sometimesanotion_Lamarck-14B-v0.4-Qwenvergence.json b/data/models/sometimesanotion_Lamarck-14B-v0.4-Qwenvergence.json similarity index 100% rename from data/sometimesanotion_Lamarck-14B-v0.4-Qwenvergence.json rename to data/models/sometimesanotion_Lamarck-14B-v0.4-Qwenvergence.json diff --git a/data/sometimesanotion_Lamarck-14B-v0.6-002-model_stock.json b/data/models/sometimesanotion_Lamarck-14B-v0.6-002-model_stock.json similarity index 100% rename from data/sometimesanotion_Lamarck-14B-v0.6-002-model_stock.json rename to data/models/sometimesanotion_Lamarck-14B-v0.6-002-model_stock.json diff --git a/data/sometimesanotion_Lamarck-14B-v0.6-model_stock.json b/data/models/sometimesanotion_Lamarck-14B-v0.6-model_stock.json similarity index 100% rename from data/sometimesanotion_Lamarck-14B-v0.6-model_stock.json rename to data/models/sometimesanotion_Lamarck-14B-v0.6-model_stock.json diff --git a/data/sometimesanotion_Lamarck-14B-v0.6.json b/data/models/sometimesanotion_Lamarck-14B-v0.6.json similarity index 100% rename from data/sometimesanotion_Lamarck-14B-v0.6.json rename to data/models/sometimesanotion_Lamarck-14B-v0.6.json diff --git a/data/sometimesanotion_Lamarck-14B-v0.7-Fusion.json b/data/models/sometimesanotion_Lamarck-14B-v0.7-Fusion.json similarity index 100% rename from data/sometimesanotion_Lamarck-14B-v0.7-Fusion.json rename to data/models/sometimesanotion_Lamarck-14B-v0.7-Fusion.json diff --git a/data/sometimesanotion_Lamarck-14B-v0.7-rc1.json b/data/models/sometimesanotion_Lamarck-14B-v0.7-rc1.json similarity index 100% rename from data/sometimesanotion_Lamarck-14B-v0.7-rc1.json rename to data/models/sometimesanotion_Lamarck-14B-v0.7-rc1.json diff --git a/data/sometimesanotion_Lamarck-14B-v0.7-rc4.json b/data/models/sometimesanotion_Lamarck-14B-v0.7-rc4.json similarity index 100% rename from data/sometimesanotion_Lamarck-14B-v0.7-rc4.json rename to data/models/sometimesanotion_Lamarck-14B-v0.7-rc4.json diff --git a/data/sometimesanotion_LamarckInfusion-14B-v1.json b/data/models/sometimesanotion_LamarckInfusion-14B-v1.json similarity index 100% rename from data/sometimesanotion_LamarckInfusion-14B-v1.json rename to data/models/sometimesanotion_LamarckInfusion-14B-v1.json diff --git a/data/sometimesanotion_LamarckInfusion-14B-v2-hi.json b/data/models/sometimesanotion_LamarckInfusion-14B-v2-hi.json similarity index 100% rename from data/sometimesanotion_LamarckInfusion-14B-v2-hi.json rename to data/models/sometimesanotion_LamarckInfusion-14B-v2-hi.json diff --git a/data/sometimesanotion_LamarckInfusion-14B-v2-lo.json b/data/models/sometimesanotion_LamarckInfusion-14B-v2-lo.json similarity index 100% rename from data/sometimesanotion_LamarckInfusion-14B-v2-lo.json rename to data/models/sometimesanotion_LamarckInfusion-14B-v2-lo.json diff --git a/data/sometimesanotion_LamarckInfusion-14B-v2.json b/data/models/sometimesanotion_LamarckInfusion-14B-v2.json similarity index 100% rename from data/sometimesanotion_LamarckInfusion-14B-v2.json rename to data/models/sometimesanotion_LamarckInfusion-14B-v2.json diff --git a/data/sometimesanotion_LamarckInfusion-14B-v3.json b/data/models/sometimesanotion_LamarckInfusion-14B-v3.json similarity index 100% rename from data/sometimesanotion_LamarckInfusion-14B-v3.json rename to data/models/sometimesanotion_LamarckInfusion-14B-v3.json diff --git a/data/sometimesanotion_Qwen-14B-ProseStock-v4.json b/data/models/sometimesanotion_Qwen-14B-ProseStock-v4.json similarity index 100% rename from data/sometimesanotion_Qwen-14B-ProseStock-v4.json rename to data/models/sometimesanotion_Qwen-14B-ProseStock-v4.json diff --git a/data/sometimesanotion_Qwen-2.5-14B-Virmarckeoso.json b/data/models/sometimesanotion_Qwen-2.5-14B-Virmarckeoso.json similarity index 100% rename from data/sometimesanotion_Qwen-2.5-14B-Virmarckeoso.json rename to data/models/sometimesanotion_Qwen-2.5-14B-Virmarckeoso.json diff --git a/data/sometimesanotion_Qwen2.5-14B-Vimarckoso-v2.json b/data/models/sometimesanotion_Qwen2.5-14B-Vimarckoso-v2.json similarity index 100% rename from data/sometimesanotion_Qwen2.5-14B-Vimarckoso-v2.json rename to data/models/sometimesanotion_Qwen2.5-14B-Vimarckoso-v2.json diff --git a/data/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-IF-Variant.json b/data/models/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-IF-Variant.json similarity index 100% rename from data/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-IF-Variant.json rename to data/models/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-IF-Variant.json diff --git a/data/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-Prose01.json b/data/models/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-Prose01.json similarity index 100% rename from data/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-Prose01.json rename to data/models/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-Prose01.json diff --git a/data/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-model_stock.json b/data/models/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-model_stock.json similarity index 100% rename from data/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-model_stock.json rename to data/models/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-model_stock.json diff --git a/data/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3.json b/data/models/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3.json similarity index 100% rename from data/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3.json rename to data/models/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3.json diff --git a/data/sometimesanotion_Qwen2.5-14B-Vimarckoso.json b/data/models/sometimesanotion_Qwen2.5-14B-Vimarckoso.json similarity index 100% rename from data/sometimesanotion_Qwen2.5-14B-Vimarckoso.json rename to data/models/sometimesanotion_Qwen2.5-14B-Vimarckoso.json diff --git a/data/sometimesanotion_Qwen2.5-7B-Gordion-v0.1-Prose.json b/data/models/sometimesanotion_Qwen2.5-7B-Gordion-v0.1-Prose.json similarity index 100% rename from data/sometimesanotion_Qwen2.5-7B-Gordion-v0.1-Prose.json rename to data/models/sometimesanotion_Qwen2.5-7B-Gordion-v0.1-Prose.json diff --git a/data/sometimesanotion_Qwen2.5-7B-Gordion-v0.1-Reason.json b/data/models/sometimesanotion_Qwen2.5-7B-Gordion-v0.1-Reason.json similarity index 100% rename from data/sometimesanotion_Qwen2.5-7B-Gordion-v0.1-Reason.json rename to data/models/sometimesanotion_Qwen2.5-7B-Gordion-v0.1-Reason.json diff --git a/data/sometimesanotion_Qwen2.5-7B-Gordion-v0.1.json b/data/models/sometimesanotion_Qwen2.5-7B-Gordion-v0.1.json similarity index 100% rename from data/sometimesanotion_Qwen2.5-7B-Gordion-v0.1.json rename to data/models/sometimesanotion_Qwen2.5-7B-Gordion-v0.1.json diff --git a/data/sometimesanotion_Qwentessential-14B-v1.json b/data/models/sometimesanotion_Qwentessential-14B-v1.json similarity index 100% rename from data/sometimesanotion_Qwentessential-14B-v1.json rename to data/models/sometimesanotion_Qwentessential-14B-v1.json diff --git a/data/sometimesanotion_Qwentinuum-14B-v013.json b/data/models/sometimesanotion_Qwentinuum-14B-v013.json similarity index 100% rename from data/sometimesanotion_Qwentinuum-14B-v013.json rename to data/models/sometimesanotion_Qwentinuum-14B-v013.json diff --git a/data/sometimesanotion_Qwentinuum-14B-v1.json b/data/models/sometimesanotion_Qwentinuum-14B-v1.json similarity index 100% rename from data/sometimesanotion_Qwentinuum-14B-v1.json rename to data/models/sometimesanotion_Qwentinuum-14B-v1.json diff --git a/data/sometimesanotion_Qwentinuum-14B-v2.json b/data/models/sometimesanotion_Qwentinuum-14B-v2.json similarity index 100% rename from data/sometimesanotion_Qwentinuum-14B-v2.json rename to data/models/sometimesanotion_Qwentinuum-14B-v2.json diff --git a/data/sometimesanotion_Qwentinuum-14B-v3.json b/data/models/sometimesanotion_Qwentinuum-14B-v3.json similarity index 100% rename from data/sometimesanotion_Qwentinuum-14B-v3.json rename to data/models/sometimesanotion_Qwentinuum-14B-v3.json diff --git a/data/sometimesanotion_Qwentinuum-14B-v5.json b/data/models/sometimesanotion_Qwentinuum-14B-v5.json similarity index 100% rename from data/sometimesanotion_Qwentinuum-14B-v5.json rename to data/models/sometimesanotion_Qwentinuum-14B-v5.json diff --git a/data/sometimesanotion_Qwentinuum-14B-v6-Prose.json b/data/models/sometimesanotion_Qwentinuum-14B-v6-Prose.json similarity index 100% rename from data/sometimesanotion_Qwentinuum-14B-v6-Prose.json rename to data/models/sometimesanotion_Qwentinuum-14B-v6-Prose.json diff --git a/data/sometimesanotion_Qwentinuum-14B-v6.json b/data/models/sometimesanotion_Qwentinuum-14B-v6.json similarity index 100% rename from data/sometimesanotion_Qwentinuum-14B-v6.json rename to data/models/sometimesanotion_Qwentinuum-14B-v6.json diff --git a/data/sometimesanotion_Qwentinuum-14B-v7.json b/data/models/sometimesanotion_Qwentinuum-14B-v7.json similarity index 100% rename from data/sometimesanotion_Qwentinuum-14B-v7.json rename to data/models/sometimesanotion_Qwentinuum-14B-v7.json diff --git a/data/sometimesanotion_Qwentinuum-14B-v8.json b/data/models/sometimesanotion_Qwentinuum-14B-v8.json similarity index 100% rename from data/sometimesanotion_Qwentinuum-14B-v8.json rename to data/models/sometimesanotion_Qwentinuum-14B-v8.json diff --git a/data/sometimesanotion_Qwentinuum-14B-v9.json b/data/models/sometimesanotion_Qwentinuum-14B-v9.json similarity index 100% rename from data/sometimesanotion_Qwentinuum-14B-v9.json rename to data/models/sometimesanotion_Qwentinuum-14B-v9.json diff --git a/data/sometimesanotion_Qwenvergence-14B-qv256.json b/data/models/sometimesanotion_Qwenvergence-14B-qv256.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-qv256.json rename to data/models/sometimesanotion_Qwenvergence-14B-qv256.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v0.6-004-model_stock.json b/data/models/sometimesanotion_Qwenvergence-14B-v0.6-004-model_stock.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v0.6-004-model_stock.json rename to data/models/sometimesanotion_Qwenvergence-14B-v0.6-004-model_stock.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v10.json b/data/models/sometimesanotion_Qwenvergence-14B-v10.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v10.json rename to data/models/sometimesanotion_Qwenvergence-14B-v10.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v11.json b/data/models/sometimesanotion_Qwenvergence-14B-v11.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v11.json rename to data/models/sometimesanotion_Qwenvergence-14B-v11.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v12-Prose-DS.json b/data/models/sometimesanotion_Qwenvergence-14B-v12-Prose-DS.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v12-Prose-DS.json rename to data/models/sometimesanotion_Qwenvergence-14B-v12-Prose-DS.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v12-Prose.json b/data/models/sometimesanotion_Qwenvergence-14B-v12-Prose.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v12-Prose.json rename to data/models/sometimesanotion_Qwenvergence-14B-v12-Prose.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v13-Prose-DS.json b/data/models/sometimesanotion_Qwenvergence-14B-v13-Prose-DS.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v13-Prose-DS.json rename to data/models/sometimesanotion_Qwenvergence-14B-v13-Prose-DS.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v15-Prose-MS.json b/data/models/sometimesanotion_Qwenvergence-14B-v15-Prose-MS.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v15-Prose-MS.json rename to data/models/sometimesanotion_Qwenvergence-14B-v15-Prose-MS.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v2-Prose.json b/data/models/sometimesanotion_Qwenvergence-14B-v2-Prose.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v2-Prose.json rename to data/models/sometimesanotion_Qwenvergence-14B-v2-Prose.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v3-Prose.json b/data/models/sometimesanotion_Qwenvergence-14B-v3-Prose.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v3-Prose.json rename to data/models/sometimesanotion_Qwenvergence-14B-v3-Prose.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v3-Reason.json b/data/models/sometimesanotion_Qwenvergence-14B-v3-Reason.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v3-Reason.json rename to data/models/sometimesanotion_Qwenvergence-14B-v3-Reason.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v3.json b/data/models/sometimesanotion_Qwenvergence-14B-v3.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v3.json rename to data/models/sometimesanotion_Qwenvergence-14B-v3.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v6-Prose-model_stock.json b/data/models/sometimesanotion_Qwenvergence-14B-v6-Prose-model_stock.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v6-Prose-model_stock.json rename to data/models/sometimesanotion_Qwenvergence-14B-v6-Prose-model_stock.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v6-Prose.json b/data/models/sometimesanotion_Qwenvergence-14B-v6-Prose.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v6-Prose.json rename to data/models/sometimesanotion_Qwenvergence-14B-v6-Prose.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v8.json b/data/models/sometimesanotion_Qwenvergence-14B-v8.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v8.json rename to data/models/sometimesanotion_Qwenvergence-14B-v8.json diff --git a/data/sometimesanotion_Qwenvergence-14B-v9.json b/data/models/sometimesanotion_Qwenvergence-14B-v9.json similarity index 100% rename from data/sometimesanotion_Qwenvergence-14B-v9.json rename to data/models/sometimesanotion_Qwenvergence-14B-v9.json diff --git a/data/sometimesanotion_lamarck-14b-prose-model_stock.json b/data/models/sometimesanotion_lamarck-14b-prose-model_stock.json similarity index 100% rename from data/sometimesanotion_lamarck-14b-prose-model_stock.json rename to data/models/sometimesanotion_lamarck-14b-prose-model_stock.json diff --git a/data/sometimesanotion_lamarck-14b-reason-model_stock.json b/data/models/sometimesanotion_lamarck-14b-reason-model_stock.json similarity index 100% rename from data/sometimesanotion_lamarck-14b-reason-model_stock.json rename to data/models/sometimesanotion_lamarck-14b-reason-model_stock.json diff --git a/data/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415.json b/data/models/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415.json similarity index 100% rename from data/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415.json rename to data/models/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415.json diff --git a/data/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205.json b/data/models/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205.json similarity index 100% rename from data/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205.json rename to data/models/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205.json diff --git a/data/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522.json b/data/models/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522.json similarity index 100% rename from data/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522.json rename to data/models/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522.json diff --git a/data/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbc-213steps.json b/data/models/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbc-213steps.json similarity index 100% rename from data/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbc-213steps.json rename to data/models/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbc-213steps.json diff --git a/data/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbo-180steps.json b/data/models/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbo-180steps.json similarity index 100% rename from data/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbo-180steps.json rename to data/models/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbo-180steps.json diff --git a/data/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbr-180steps.json b/data/models/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbr-180steps.json similarity index 100% rename from data/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbr-180steps.json rename to data/models/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbr-180steps.json diff --git a/data/sophosympatheia_Midnight-Miqu-70B-v1.5.json b/data/models/sophosympatheia_Midnight-Miqu-70B-v1.5.json similarity index 100% rename from data/sophosympatheia_Midnight-Miqu-70B-v1.5.json rename to data/models/sophosympatheia_Midnight-Miqu-70B-v1.5.json diff --git a/data/speakleash_Bielik-11B-v2.0-Instruct.json b/data/models/speakleash_Bielik-11B-v2.0-Instruct.json similarity index 100% rename from data/speakleash_Bielik-11B-v2.0-Instruct.json rename to data/models/speakleash_Bielik-11B-v2.0-Instruct.json diff --git a/data/speakleash_Bielik-11B-v2.1-Instruct.json b/data/models/speakleash_Bielik-11B-v2.1-Instruct.json similarity index 100% rename from data/speakleash_Bielik-11B-v2.1-Instruct.json rename to data/models/speakleash_Bielik-11B-v2.1-Instruct.json diff --git a/data/speakleash_Bielik-11B-v2.2-Instruct.json b/data/models/speakleash_Bielik-11B-v2.2-Instruct.json similarity index 100% rename from data/speakleash_Bielik-11B-v2.2-Instruct.json rename to data/models/speakleash_Bielik-11B-v2.2-Instruct.json diff --git a/data/speakleash_Bielik-11B-v2.3-Instruct.json b/data/models/speakleash_Bielik-11B-v2.3-Instruct.json similarity index 100% rename from data/speakleash_Bielik-11B-v2.3-Instruct.json rename to data/models/speakleash_Bielik-11B-v2.3-Instruct.json diff --git a/data/speakleash_Bielik-11B-v2.json b/data/models/speakleash_Bielik-11B-v2.json similarity index 100% rename from data/speakleash_Bielik-11B-v2.json rename to data/models/speakleash_Bielik-11B-v2.json diff --git a/data/spmurrayzzz_Mistral-Syndicate-7B.json b/data/models/spmurrayzzz_Mistral-Syndicate-7B.json similarity index 100% rename from data/spmurrayzzz_Mistral-Syndicate-7B.json rename to data/models/spmurrayzzz_Mistral-Syndicate-7B.json diff --git a/data/spow12_ChatWaifu_12B_v2.0.json b/data/models/spow12_ChatWaifu_12B_v2.0.json similarity index 100% rename from data/spow12_ChatWaifu_12B_v2.0.json rename to data/models/spow12_ChatWaifu_12B_v2.0.json diff --git a/data/spow12_ChatWaifu_22B_v2.0_preview.json b/data/models/spow12_ChatWaifu_22B_v2.0_preview.json similarity index 100% rename from data/spow12_ChatWaifu_22B_v2.0_preview.json rename to data/models/spow12_ChatWaifu_22B_v2.0_preview.json diff --git a/data/spow12_ChatWaifu_v1.4.json b/data/models/spow12_ChatWaifu_v1.4.json similarity index 100% rename from data/spow12_ChatWaifu_v1.4.json rename to data/models/spow12_ChatWaifu_v1.4.json diff --git a/data/spow12_ChatWaifu_v2.0_22B.json b/data/models/spow12_ChatWaifu_v2.0_22B.json similarity index 100% rename from data/spow12_ChatWaifu_v2.0_22B.json rename to data/models/spow12_ChatWaifu_v2.0_22B.json diff --git a/data/ssmits_Qwen2.5-95B-Instruct.json b/data/models/ssmits_Qwen2.5-95B-Instruct.json similarity index 100% rename from data/ssmits_Qwen2.5-95B-Instruct.json rename to data/models/ssmits_Qwen2.5-95B-Instruct.json diff --git a/data/stabilityai_StableBeluga2.json b/data/models/stabilityai_StableBeluga2.json similarity index 100% rename from data/stabilityai_StableBeluga2.json rename to data/models/stabilityai_StableBeluga2.json diff --git a/data/stabilityai_stable-code-instruct-3b.json b/data/models/stabilityai_stable-code-instruct-3b.json similarity index 100% rename from data/stabilityai_stable-code-instruct-3b.json rename to data/models/stabilityai_stable-code-instruct-3b.json diff --git a/data/stabilityai_stablelm-2-12b-chat.json b/data/models/stabilityai_stablelm-2-12b-chat.json similarity index 100% rename from data/stabilityai_stablelm-2-12b-chat.json rename to data/models/stabilityai_stablelm-2-12b-chat.json diff --git a/data/stabilityai_stablelm-2-12b.json b/data/models/stabilityai_stablelm-2-12b.json similarity index 100% rename from data/stabilityai_stablelm-2-12b.json rename to data/models/stabilityai_stablelm-2-12b.json diff --git a/data/stabilityai_stablelm-2-1_6b-chat.json b/data/models/stabilityai_stablelm-2-1_6b-chat.json similarity index 100% rename from data/stabilityai_stablelm-2-1_6b-chat.json rename to data/models/stabilityai_stablelm-2-1_6b-chat.json diff --git a/data/stabilityai_stablelm-2-1_6b.json b/data/models/stabilityai_stablelm-2-1_6b.json similarity index 100% rename from data/stabilityai_stablelm-2-1_6b.json rename to data/models/stabilityai_stablelm-2-1_6b.json diff --git a/data/stabilityai_stablelm-2-zephyr-1_6b.json b/data/models/stabilityai_stablelm-2-zephyr-1_6b.json similarity index 100% rename from data/stabilityai_stablelm-2-zephyr-1_6b.json rename to data/models/stabilityai_stablelm-2-zephyr-1_6b.json diff --git a/data/stabilityai_stablelm-3b-4e1t.json b/data/models/stabilityai_stablelm-3b-4e1t.json similarity index 100% rename from data/stabilityai_stablelm-3b-4e1t.json rename to data/models/stabilityai_stablelm-3b-4e1t.json diff --git a/data/stabilityai_stablelm-zephyr-3b.json b/data/models/stabilityai_stablelm-zephyr-3b.json similarity index 100% rename from data/stabilityai_stablelm-zephyr-3b.json rename to data/models/stabilityai_stablelm-zephyr-3b.json diff --git a/data/stanford_Alpaca-7B.json b/data/models/stanford_Alpaca-7B.json similarity index 100% rename from data/stanford_Alpaca-7B.json rename to data/models/stanford_Alpaca-7B.json diff --git a/data/stanfordnlp_SteamSHP-flan-t5-large.json b/data/models/stanfordnlp_SteamSHP-flan-t5-large.json similarity index 100% rename from data/stanfordnlp_SteamSHP-flan-t5-large.json rename to data/models/stanfordnlp_SteamSHP-flan-t5-large.json diff --git a/data/stanfordnlp_SteamSHP-flan-t5-xl.json b/data/models/stanfordnlp_SteamSHP-flan-t5-xl.json similarity index 100% rename from data/stanfordnlp_SteamSHP-flan-t5-xl.json rename to data/models/stanfordnlp_SteamSHP-flan-t5-xl.json diff --git a/data/sthenno-com_miscii-14b-0130.json b/data/models/sthenno-com_miscii-14b-0130.json similarity index 100% rename from data/sthenno-com_miscii-14b-0130.json rename to data/models/sthenno-com_miscii-14b-0130.json diff --git a/data/sthenno-com_miscii-14b-0218.json b/data/models/sthenno-com_miscii-14b-0218.json similarity index 100% rename from data/sthenno-com_miscii-14b-0218.json rename to data/models/sthenno-com_miscii-14b-0218.json diff --git a/data/sthenno-com_miscii-14b-1028.json b/data/models/sthenno-com_miscii-14b-1028.json similarity index 100% rename from data/sthenno-com_miscii-14b-1028.json rename to data/models/sthenno-com_miscii-14b-1028.json diff --git a/data/sthenno-com_miscii-14b-1225.json b/data/models/sthenno-com_miscii-14b-1225.json similarity index 100% rename from data/sthenno-com_miscii-14b-1225.json rename to data/models/sthenno-com_miscii-14b-1225.json diff --git a/data/sthenno_tempesthenno-0120.json b/data/models/sthenno_tempesthenno-0120.json similarity index 100% rename from data/sthenno_tempesthenno-0120.json rename to data/models/sthenno_tempesthenno-0120.json diff --git a/data/sthenno_tempesthenno-fusion-0309.json b/data/models/sthenno_tempesthenno-fusion-0309.json similarity index 100% rename from data/sthenno_tempesthenno-fusion-0309.json rename to data/models/sthenno_tempesthenno-fusion-0309.json diff --git a/data/sthenno_tempesthenno-kto-0205-ckpt80.json b/data/models/sthenno_tempesthenno-kto-0205-ckpt80.json similarity index 100% rename from data/sthenno_tempesthenno-kto-0205-ckpt80.json rename to data/models/sthenno_tempesthenno-kto-0205-ckpt80.json diff --git a/data/sthenno_tempesthenno-nuslerp-001.json b/data/models/sthenno_tempesthenno-nuslerp-001.json similarity index 100% rename from data/sthenno_tempesthenno-nuslerp-001.json rename to data/models/sthenno_tempesthenno-nuslerp-001.json diff --git a/data/sthenno_tempesthenno-nuslerp-0124.json b/data/models/sthenno_tempesthenno-nuslerp-0124.json similarity index 100% rename from data/sthenno_tempesthenno-nuslerp-0124.json rename to data/models/sthenno_tempesthenno-nuslerp-0124.json diff --git a/data/sthenno_tempesthenno-ppo-ckpt40.json b/data/models/sthenno_tempesthenno-ppo-ckpt40.json similarity index 100% rename from data/sthenno_tempesthenno-ppo-ckpt40.json rename to data/models/sthenno_tempesthenno-ppo-ckpt40.json diff --git a/data/sthenno_tempesthenno-sft-0309-ckpt10.json b/data/models/sthenno_tempesthenno-sft-0309-ckpt10.json similarity index 100% rename from data/sthenno_tempesthenno-sft-0309-ckpt10.json rename to data/models/sthenno_tempesthenno-sft-0309-ckpt10.json diff --git a/data/sthenno_tempesthenno-sft-0314-stage1-ckpt50.json b/data/models/sthenno_tempesthenno-sft-0314-stage1-ckpt50.json similarity index 100% rename from data/sthenno_tempesthenno-sft-0314-stage1-ckpt50.json rename to data/models/sthenno_tempesthenno-sft-0314-stage1-ckpt50.json diff --git a/data/sthenno_tempestissimo-14b-0309.json b/data/models/sthenno_tempestissimo-14b-0309.json similarity index 100% rename from data/sthenno_tempestissimo-14b-0309.json rename to data/models/sthenno_tempestissimo-14b-0309.json diff --git a/data/streamerbtw1002_Nexuim-R1-7B-Instruct.json b/data/models/streamerbtw1002_Nexuim-R1-7B-Instruct.json similarity index 100% rename from data/streamerbtw1002_Nexuim-R1-7B-Instruct.json rename to data/models/streamerbtw1002_Nexuim-R1-7B-Instruct.json diff --git a/data/stupidity-ai_Llama-3-8B-Instruct-MultiMoose.json b/data/models/stupidity-ai_Llama-3-8B-Instruct-MultiMoose.json similarity index 100% rename from data/stupidity-ai_Llama-3-8B-Instruct-MultiMoose.json rename to data/models/stupidity-ai_Llama-3-8B-Instruct-MultiMoose.json diff --git a/data/suayptalha_Clarus-7B-v0.1.json b/data/models/suayptalha_Clarus-7B-v0.1.json similarity index 100% rename from data/suayptalha_Clarus-7B-v0.1.json rename to data/models/suayptalha_Clarus-7B-v0.1.json diff --git a/data/suayptalha_Clarus-7B-v0.2.json b/data/models/suayptalha_Clarus-7B-v0.2.json similarity index 100% rename from data/suayptalha_Clarus-7B-v0.2.json rename to data/models/suayptalha_Clarus-7B-v0.2.json diff --git a/data/suayptalha_Clarus-7B-v0.3.json b/data/models/suayptalha_Clarus-7B-v0.3.json similarity index 100% rename from data/suayptalha_Clarus-7B-v0.3.json rename to data/models/suayptalha_Clarus-7B-v0.3.json diff --git a/data/suayptalha_DeepSeek-R1-Distill-Llama-3B.json b/data/models/suayptalha_DeepSeek-R1-Distill-Llama-3B.json similarity index 100% rename from data/suayptalha_DeepSeek-R1-Distill-Llama-3B.json rename to data/models/suayptalha_DeepSeek-R1-Distill-Llama-3B.json diff --git a/data/suayptalha_Falcon3-Jessi-v0.4-7B-Slerp.json b/data/models/suayptalha_Falcon3-Jessi-v0.4-7B-Slerp.json similarity index 100% rename from data/suayptalha_Falcon3-Jessi-v0.4-7B-Slerp.json rename to data/models/suayptalha_Falcon3-Jessi-v0.4-7B-Slerp.json diff --git a/data/suayptalha_HomerCreativeAnvita-Mix-Qw7B.json b/data/models/suayptalha_HomerCreativeAnvita-Mix-Qw7B.json similarity index 100% rename from data/suayptalha_HomerCreativeAnvita-Mix-Qw7B.json rename to data/models/suayptalha_HomerCreativeAnvita-Mix-Qw7B.json diff --git a/data/suayptalha_Komodo-Llama-3.2-3B-v2-fp16.json b/data/models/suayptalha_Komodo-Llama-3.2-3B-v2-fp16.json similarity index 100% rename from data/suayptalha_Komodo-Llama-3.2-3B-v2-fp16.json rename to data/models/suayptalha_Komodo-Llama-3.2-3B-v2-fp16.json diff --git a/data/suayptalha_Lamarckvergence-14B.json b/data/models/suayptalha_Lamarckvergence-14B.json similarity index 100% rename from data/suayptalha_Lamarckvergence-14B.json rename to data/models/suayptalha_Lamarckvergence-14B.json diff --git a/data/suayptalha_Lix-14B-v0.1.json b/data/models/suayptalha_Lix-14B-v0.1.json similarity index 100% rename from data/suayptalha_Lix-14B-v0.1.json rename to data/models/suayptalha_Lix-14B-v0.1.json diff --git a/data/suayptalha_Luminis-phi-4.json b/data/models/suayptalha_Luminis-phi-4.json similarity index 100% rename from data/suayptalha_Luminis-phi-4.json rename to data/models/suayptalha_Luminis-phi-4.json diff --git a/data/suayptalha_Maestro-10B.json b/data/models/suayptalha_Maestro-10B.json similarity index 100% rename from data/suayptalha_Maestro-10B.json rename to data/models/suayptalha_Maestro-10B.json diff --git a/data/suayptalha_Rombos-2.5-T.E-8.1.json b/data/models/suayptalha_Rombos-2.5-T.E-8.1.json similarity index 100% rename from data/suayptalha_Rombos-2.5-T.E-8.1.json rename to data/models/suayptalha_Rombos-2.5-T.E-8.1.json diff --git a/data/sumink_Qmerft.json b/data/models/sumink_Qmerft.json similarity index 100% rename from data/sumink_Qmerft.json rename to data/models/sumink_Qmerft.json diff --git a/data/sumink_Qwenftmodel.json b/data/models/sumink_Qwenftmodel.json similarity index 100% rename from data/sumink_Qwenftmodel.json rename to data/models/sumink_Qwenftmodel.json diff --git a/data/sumink_Qwenmplus.json b/data/models/sumink_Qwenmplus.json similarity index 100% rename from data/sumink_Qwenmplus.json rename to data/models/sumink_Qwenmplus.json diff --git a/data/sumink_Qwensci.json b/data/models/sumink_Qwensci.json similarity index 100% rename from data/sumink_Qwensci.json rename to data/models/sumink_Qwensci.json diff --git a/data/sumink_bbhqwen.json b/data/models/sumink_bbhqwen.json similarity index 100% rename from data/sumink_bbhqwen.json rename to data/models/sumink_bbhqwen.json diff --git a/data/sumink_bbhqwen2.json b/data/models/sumink_bbhqwen2.json similarity index 100% rename from data/sumink_bbhqwen2.json rename to data/models/sumink_bbhqwen2.json diff --git a/data/sumink_bbhqwen3.json b/data/models/sumink_bbhqwen3.json similarity index 100% rename from data/sumink_bbhqwen3.json rename to data/models/sumink_bbhqwen3.json diff --git a/data/sumink_bbhqwen4.json b/data/models/sumink_bbhqwen4.json similarity index 100% rename from data/sumink_bbhqwen4.json rename to data/models/sumink_bbhqwen4.json diff --git a/data/sumink_bbhqwen5.json b/data/models/sumink_bbhqwen5.json similarity index 100% rename from data/sumink_bbhqwen5.json rename to data/models/sumink_bbhqwen5.json diff --git a/data/sumink_bbhqwen6.json b/data/models/sumink_bbhqwen6.json similarity index 100% rename from data/sumink_bbhqwen6.json rename to data/models/sumink_bbhqwen6.json diff --git a/data/sumink_flflmillama.json b/data/models/sumink_flflmillama.json similarity index 100% rename from data/sumink_flflmillama.json rename to data/models/sumink_flflmillama.json diff --git a/data/sumink_ftgpt.json b/data/models/sumink_ftgpt.json similarity index 100% rename from data/sumink_ftgpt.json rename to data/models/sumink_ftgpt.json diff --git a/data/sumink_llamaft.json b/data/models/sumink_llamaft.json similarity index 100% rename from data/sumink_llamaft.json rename to data/models/sumink_llamaft.json diff --git a/data/sumink_llamamerge.json b/data/models/sumink_llamamerge.json similarity index 100% rename from data/sumink_llamamerge.json rename to data/models/sumink_llamamerge.json diff --git a/data/sumink_llftfl7.json b/data/models/sumink_llftfl7.json similarity index 100% rename from data/sumink_llftfl7.json rename to data/models/sumink_llftfl7.json diff --git a/data/sumink_llmer.json b/data/models/sumink_llmer.json similarity index 100% rename from data/sumink_llmer.json rename to data/models/sumink_llmer.json diff --git a/data/sumink_qwft.json b/data/models/sumink_qwft.json similarity index 100% rename from data/sumink_qwft.json rename to data/models/sumink_qwft.json diff --git a/data/sumink_qwmer.json b/data/models/sumink_qwmer.json similarity index 100% rename from data/sumink_qwmer.json rename to data/models/sumink_qwmer.json diff --git a/data/sumink_solarmer3.json b/data/models/sumink_solarmer3.json similarity index 100% rename from data/sumink_solarmer3.json rename to data/models/sumink_solarmer3.json diff --git a/data/sumink_somer.json b/data/models/sumink_somer.json similarity index 100% rename from data/sumink_somer.json rename to data/models/sumink_somer.json diff --git a/data/sumink_somer2.json b/data/models/sumink_somer2.json similarity index 100% rename from data/sumink_somer2.json rename to data/models/sumink_somer2.json diff --git a/data/sumink_somerft.json b/data/models/sumink_somerft.json similarity index 100% rename from data/sumink_somerft.json rename to data/models/sumink_somerft.json diff --git a/data/sunbaby_BrainCog-8B-0.1-Instruct.json b/data/models/sunbaby_BrainCog-8B-0.1-Instruct.json similarity index 100% rename from data/sunbaby_BrainCog-8B-0.1-Instruct.json rename to data/models/sunbaby_BrainCog-8B-0.1-Instruct.json diff --git a/data/swap-uniba_LLaMAntino-3-ANITA-8B-Inst-DPO-ITA.json b/data/models/swap-uniba_LLaMAntino-3-ANITA-8B-Inst-DPO-ITA.json similarity index 100% rename from data/swap-uniba_LLaMAntino-3-ANITA-8B-Inst-DPO-ITA.json rename to data/models/swap-uniba_LLaMAntino-3-ANITA-8B-Inst-DPO-ITA.json diff --git a/data/synergetic_FrankenQwen2.5-14B.json b/data/models/synergetic_FrankenQwen2.5-14B.json similarity index 100% rename from data/synergetic_FrankenQwen2.5-14B.json rename to data/models/synergetic_FrankenQwen2.5-14B.json diff --git a/data/talha2001_Beast-Soul-new.json b/data/models/talha2001_Beast-Soul-new.json similarity index 100% rename from data/talha2001_Beast-Soul-new.json rename to data/models/talha2001_Beast-Soul-new.json diff --git a/data/tangledgroup_tangled-llama-pints-1.5b-v0.1-instruct.json b/data/models/tangledgroup_tangled-llama-pints-1.5b-v0.1-instruct.json similarity index 100% rename from data/tangledgroup_tangled-llama-pints-1.5b-v0.1-instruct.json rename to data/models/tangledgroup_tangled-llama-pints-1.5b-v0.1-instruct.json diff --git a/data/tangledgroup_tangled-llama-pints-1.5b-v0.2-instruct.json b/data/models/tangledgroup_tangled-llama-pints-1.5b-v0.2-instruct.json similarity index 100% rename from data/tangledgroup_tangled-llama-pints-1.5b-v0.2-instruct.json rename to data/models/tangledgroup_tangled-llama-pints-1.5b-v0.2-instruct.json diff --git a/data/tanliboy_lambda-gemma-2-9b-dpo.json b/data/models/tanliboy_lambda-gemma-2-9b-dpo.json similarity index 99% rename from data/tanliboy_lambda-gemma-2-9b-dpo.json rename to data/models/tanliboy_lambda-gemma-2-9b-dpo.json index fdddc4e72d79f3d859a4c8ff7f9b7a1f8ebbc75d..38ecdea07cb3e977e8245cc08d85c2e1bd075ea5 100644 --- a/data/tanliboy_lambda-gemma-2-9b-dpo.json +++ b/data/models/tanliboy_lambda-gemma-2-9b-dpo.json @@ -5,7 +5,7 @@ "developer": "tanliboy", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "Gemma2ForCausalLM", "params_billions": "9.242" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4501 + "score": 0.1829 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5472 + "score": 0.5488 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0944 + "score": 0.0 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3138 + "score": 0.3104 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4017 + "score": 0.4056 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3792 + "score": 0.3805 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1829 + "score": 0.4501 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5488 + "score": 0.5472 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.0944 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3104 + "score": 0.3138 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4056 + "score": 0.4017 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3805 + "score": 0.3792 } } ], diff --git a/data/tanliboy_lambda-qwen2.5-14b-dpo-test.json b/data/models/tanliboy_lambda-qwen2.5-14b-dpo-test.json similarity index 100% rename from data/tanliboy_lambda-qwen2.5-14b-dpo-test.json rename to data/models/tanliboy_lambda-qwen2.5-14b-dpo-test.json diff --git a/data/tanliboy_lambda-qwen2.5-32b-dpo-test.json b/data/models/tanliboy_lambda-qwen2.5-32b-dpo-test.json similarity index 100% rename from data/tanliboy_lambda-qwen2.5-32b-dpo-test.json rename to data/models/tanliboy_lambda-qwen2.5-32b-dpo-test.json diff --git a/data/tannedbum_Ellaria-9B.json b/data/models/tannedbum_Ellaria-9B.json similarity index 100% rename from data/tannedbum_Ellaria-9B.json rename to data/models/tannedbum_Ellaria-9B.json diff --git a/data/tannedbum_L3-Nymeria-Maid-8B.json b/data/models/tannedbum_L3-Nymeria-Maid-8B.json similarity index 100% rename from data/tannedbum_L3-Nymeria-Maid-8B.json rename to data/models/tannedbum_L3-Nymeria-Maid-8B.json diff --git a/data/tannedbum_L3-Nymeria-v2-8B.json b/data/models/tannedbum_L3-Nymeria-v2-8B.json similarity index 100% rename from data/tannedbum_L3-Nymeria-v2-8B.json rename to data/models/tannedbum_L3-Nymeria-v2-8B.json diff --git a/data/tannedbum_L3-Rhaenys-8B.json b/data/models/tannedbum_L3-Rhaenys-8B.json similarity index 100% rename from data/tannedbum_L3-Rhaenys-8B.json rename to data/models/tannedbum_L3-Rhaenys-8B.json diff --git a/data/teknium_CollectiveCognition-v1.1-Mistral-7B.json b/data/models/teknium_CollectiveCognition-v1.1-Mistral-7B.json similarity index 100% rename from data/teknium_CollectiveCognition-v1.1-Mistral-7B.json rename to data/models/teknium_CollectiveCognition-v1.1-Mistral-7B.json diff --git a/data/teknium_OpenHermes-13B.json b/data/models/teknium_OpenHermes-13B.json similarity index 100% rename from data/teknium_OpenHermes-13B.json rename to data/models/teknium_OpenHermes-13B.json diff --git a/data/teknium_OpenHermes-2-Mistral-7B.json b/data/models/teknium_OpenHermes-2-Mistral-7B.json similarity index 100% rename from data/teknium_OpenHermes-2-Mistral-7B.json rename to data/models/teknium_OpenHermes-2-Mistral-7B.json diff --git a/data/teknium_OpenHermes-2.5-Mistral-7B.json b/data/models/teknium_OpenHermes-2.5-Mistral-7B.json similarity index 100% rename from data/teknium_OpenHermes-2.5-Mistral-7B.json rename to data/models/teknium_OpenHermes-2.5-Mistral-7B.json diff --git a/data/teknium_OpenHermes-7B.json b/data/models/teknium_OpenHermes-7B.json similarity index 100% rename from data/teknium_OpenHermes-7B.json rename to data/models/teknium_OpenHermes-7B.json diff --git a/data/tensopolis_falcon3-10b-tensopolis-v1.json b/data/models/tensopolis_falcon3-10b-tensopolis-v1.json similarity index 100% rename from data/tensopolis_falcon3-10b-tensopolis-v1.json rename to data/models/tensopolis_falcon3-10b-tensopolis-v1.json diff --git a/data/tensopolis_falcon3-10b-tensopolis-v2.json b/data/models/tensopolis_falcon3-10b-tensopolis-v2.json similarity index 100% rename from data/tensopolis_falcon3-10b-tensopolis-v2.json rename to data/models/tensopolis_falcon3-10b-tensopolis-v2.json diff --git a/data/tensopolis_lamarckvergence-14b-tensopolis-v1.json b/data/models/tensopolis_lamarckvergence-14b-tensopolis-v1.json similarity index 100% rename from data/tensopolis_lamarckvergence-14b-tensopolis-v1.json rename to data/models/tensopolis_lamarckvergence-14b-tensopolis-v1.json diff --git a/data/tensopolis_mistral-small-2501-tensopolis-v1.json b/data/models/tensopolis_mistral-small-2501-tensopolis-v1.json similarity index 100% rename from data/tensopolis_mistral-small-2501-tensopolis-v1.json rename to data/models/tensopolis_mistral-small-2501-tensopolis-v1.json diff --git a/data/tensopolis_mistral-small-r1-tensopolis.json b/data/models/tensopolis_mistral-small-r1-tensopolis.json similarity index 100% rename from data/tensopolis_mistral-small-r1-tensopolis.json rename to data/models/tensopolis_mistral-small-r1-tensopolis.json diff --git a/data/tensopolis_phi-4-tensopolis-v1.json b/data/models/tensopolis_phi-4-tensopolis-v1.json similarity index 100% rename from data/tensopolis_phi-4-tensopolis-v1.json rename to data/models/tensopolis_phi-4-tensopolis-v1.json diff --git a/data/tensopolis_qwen2.5-14b-tensopolis-v1.json b/data/models/tensopolis_qwen2.5-14b-tensopolis-v1.json similarity index 100% rename from data/tensopolis_qwen2.5-14b-tensopolis-v1.json rename to data/models/tensopolis_qwen2.5-14b-tensopolis-v1.json diff --git a/data/tensopolis_qwen2.5-3b-or1-tensopolis.json b/data/models/tensopolis_qwen2.5-3b-or1-tensopolis.json similarity index 100% rename from data/tensopolis_qwen2.5-3b-or1-tensopolis.json rename to data/models/tensopolis_qwen2.5-3b-or1-tensopolis.json diff --git a/data/tensopolis_qwen2.5-7b-tensopolis-v1.json b/data/models/tensopolis_qwen2.5-7b-tensopolis-v1.json similarity index 100% rename from data/tensopolis_qwen2.5-7b-tensopolis-v1.json rename to data/models/tensopolis_qwen2.5-7b-tensopolis-v1.json diff --git a/data/tensopolis_qwen2.5-7b-tensopolis-v2.json b/data/models/tensopolis_qwen2.5-7b-tensopolis-v2.json similarity index 100% rename from data/tensopolis_qwen2.5-7b-tensopolis-v2.json rename to data/models/tensopolis_qwen2.5-7b-tensopolis-v2.json diff --git a/data/tensopolis_virtuoso-lite-tensopolis-v1.json b/data/models/tensopolis_virtuoso-lite-tensopolis-v1.json similarity index 100% rename from data/tensopolis_virtuoso-lite-tensopolis-v1.json rename to data/models/tensopolis_virtuoso-lite-tensopolis-v1.json diff --git a/data/tensopolis_virtuoso-lite-tensopolis-v2.json b/data/models/tensopolis_virtuoso-lite-tensopolis-v2.json similarity index 100% rename from data/tensopolis_virtuoso-lite-tensopolis-v2.json rename to data/models/tensopolis_virtuoso-lite-tensopolis-v2.json diff --git a/data/tensopolis_virtuoso-small-tensopolis-v1.json b/data/models/tensopolis_virtuoso-small-tensopolis-v1.json similarity index 100% rename from data/tensopolis_virtuoso-small-tensopolis-v1.json rename to data/models/tensopolis_virtuoso-small-tensopolis-v1.json diff --git a/data/tensopolis_virtuoso-small-tensopolis-v2.json b/data/models/tensopolis_virtuoso-small-tensopolis-v2.json similarity index 100% rename from data/tensopolis_virtuoso-small-tensopolis-v2.json rename to data/models/tensopolis_virtuoso-small-tensopolis-v2.json diff --git a/data/tensopolis_virtuoso-small-v2-tensopolis-v1.json b/data/models/tensopolis_virtuoso-small-v2-tensopolis-v1.json similarity index 100% rename from data/tensopolis_virtuoso-small-v2-tensopolis-v1.json rename to data/models/tensopolis_virtuoso-small-v2-tensopolis-v1.json diff --git a/data/tensoropera_Fox-1-1.6B.json b/data/models/tensoropera_Fox-1-1.6B.json similarity index 100% rename from data/tensoropera_Fox-1-1.6B.json rename to data/models/tensoropera_Fox-1-1.6B.json diff --git a/data/tenyx_Llama3-TenyxChat-70B.json b/data/models/tenyx_Llama3-TenyxChat-70B.json similarity index 100% rename from data/tenyx_Llama3-TenyxChat-70B.json rename to data/models/tenyx_Llama3-TenyxChat-70B.json diff --git a/data/theo77186_Qwen2.5-Coder-7B-Instruct-20241106.json b/data/models/theo77186_Qwen2.5-Coder-7B-Instruct-20241106.json similarity index 100% rename from data/theo77186_Qwen2.5-Coder-7B-Instruct-20241106.json rename to data/models/theo77186_Qwen2.5-Coder-7B-Instruct-20241106.json diff --git a/data/theprint_Boptruth-Agatha-7B.json b/data/models/theprint_Boptruth-Agatha-7B.json similarity index 100% rename from data/theprint_Boptruth-Agatha-7B.json rename to data/models/theprint_Boptruth-Agatha-7B.json diff --git a/data/theprint_CleverBoi-7B-v2.json b/data/models/theprint_CleverBoi-7B-v2.json similarity index 100% rename from data/theprint_CleverBoi-7B-v2.json rename to data/models/theprint_CleverBoi-7B-v2.json diff --git a/data/theprint_CleverBoi-7B-v3.json b/data/models/theprint_CleverBoi-7B-v3.json similarity index 100% rename from data/theprint_CleverBoi-7B-v3.json rename to data/models/theprint_CleverBoi-7B-v3.json diff --git a/data/theprint_CleverBoi-Llama-3.1-8B-Instruct.json b/data/models/theprint_CleverBoi-Llama-3.1-8B-Instruct.json similarity index 100% rename from data/theprint_CleverBoi-Llama-3.1-8B-Instruct.json rename to data/models/theprint_CleverBoi-Llama-3.1-8B-Instruct.json diff --git a/data/theprint_CleverBoi-Llama-3.1-8B-v2.json b/data/models/theprint_CleverBoi-Llama-3.1-8B-v2.json similarity index 100% rename from data/theprint_CleverBoi-Llama-3.1-8B-v2.json rename to data/models/theprint_CleverBoi-Llama-3.1-8B-v2.json diff --git a/data/theprint_CleverBoi-Nemo-12B-v2.json b/data/models/theprint_CleverBoi-Nemo-12B-v2.json similarity index 100% rename from data/theprint_CleverBoi-Nemo-12B-v2.json rename to data/models/theprint_CleverBoi-Nemo-12B-v2.json diff --git a/data/theprint_Code-Llama-Bagel-8B.json b/data/models/theprint_Code-Llama-Bagel-8B.json similarity index 100% rename from data/theprint_Code-Llama-Bagel-8B.json rename to data/models/theprint_Code-Llama-Bagel-8B.json diff --git a/data/theprint_Conversely-Mistral-7B.json b/data/models/theprint_Conversely-Mistral-7B.json similarity index 100% rename from data/theprint_Conversely-Mistral-7B.json rename to data/models/theprint_Conversely-Mistral-7B.json diff --git a/data/theprint_Llama-3.2-3B-VanRossum.json b/data/models/theprint_Llama-3.2-3B-VanRossum.json similarity index 100% rename from data/theprint_Llama-3.2-3B-VanRossum.json rename to data/models/theprint_Llama-3.2-3B-VanRossum.json diff --git a/data/theprint_ReWiz-7B.json b/data/models/theprint_ReWiz-7B.json similarity index 100% rename from data/theprint_ReWiz-7B.json rename to data/models/theprint_ReWiz-7B.json diff --git a/data/theprint_ReWiz-Llama-3.1-8B-v2.json b/data/models/theprint_ReWiz-Llama-3.1-8B-v2.json similarity index 100% rename from data/theprint_ReWiz-Llama-3.1-8B-v2.json rename to data/models/theprint_ReWiz-Llama-3.1-8B-v2.json diff --git a/data/theprint_ReWiz-Llama-3.2-3B.json b/data/models/theprint_ReWiz-Llama-3.2-3B.json similarity index 100% rename from data/theprint_ReWiz-Llama-3.2-3B.json rename to data/models/theprint_ReWiz-Llama-3.2-3B.json diff --git a/data/theprint_ReWiz-Nemo-12B-Instruct.json b/data/models/theprint_ReWiz-Nemo-12B-Instruct.json similarity index 100% rename from data/theprint_ReWiz-Nemo-12B-Instruct.json rename to data/models/theprint_ReWiz-Nemo-12B-Instruct.json diff --git a/data/theprint_ReWiz-Qwen-2.5-14B.json b/data/models/theprint_ReWiz-Qwen-2.5-14B.json similarity index 100% rename from data/theprint_ReWiz-Qwen-2.5-14B.json rename to data/models/theprint_ReWiz-Qwen-2.5-14B.json diff --git a/data/theprint_ReWiz-Worldbuilder-7B.json b/data/models/theprint_ReWiz-Worldbuilder-7B.json similarity index 100% rename from data/theprint_ReWiz-Worldbuilder-7B.json rename to data/models/theprint_ReWiz-Worldbuilder-7B.json diff --git a/data/theprint_RuDolph-Hermes-7B.json b/data/models/theprint_RuDolph-Hermes-7B.json similarity index 100% rename from data/theprint_RuDolph-Hermes-7B.json rename to data/models/theprint_RuDolph-Hermes-7B.json diff --git a/data/theprint_WorldBuilder-12B.json b/data/models/theprint_WorldBuilder-12B.json similarity index 100% rename from data/theprint_WorldBuilder-12B.json rename to data/models/theprint_WorldBuilder-12B.json diff --git a/data/theprint_phi-3-mini-4k-python.json b/data/models/theprint_phi-3-mini-4k-python.json similarity index 100% rename from data/theprint_phi-3-mini-4k-python.json rename to data/models/theprint_phi-3-mini-4k-python.json diff --git a/data/thinkcoder_llama3-8b-instruct-lora-8-sft.json b/data/models/thinkcoder_llama3-8b-instruct-lora-8-sft.json similarity index 100% rename from data/thinkcoder_llama3-8b-instruct-lora-8-sft.json rename to data/models/thinkcoder_llama3-8b-instruct-lora-8-sft.json diff --git a/data/thirdeyeai_elevate360m.json b/data/models/thirdeyeai_elevate360m.json similarity index 100% rename from data/thirdeyeai_elevate360m.json rename to data/models/thirdeyeai_elevate360m.json diff --git a/data/thomas-yanxin_XinYuan-Qwen2-1_5B.json b/data/models/thomas-yanxin_XinYuan-Qwen2-1_5B.json similarity index 100% rename from data/thomas-yanxin_XinYuan-Qwen2-1_5B.json rename to data/models/thomas-yanxin_XinYuan-Qwen2-1_5B.json diff --git a/data/thomas-yanxin_XinYuan-Qwen2-7B-0917.json b/data/models/thomas-yanxin_XinYuan-Qwen2-7B-0917.json similarity index 100% rename from data/thomas-yanxin_XinYuan-Qwen2-7B-0917.json rename to data/models/thomas-yanxin_XinYuan-Qwen2-7B-0917.json diff --git a/data/thomas-yanxin_XinYuan-Qwen2-7B.json b/data/models/thomas-yanxin_XinYuan-Qwen2-7B.json similarity index 100% rename from data/thomas-yanxin_XinYuan-Qwen2-7B.json rename to data/models/thomas-yanxin_XinYuan-Qwen2-7B.json diff --git a/data/thomas-yanxin_XinYuan-Qwen2.5-7B-0917.json b/data/models/thomas-yanxin_XinYuan-Qwen2.5-7B-0917.json similarity index 100% rename from data/thomas-yanxin_XinYuan-Qwen2.5-7B-0917.json rename to data/models/thomas-yanxin_XinYuan-Qwen2.5-7B-0917.json diff --git a/data/tianyil1_MistralForCausalLM_Cal_DPO.json b/data/models/tianyil1_MistralForCausalLM_Cal_DPO.json similarity index 100% rename from data/tianyil1_MistralForCausalLM_Cal_DPO.json rename to data/models/tianyil1_MistralForCausalLM_Cal_DPO.json diff --git a/data/models/tiiuae_Falcon-40B.json b/data/models/tiiuae_Falcon-40B.json new file mode 100644 index 0000000000000000000000000000000000000000..068eeeef69a7922020ae5d6e3e9c19e494471126 --- /dev/null +++ b/data/models/tiiuae_Falcon-40B.json @@ -0,0 +1,674 @@ +{ + "model_info": { + "name": "Falcon 40B", + "id": "tiiuae/Falcon-40B", + "developer": "tiiuae", + "inference_platform": "unknown" + }, + "evaluations": [ + { + "evaluation_id": "helm_classic/tiiuae_Falcon-40B/1774096308.339228", + "retrieved_timestamp": "1774096308.339228", + "source_metadata": { + "source_name": "helm_classic", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "helm", + "version": "unknown" + }, + "benchmark": "helm_classic", + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.729, + "details": { + "description": "", + "tab": "Accuracy", + "Mean win rate - Calibration": "{\"description\": \"\", \"tab\": \"Calibration\", \"score\": \"\"}", + "Mean win rate - Robustness": "{\"description\": \"\", \"tab\": \"Robustness\", \"score\": \"0.7051048951048952\"}", + "Mean win rate - Fairness": "{\"description\": \"\", \"tab\": \"Fairness\", \"score\": \"0.6857342657342658\"}", + "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}", + "Mean win rate - Bias": "{\"description\": \"\", \"tab\": \"Bias\", \"score\": \"0.48586479674272687\"}", + "Mean win rate - Toxicity": "{\"description\": \"\", \"tab\": \"Toxicity\", \"score\": \"0.4706876456876457\"}", + "Mean win rate - Summarization metrics": "{\"description\": \"\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on MMLU", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.509, + "details": { + "description": "min=0.32, mean=0.509, max=0.79, sum=2.545 (5)", + "tab": "Accuracy", + "MMLU - ECE (10-bin)": "{\"description\": \"5 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "MMLU - EM (Robustness)": "{\"description\": \"min=0.26, mean=0.457, max=0.76, sum=2.283 (5)\", \"tab\": \"Robustness\", \"score\": \"0.4566315789473684\"}", + "MMLU - EM (Fairness)": "{\"description\": \"min=0.272, mean=0.48, max=0.78, sum=2.402 (5)\", \"tab\": \"Fairness\", \"score\": \"0.4803859649122807\"}", + "MMLU - Denoised inference time (s)": "{\"description\": \"5 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", + "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU - # prompt tokens": "{\"description\": \"min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)\", \"tab\": \"General information\", \"score\": \"500.12014035087725\"}", + "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "MMLU - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on BoolQ", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.819, + "details": { + "description": "min=0.819, mean=0.819, max=0.819, sum=0.819 (1)", + "tab": "Accuracy", + "BoolQ - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "BoolQ - EM (Robustness)": "{\"description\": \"min=0.763, mean=0.763, max=0.763, sum=0.763 (1)\", \"tab\": \"Robustness\", \"score\": \"0.763\"}", + "BoolQ - EM (Fairness)": "{\"description\": \"min=0.783, mean=0.783, max=0.783, sum=0.783 (1)\", \"tab\": \"Fairness\", \"score\": \"0.783\"}", + "BoolQ - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "BoolQ - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "BoolQ - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "BoolQ - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "BoolQ - # prompt tokens": "{\"description\": \"min=1284.629, mean=1284.629, max=1284.629, sum=1284.629 (1)\", \"tab\": \"General information\", \"score\": \"1284.629\"}", + "BoolQ - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "BoolQ - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "BoolQ - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Toxic fraction": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on NarrativeQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.673, + "details": { + "description": "min=0.673, mean=0.673, max=0.673, sum=0.673 (1)", + "tab": "Accuracy", + "NarrativeQA - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NarrativeQA - F1 (Robustness)": "{\"description\": \"min=0.557, mean=0.557, max=0.557, sum=0.557 (1)\", \"tab\": \"Robustness\", \"score\": \"0.5574684493620005\"}", + "NarrativeQA - F1 (Fairness)": "{\"description\": \"min=0.559, mean=0.559, max=0.559, sum=0.559 (1)\", \"tab\": \"Fairness\", \"score\": \"0.5589601433703856\"}", + "NarrativeQA - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", + "NarrativeQA - # train": "{\"description\": \"min=2.025, mean=2.025, max=2.025, sum=2.025 (1)\", \"tab\": \"General information\", \"score\": \"2.0253521126760563\"}", + "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NarrativeQA - # prompt tokens": "{\"description\": \"min=1694.082, mean=1694.082, max=1694.082, sum=1694.082 (1)\", \"tab\": \"General information\", \"score\": \"1694.081690140845\"}", + "NarrativeQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NarrativeQA - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NarrativeQA - Stereotypes (race)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\", \"tab\": \"Bias\", \"score\": \"0.6666666666666667\"}", + "NarrativeQA - Stereotypes (gender)": "{\"description\": \"min=0.398, mean=0.398, max=0.398, sum=0.398 (1)\", \"tab\": \"Bias\", \"score\": \"0.39814814814814814\"}", + "NarrativeQA - Representation (race)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\", \"tab\": \"Bias\", \"score\": \"0.6666666666666667\"}", + "NarrativeQA - Representation (gender)": "{\"description\": \"min=0.191, mean=0.191, max=0.191, sum=0.191 (1)\", \"tab\": \"Bias\", \"score\": \"0.19148936170212763\"}", + "NarrativeQA - Toxic fraction": "{\"description\": \"min=0.02, mean=0.02, max=0.02, sum=0.02 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.01971830985915493\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on NaturalQuestions (open-book)", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.675, + "details": { + "description": "min=0.675, mean=0.675, max=0.675, sum=0.675 (1)", + "tab": "Accuracy", + "NaturalQuestions (closed-book) - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NaturalQuestions (open-book) - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - F1 (Robustness)": "{\"description\": \"min=0.329, mean=0.329, max=0.329, sum=0.329 (1)\", \"tab\": \"Robustness\", \"score\": \"0.32850713007659726\"}", + "NaturalQuestions (open-book) - F1 (Robustness)": "{\"description\": \"min=0.593, mean=0.593, max=0.593, sum=0.593 (1)\", \"tab\": \"Robustness\", \"score\": \"0.5930765119599164\"}", + "NaturalQuestions (closed-book) - F1 (Fairness)": "{\"description\": \"min=0.338, mean=0.338, max=0.338, sum=0.338 (1)\", \"tab\": \"Fairness\", \"score\": \"0.33840782877152153\"}", + "NaturalQuestions (open-book) - F1 (Fairness)": "{\"description\": \"min=0.625, mean=0.625, max=0.625, sum=0.625 (1)\", \"tab\": \"Fairness\", \"score\": \"0.6251513417645462\"}", + "NaturalQuestions (closed-book) - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NaturalQuestions (open-book) - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=124.246, mean=124.246, max=124.246, sum=124.246 (1)\", \"tab\": \"General information\", \"score\": \"124.246\"}", + "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (closed-book) - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (open-book) - # train": "{\"description\": \"min=4.599, mean=4.599, max=4.599, sum=4.599 (1)\", \"tab\": \"General information\", \"score\": \"4.599\"}", + "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0.039, mean=0.039, max=0.039, sum=0.039 (1)\", \"tab\": \"General information\", \"score\": \"0.039\"}", + "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1587.334, mean=1587.334, max=1587.334, sum=1587.334 (1)\", \"tab\": \"General information\", \"score\": \"1587.334\"}", + "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=0.995, mean=0.995, max=0.995, sum=0.995 (1)\", \"tab\": \"General information\", \"score\": \"0.995\"}", + "NaturalQuestions (open-book) - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (closed-book) - Stereotypes (race)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\", \"tab\": \"Bias\", \"score\": \"0.6666666666666667\"}", + "NaturalQuestions (closed-book) - Stereotypes (gender)": "{\"description\": \"min=0.5, mean=0.5, max=0.5, sum=0.5 (1)\", \"tab\": \"Bias\", \"score\": \"0.5\"}", + "NaturalQuestions (closed-book) - Representation (race)": "{\"description\": \"min=0.256, mean=0.256, max=0.256, sum=0.256 (1)\", \"tab\": \"Bias\", \"score\": \"0.2556237218813906\"}", + "NaturalQuestions (closed-book) - Representation (gender)": "{\"description\": \"min=0.107, mean=0.107, max=0.107, sum=0.107 (1)\", \"tab\": \"Bias\", \"score\": \"0.10714285714285715\"}", + "NaturalQuestions (open-book) - Stereotypes (race)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\", \"tab\": \"Bias\", \"score\": \"0.6666666666666666\"}", + "NaturalQuestions (open-book) - Stereotypes (gender)": "{\"description\": \"min=0.443, mean=0.443, max=0.443, sum=0.443 (1)\", \"tab\": \"Bias\", \"score\": \"0.4428571428571429\"}", + "NaturalQuestions (open-book) - Representation (race)": "{\"description\": \"min=0.382, mean=0.382, max=0.382, sum=0.382 (1)\", \"tab\": \"Bias\", \"score\": \"0.38245614035087716\"}", + "NaturalQuestions (open-book) - Representation (gender)": "{\"description\": \"min=0.132, mean=0.132, max=0.132, sum=0.132 (1)\", \"tab\": \"Bias\", \"score\": \"0.13157894736842105\"}", + "NaturalQuestions (closed-book) - Toxic fraction": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.0\"}", + "NaturalQuestions (open-book) - Toxic fraction": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on QuAC", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.307, + "details": { + "description": "min=0.307, mean=0.307, max=0.307, sum=0.307 (1)", + "tab": "Accuracy", + "QuAC - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "QuAC - F1 (Robustness)": "{\"description\": \"min=0.162, mean=0.162, max=0.162, sum=0.162 (1)\", \"tab\": \"Robustness\", \"score\": \"0.16237264946195393\"}", + "QuAC - F1 (Fairness)": "{\"description\": \"min=0.256, mean=0.256, max=0.256, sum=0.256 (1)\", \"tab\": \"Fairness\", \"score\": \"0.25646510454177246\"}", + "QuAC - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "QuAC - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "QuAC - # train": "{\"description\": \"min=0.862, mean=0.862, max=0.862, sum=0.862 (1)\", \"tab\": \"General information\", \"score\": \"0.862\"}", + "QuAC - truncated": "{\"description\": \"min=0.031, mean=0.031, max=0.031, sum=0.031 (1)\", \"tab\": \"General information\", \"score\": \"0.031\"}", + "QuAC - # prompt tokens": "{\"description\": \"min=1667.28, mean=1667.28, max=1667.28, sum=1667.28 (1)\", \"tab\": \"General information\", \"score\": \"1667.28\"}", + "QuAC - # output tokens": "{\"description\": \"min=0.999, mean=0.999, max=0.999, sum=0.999 (1)\", \"tab\": \"General information\", \"score\": \"0.999\"}", + "QuAC - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "QuAC - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "QuAC - Stereotypes (gender)": "{\"description\": \"min=0.468, mean=0.468, max=0.468, sum=0.468 (1)\", \"tab\": \"Bias\", \"score\": \"0.4681547619047619\"}", + "QuAC - Representation (race)": "{\"description\": \"min=0.423, mean=0.423, max=0.423, sum=0.423 (1)\", \"tab\": \"Bias\", \"score\": \"0.42342342342342343\"}", + "QuAC - Representation (gender)": "{\"description\": \"min=0.141, mean=0.141, max=0.141, sum=0.141 (1)\", \"tab\": \"Bias\", \"score\": \"0.141304347826087\"}", + "QuAC - Toxic fraction": "{\"description\": \"min=0.002, mean=0.002, max=0.002, sum=0.002 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.002\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on HellaSwag", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "HellaSwag - ECE (10-bin)": "{\"description\": \"No matching runs\", \"tab\": \"Calibration\", \"score\": \"\"}", + "HellaSwag - EM (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "HellaSwag - EM (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "HellaSwag - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "HellaSwag - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on OpenbookQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "OpenbookQA - ECE (10-bin)": "{\"description\": \"No matching runs\", \"tab\": \"Calibration\", \"score\": \"\"}", + "OpenbookQA - EM (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "OpenbookQA - EM (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "OpenbookQA - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "OpenbookQA - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on TruthfulQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.353, + "details": { + "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)", + "tab": "Accuracy", + "TruthfulQA - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "TruthfulQA - EM (Robustness)": "{\"description\": \"min=0.303, mean=0.303, max=0.303, sum=0.303 (1)\", \"tab\": \"Robustness\", \"score\": \"0.30275229357798167\"}", + "TruthfulQA - EM (Fairness)": "{\"description\": \"min=0.292, mean=0.292, max=0.292, sum=0.292 (1)\", \"tab\": \"Fairness\", \"score\": \"0.29204892966360857\"}", + "TruthfulQA - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "TruthfulQA - # eval": "{\"description\": \"min=654, mean=654, max=654, sum=654 (1)\", \"tab\": \"General information\", \"score\": \"654.0\"}", + "TruthfulQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "TruthfulQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "TruthfulQA - # prompt tokens": "{\"description\": \"min=507.503, mean=507.503, max=507.503, sum=507.503 (1)\", \"tab\": \"General information\", \"score\": \"507.50305810397555\"}", + "TruthfulQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "TruthfulQA - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "MS MARCO (regular) - RR@10 (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "MS MARCO (TREC) - NDCG@10 (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "MS MARCO (regular) - RR@10 (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "MS MARCO (TREC) - NDCG@10 (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "MS MARCO (regular) - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MS MARCO (TREC) - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MS MARCO (regular) - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "MS MARCO (TREC) - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "ROUGE-2 on CNN/DailyMail", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "CNN/DailyMail - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "CNN/DailyMail - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "CNN/DailyMail - SummaC": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - QAFactEval": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - BERTScore (F1)": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Coverage": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Density": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Compression": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-faithfulness": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-relevance": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-coherence": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "ROUGE-2 on XSUM", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "XSUM - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "XSUM - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "XSUM - SummaC": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - QAFactEval": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - BERTScore (F1)": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Coverage": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Density": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Compression": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-faithfulness": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-relevance": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-coherence": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on IMDB", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.959, + "details": { + "description": "min=0.959, mean=0.959, max=0.959, sum=0.959 (1)", + "tab": "Accuracy", + "IMDB - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "IMDB - EM (Robustness)": "{\"description\": \"min=0.935, mean=0.935, max=0.935, sum=0.935 (1)\", \"tab\": \"Robustness\", \"score\": \"0.935\"}", + "IMDB - EM (Fairness)": "{\"description\": \"min=0.954, mean=0.954, max=0.954, sum=0.954 (1)\", \"tab\": \"Fairness\", \"score\": \"0.954\"}", + "IMDB - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "IMDB - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "IMDB - # train": "{\"description\": \"min=2.871, mean=2.871, max=2.871, sum=2.871 (1)\", \"tab\": \"General information\", \"score\": \"2.871\"}", + "IMDB - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "IMDB - # prompt tokens": "{\"description\": \"min=1666.079, mean=1666.079, max=1666.079, sum=1666.079 (1)\", \"tab\": \"General information\", \"score\": \"1666.079\"}", + "IMDB - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "IMDB - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "IMDB - Stereotypes (race)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Stereotypes (gender)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Representation (race)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Representation (gender)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Toxic fraction": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on CivilComments", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.552, + "details": { + "description": "min=0.098, mean=0.552, max=0.969, sum=9.936 (18)", + "tab": "Accuracy", + "CivilComments - ECE (10-bin)": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "CivilComments - EM (Robustness)": "{\"description\": \"min=0.037, mean=0.412, max=0.827, sum=7.414 (18)\", \"tab\": \"Robustness\", \"score\": \"0.4118677862671613\"}", + "CivilComments - EM (Fairness)": "{\"description\": \"min=0.098, mean=0.292, max=0.594, sum=5.248 (18)\", \"tab\": \"Fairness\", \"score\": \"0.29157916197633543\"}", + "CivilComments - Denoised inference time (s)": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "CivilComments - # eval": "{\"description\": \"min=74, mean=371.556, max=683, sum=6688 (18)\", \"tab\": \"General information\", \"score\": \"371.55555555555554\"}", + "CivilComments - # train": "{\"description\": \"min=5, mean=5, max=5, sum=90 (18)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "CivilComments - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (18)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "CivilComments - # prompt tokens": "{\"description\": \"min=367.585, mean=782.759, max=1312.924, sum=14089.663 (18)\", \"tab\": \"General information\", \"score\": \"782.7590374602355\"}", + "CivilComments - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=18 (18)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "CivilComments - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=18 (18)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "CivilComments - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Toxic fraction": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on RAFT", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.661, + "details": { + "description": "min=0.2, mean=0.661, max=0.975, sum=7.275 (11)", + "tab": "Accuracy", + "RAFT - ECE (10-bin)": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "RAFT - EM (Robustness)": "{\"description\": \"min=0, mean=0.586, max=0.975, sum=6.45 (11)\", \"tab\": \"Robustness\", \"score\": \"0.5863636363636363\"}", + "RAFT - EM (Fairness)": "{\"description\": \"min=0.15, mean=0.611, max=0.975, sum=6.725 (11)\", \"tab\": \"Fairness\", \"score\": \"0.6113636363636364\"}", + "RAFT - Denoised inference time (s)": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "RAFT - # eval": "{\"description\": \"min=40, mean=40, max=40, sum=440 (11)\", \"tab\": \"General information\", \"score\": \"40.0\"}", + "RAFT - # train": "{\"description\": \"min=0.7, mean=4.6, max=5, sum=50.6 (11)\", \"tab\": \"General information\", \"score\": \"4.6000000000000005\"}", + "RAFT - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (11)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "RAFT - # prompt tokens": "{\"description\": \"min=289.025, mean=877.464, max=1772.5, sum=9652.1 (11)\", \"tab\": \"General information\", \"score\": \"877.4636363636364\"}", + "RAFT - # output tokens": "{\"description\": \"min=0.7, mean=0.973, max=1, sum=10.7 (11)\", \"tab\": \"General information\", \"score\": \"0.9727272727272727\"}", + "RAFT - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=11 (11)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "RAFT - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Toxic fraction": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/models/tiiuae_Falcon-7B.json b/data/models/tiiuae_Falcon-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..721b64c1faf34c31b258da7a28eb53cb8a3df32e --- /dev/null +++ b/data/models/tiiuae_Falcon-7B.json @@ -0,0 +1,674 @@ +{ + "model_info": { + "name": "Falcon 7B", + "id": "tiiuae/Falcon-7B", + "developer": "tiiuae", + "inference_platform": "unknown" + }, + "evaluations": [ + { + "evaluation_id": "helm_classic/tiiuae_Falcon-7B/1774096308.339228", + "retrieved_timestamp": "1774096308.339228", + "source_metadata": { + "source_name": "helm_classic", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "helm", + "version": "unknown" + }, + "benchmark": "helm_classic", + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.378, + "details": { + "description": "", + "tab": "Accuracy", + "Mean win rate - Calibration": "{\"description\": \"\", \"tab\": \"Calibration\", \"score\": \"\"}", + "Mean win rate - Robustness": "{\"description\": \"\", \"tab\": \"Robustness\", \"score\": \"0.4253379953379953\"}", + "Mean win rate - Fairness": "{\"description\": \"\", \"tab\": \"Fairness\", \"score\": \"0.4469230769230769\"}", + "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}", + "Mean win rate - Bias": "{\"description\": \"\", \"tab\": \"Bias\", \"score\": \"0.35594420480554084\"}", + "Mean win rate - Toxicity": "{\"description\": \"\", \"tab\": \"Toxicity\", \"score\": \"0.5821678321678322\"}", + "Mean win rate - Summarization metrics": "{\"description\": \"\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on MMLU", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.286, + "details": { + "description": "min=0.17, mean=0.286, max=0.39, sum=1.432 (5)", + "tab": "Accuracy", + "MMLU - ECE (10-bin)": "{\"description\": \"5 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "MMLU - EM (Robustness)": "{\"description\": \"min=0.13, mean=0.236, max=0.37, sum=1.181 (5)\", \"tab\": \"Robustness\", \"score\": \"0.23610526315789473\"}", + "MMLU - EM (Fairness)": "{\"description\": \"min=0.15, mean=0.261, max=0.33, sum=1.303 (5)\", \"tab\": \"Fairness\", \"score\": \"0.26063157894736844\"}", + "MMLU - Denoised inference time (s)": "{\"description\": \"5 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", + "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU - # prompt tokens": "{\"description\": \"min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)\", \"tab\": \"General information\", \"score\": \"500.12014035087725\"}", + "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "MMLU - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on BoolQ", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.753, + "details": { + "description": "min=0.753, mean=0.753, max=0.753, sum=0.753 (1)", + "tab": "Accuracy", + "BoolQ - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "BoolQ - EM (Robustness)": "{\"description\": \"min=0.65, mean=0.65, max=0.65, sum=0.65 (1)\", \"tab\": \"Robustness\", \"score\": \"0.65\"}", + "BoolQ - EM (Fairness)": "{\"description\": \"min=0.702, mean=0.702, max=0.702, sum=0.702 (1)\", \"tab\": \"Fairness\", \"score\": \"0.702\"}", + "BoolQ - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "BoolQ - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "BoolQ - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "BoolQ - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "BoolQ - # prompt tokens": "{\"description\": \"min=1284.629, mean=1284.629, max=1284.629, sum=1284.629 (1)\", \"tab\": \"General information\", \"score\": \"1284.629\"}", + "BoolQ - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "BoolQ - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "BoolQ - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "BoolQ - Toxic fraction": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on NarrativeQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.621, + "details": { + "description": "min=0.621, mean=0.621, max=0.621, sum=0.621 (1)", + "tab": "Accuracy", + "NarrativeQA - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NarrativeQA - F1 (Robustness)": "{\"description\": \"min=0.436, mean=0.436, max=0.436, sum=0.436 (1)\", \"tab\": \"Robustness\", \"score\": \"0.4358401092976052\"}", + "NarrativeQA - F1 (Fairness)": "{\"description\": \"min=0.52, mean=0.52, max=0.52, sum=0.52 (1)\", \"tab\": \"Fairness\", \"score\": \"0.5199130399003071\"}", + "NarrativeQA - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", + "NarrativeQA - # train": "{\"description\": \"min=2.025, mean=2.025, max=2.025, sum=2.025 (1)\", \"tab\": \"General information\", \"score\": \"2.0253521126760563\"}", + "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NarrativeQA - # prompt tokens": "{\"description\": \"min=1694.082, mean=1694.082, max=1694.082, sum=1694.082 (1)\", \"tab\": \"General information\", \"score\": \"1694.081690140845\"}", + "NarrativeQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NarrativeQA - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NarrativeQA - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NarrativeQA - Stereotypes (gender)": "{\"description\": \"min=0.444, mean=0.444, max=0.444, sum=0.444 (1)\", \"tab\": \"Bias\", \"score\": \"0.4444444444444444\"}", + "NarrativeQA - Representation (race)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\", \"tab\": \"Bias\", \"score\": \"0.6666666666666667\"}", + "NarrativeQA - Representation (gender)": "{\"description\": \"min=0.205, mean=0.205, max=0.205, sum=0.205 (1)\", \"tab\": \"Bias\", \"score\": \"0.2046979865771812\"}", + "NarrativeQA - Toxic fraction": "{\"description\": \"min=0.017, mean=0.017, max=0.017, sum=0.017 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.016901408450704224\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on NaturalQuestions (open-book)", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.579, + "details": { + "description": "min=0.579, mean=0.579, max=0.579, sum=0.579 (1)", + "tab": "Accuracy", + "NaturalQuestions (closed-book) - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NaturalQuestions (open-book) - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - F1 (Robustness)": "{\"description\": \"min=0.185, mean=0.185, max=0.185, sum=0.185 (1)\", \"tab\": \"Robustness\", \"score\": \"0.18513134554094532\"}", + "NaturalQuestions (open-book) - F1 (Robustness)": "{\"description\": \"min=0.489, mean=0.489, max=0.489, sum=0.489 (1)\", \"tab\": \"Robustness\", \"score\": \"0.4889733445855735\"}", + "NaturalQuestions (closed-book) - F1 (Fairness)": "{\"description\": \"min=0.233, mean=0.233, max=0.233, sum=0.233 (1)\", \"tab\": \"Fairness\", \"score\": \"0.2334955595363806\"}", + "NaturalQuestions (open-book) - F1 (Fairness)": "{\"description\": \"min=0.537, mean=0.537, max=0.537, sum=0.537 (1)\", \"tab\": \"Fairness\", \"score\": \"0.536571121609654\"}", + "NaturalQuestions (closed-book) - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NaturalQuestions (open-book) - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=124.246, mean=124.246, max=124.246, sum=124.246 (1)\", \"tab\": \"General information\", \"score\": \"124.246\"}", + "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (closed-book) - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (open-book) - # train": "{\"description\": \"min=4.599, mean=4.599, max=4.599, sum=4.599 (1)\", \"tab\": \"General information\", \"score\": \"4.599\"}", + "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0.039, mean=0.039, max=0.039, sum=0.039 (1)\", \"tab\": \"General information\", \"score\": \"0.039\"}", + "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1587.334, mean=1587.334, max=1587.334, sum=1587.334 (1)\", \"tab\": \"General information\", \"score\": \"1587.334\"}", + "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=0.994, mean=0.994, max=0.994, sum=0.994 (1)\", \"tab\": \"General information\", \"score\": \"0.994\"}", + "NaturalQuestions (open-book) - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "NaturalQuestions (closed-book) - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NaturalQuestions (closed-book) - Stereotypes (gender)": "{\"description\": \"min=0.389, mean=0.389, max=0.389, sum=0.389 (1)\", \"tab\": \"Bias\", \"score\": \"0.38888888888888884\"}", + "NaturalQuestions (closed-book) - Representation (race)": "{\"description\": \"min=0.476, mean=0.476, max=0.476, sum=0.476 (1)\", \"tab\": \"Bias\", \"score\": \"0.47619047619047616\"}", + "NaturalQuestions (closed-book) - Representation (gender)": "{\"description\": \"min=0.14, mean=0.14, max=0.14, sum=0.14 (1)\", \"tab\": \"Bias\", \"score\": \"0.14\"}", + "NaturalQuestions (open-book) - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "NaturalQuestions (open-book) - Stereotypes (gender)": "{\"description\": \"min=0.333, mean=0.333, max=0.333, sum=0.333 (1)\", \"tab\": \"Bias\", \"score\": \"0.3333333333333333\"}", + "NaturalQuestions (open-book) - Representation (race)": "{\"description\": \"min=0.553, mean=0.553, max=0.553, sum=0.553 (1)\", \"tab\": \"Bias\", \"score\": \"0.5528942115768464\"}", + "NaturalQuestions (open-book) - Representation (gender)": "{\"description\": \"min=0.275, mean=0.275, max=0.275, sum=0.275 (1)\", \"tab\": \"Bias\", \"score\": \"0.2745098039215687\"}", + "NaturalQuestions (closed-book) - Toxic fraction": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.0\"}", + "NaturalQuestions (open-book) - Toxic fraction": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "F1 on QuAC", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.332, + "details": { + "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)", + "tab": "Accuracy", + "QuAC - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "QuAC - F1 (Robustness)": "{\"description\": \"min=0.164, mean=0.164, max=0.164, sum=0.164 (1)\", \"tab\": \"Robustness\", \"score\": \"0.16389145934637706\"}", + "QuAC - F1 (Fairness)": "{\"description\": \"min=0.262, mean=0.262, max=0.262, sum=0.262 (1)\", \"tab\": \"Fairness\", \"score\": \"0.2622208848575014\"}", + "QuAC - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "QuAC - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "QuAC - # train": "{\"description\": \"min=0.862, mean=0.862, max=0.862, sum=0.862 (1)\", \"tab\": \"General information\", \"score\": \"0.862\"}", + "QuAC - truncated": "{\"description\": \"min=0.031, mean=0.031, max=0.031, sum=0.031 (1)\", \"tab\": \"General information\", \"score\": \"0.031\"}", + "QuAC - # prompt tokens": "{\"description\": \"min=1667.28, mean=1667.28, max=1667.28, sum=1667.28 (1)\", \"tab\": \"General information\", \"score\": \"1667.28\"}", + "QuAC - # output tokens": "{\"description\": \"min=0.995, mean=0.995, max=0.995, sum=0.995 (1)\", \"tab\": \"General information\", \"score\": \"0.995\"}", + "QuAC - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "QuAC - Stereotypes (race)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\", \"tab\": \"Bias\", \"score\": \"0.6666666666666666\"}", + "QuAC - Stereotypes (gender)": "{\"description\": \"min=0.457, mean=0.457, max=0.457, sum=0.457 (1)\", \"tab\": \"Bias\", \"score\": \"0.45680272108843534\"}", + "QuAC - Representation (race)": "{\"description\": \"min=0.402, mean=0.402, max=0.402, sum=0.402 (1)\", \"tab\": \"Bias\", \"score\": \"0.4022988505747127\"}", + "QuAC - Representation (gender)": "{\"description\": \"min=0.247, mean=0.247, max=0.247, sum=0.247 (1)\", \"tab\": \"Bias\", \"score\": \"0.24695863746958635\"}", + "QuAC - Toxic fraction": "{\"description\": \"min=0.001, mean=0.001, max=0.001, sum=0.001 (1)\", \"tab\": \"Toxicity\", \"score\": \"0.001\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on HellaSwag", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "HellaSwag - ECE (10-bin)": "{\"description\": \"No matching runs\", \"tab\": \"Calibration\", \"score\": \"\"}", + "HellaSwag - EM (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "HellaSwag - EM (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "HellaSwag - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "HellaSwag - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "HellaSwag - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on OpenbookQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "OpenbookQA - ECE (10-bin)": "{\"description\": \"No matching runs\", \"tab\": \"Calibration\", \"score\": \"\"}", + "OpenbookQA - EM (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "OpenbookQA - EM (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "OpenbookQA - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "OpenbookQA - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "OpenbookQA - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on TruthfulQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.234, + "details": { + "description": "min=0.234, mean=0.234, max=0.234, sum=0.234 (1)", + "tab": "Accuracy", + "TruthfulQA - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "TruthfulQA - EM (Robustness)": "{\"description\": \"min=0.205, mean=0.205, max=0.205, sum=0.205 (1)\", \"tab\": \"Robustness\", \"score\": \"0.20489296636085627\"}", + "TruthfulQA - EM (Fairness)": "{\"description\": \"min=0.213, mean=0.213, max=0.213, sum=0.213 (1)\", \"tab\": \"Fairness\", \"score\": \"0.21253822629969418\"}", + "TruthfulQA - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "TruthfulQA - # eval": "{\"description\": \"min=654, mean=654, max=654, sum=654 (1)\", \"tab\": \"General information\", \"score\": \"654.0\"}", + "TruthfulQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "TruthfulQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "TruthfulQA - # prompt tokens": "{\"description\": \"min=507.503, mean=507.503, max=507.503, sum=507.503 (1)\", \"tab\": \"General information\", \"score\": \"507.50305810397555\"}", + "TruthfulQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "TruthfulQA - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "MS MARCO (regular) - RR@10 (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "MS MARCO (TREC) - NDCG@10 (Robustness)": "{\"description\": \"No matching runs\", \"tab\": \"Robustness\", \"score\": \"\"}", + "MS MARCO (regular) - RR@10 (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "MS MARCO (TREC) - NDCG@10 (Fairness)": "{\"description\": \"No matching runs\", \"tab\": \"Fairness\", \"score\": \"\"}", + "MS MARCO (regular) - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MS MARCO (TREC) - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "MS MARCO (regular) - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (TREC) - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "MS MARCO (regular) - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (TREC) - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "MS MARCO (regular) - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "MS MARCO (TREC) - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "ROUGE-2 on CNN/DailyMail", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "CNN/DailyMail - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "CNN/DailyMail - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "CNN/DailyMail - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "CNN/DailyMail - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "CNN/DailyMail - SummaC": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - QAFactEval": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - BERTScore (F1)": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Coverage": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Density": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - Compression": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-faithfulness": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-relevance": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "CNN/DailyMail - HumanEval-coherence": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "ROUGE-2 on XSUM", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": -1.0, + "details": { + "description": "No matching runs", + "tab": "Accuracy", + "XSUM - Denoised inference time (s)": "{\"description\": \"No matching runs\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "XSUM - # eval": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # train": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - truncated": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # prompt tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # output tokens": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - # trials": "{\"description\": \"No matching runs\", \"tab\": \"General information\", \"score\": \"\"}", + "XSUM - Stereotypes (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Stereotypes (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Representation (race)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Representation (gender)": "{\"description\": \"No matching runs\", \"tab\": \"Bias\", \"score\": \"\"}", + "XSUM - Toxic fraction": "{\"description\": \"No matching runs\", \"tab\": \"Toxicity\", \"score\": \"\"}", + "XSUM - SummaC": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - QAFactEval": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - BERTScore (F1)": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Coverage": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Density": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - Compression": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-faithfulness": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-relevance": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}", + "XSUM - HumanEval-coherence": "{\"description\": \"No matching runs\", \"tab\": \"Summarization metrics\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on IMDB", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.836, + "details": { + "description": "min=0.836, mean=0.836, max=0.836, sum=0.836 (1)", + "tab": "Accuracy", + "IMDB - ECE (10-bin)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "IMDB - EM (Robustness)": "{\"description\": \"min=0.692, mean=0.692, max=0.692, sum=0.692 (1)\", \"tab\": \"Robustness\", \"score\": \"0.692\"}", + "IMDB - EM (Fairness)": "{\"description\": \"min=0.794, mean=0.794, max=0.794, sum=0.794 (1)\", \"tab\": \"Fairness\", \"score\": \"0.794\"}", + "IMDB - Denoised inference time (s)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "IMDB - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "IMDB - # train": "{\"description\": \"min=2.871, mean=2.871, max=2.871, sum=2.871 (1)\", \"tab\": \"General information\", \"score\": \"2.871\"}", + "IMDB - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "IMDB - # prompt tokens": "{\"description\": \"min=1666.079, mean=1666.079, max=1666.079, sum=1666.079 (1)\", \"tab\": \"General information\", \"score\": \"1666.079\"}", + "IMDB - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "IMDB - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "IMDB - Stereotypes (race)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Stereotypes (gender)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Representation (race)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Representation (gender)": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Bias\", \"score\": \"\"}", + "IMDB - Toxic fraction": "{\"description\": \"1 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on CivilComments", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.514, + "details": { + "description": "min=0, mean=0.514, max=0.999, sum=9.257 (18)", + "tab": "Accuracy", + "CivilComments - ECE (10-bin)": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "CivilComments - EM (Robustness)": "{\"description\": \"min=0, mean=0.485, max=0.999, sum=8.731 (18)\", \"tab\": \"Robustness\", \"score\": \"0.4850751828621894\"}", + "CivilComments - EM (Fairness)": "{\"description\": \"min=0, mean=0.494, max=0.999, sum=8.898 (18)\", \"tab\": \"Fairness\", \"score\": \"0.49430637095445207\"}", + "CivilComments - Denoised inference time (s)": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "CivilComments - # eval": "{\"description\": \"min=74, mean=371.556, max=683, sum=6688 (18)\", \"tab\": \"General information\", \"score\": \"371.55555555555554\"}", + "CivilComments - # train": "{\"description\": \"min=5, mean=5, max=5, sum=90 (18)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "CivilComments - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (18)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "CivilComments - # prompt tokens": "{\"description\": \"min=367.585, mean=782.759, max=1312.924, sum=14089.663 (18)\", \"tab\": \"General information\", \"score\": \"782.7590374602355\"}", + "CivilComments - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=18 (18)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "CivilComments - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=18 (18)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "CivilComments - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "CivilComments - Toxic fraction": "{\"description\": \"9 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "EM on RAFT", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.602, + "details": { + "description": "min=0.15, mean=0.602, max=0.975, sum=6.625 (11)", + "tab": "Accuracy", + "RAFT - ECE (10-bin)": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Calibration\", \"score\": \"\"}", + "RAFT - EM (Robustness)": "{\"description\": \"min=0.025, mean=0.516, max=0.975, sum=5.675 (11)\", \"tab\": \"Robustness\", \"score\": \"0.5159090909090908\"}", + "RAFT - EM (Fairness)": "{\"description\": \"min=0.15, mean=0.555, max=0.975, sum=6.1 (11)\", \"tab\": \"Fairness\", \"score\": \"0.5545454545454546\"}", + "RAFT - Denoised inference time (s)": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Efficiency\", \"score\": \"\"}", + "RAFT - # eval": "{\"description\": \"min=40, mean=40, max=40, sum=440 (11)\", \"tab\": \"General information\", \"score\": \"40.0\"}", + "RAFT - # train": "{\"description\": \"min=0.7, mean=4.6, max=5, sum=50.6 (11)\", \"tab\": \"General information\", \"score\": \"4.6000000000000005\"}", + "RAFT - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (11)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "RAFT - # prompt tokens": "{\"description\": \"min=289.025, mean=877.464, max=1772.5, sum=9652.1 (11)\", \"tab\": \"General information\", \"score\": \"877.4636363636364\"}", + "RAFT - # output tokens": "{\"description\": \"min=0.725, mean=0.975, max=1, sum=10.725 (11)\", \"tab\": \"General information\", \"score\": \"0.975\"}", + "RAFT - # trials": "{\"description\": \"min=1, mean=1, max=1, sum=11 (11)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "RAFT - Stereotypes (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Stereotypes (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Representation (race)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Representation (gender)": "{\"description\": \"(0)\", \"tab\": \"Bias\", \"score\": \"\"}", + "RAFT - Toxic fraction": "{\"description\": \"11 matching runs, but no matching metrics\", \"tab\": \"Toxicity\", \"score\": \"\"}" + } + }, + "generation_config": { + "additional_details": {} + } + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/tiiuae_Falcon-Instruct-40B.json b/data/models/tiiuae_Falcon-Instruct-40B.json similarity index 100% rename from data/tiiuae_Falcon-Instruct-40B.json rename to data/models/tiiuae_Falcon-Instruct-40B.json diff --git a/data/tiiuae_Falcon-Instruct-7B.json b/data/models/tiiuae_Falcon-Instruct-7B.json similarity index 100% rename from data/tiiuae_Falcon-Instruct-7B.json rename to data/models/tiiuae_Falcon-Instruct-7B.json diff --git a/data/tiiuae_Falcon3-10B-Base.json b/data/models/tiiuae_Falcon3-10B-Base.json similarity index 100% rename from data/tiiuae_Falcon3-10B-Base.json rename to data/models/tiiuae_Falcon3-10B-Base.json diff --git a/data/tiiuae_Falcon3-10B-Instruct.json b/data/models/tiiuae_Falcon3-10B-Instruct.json similarity index 100% rename from data/tiiuae_Falcon3-10B-Instruct.json rename to data/models/tiiuae_Falcon3-10B-Instruct.json diff --git a/data/tiiuae_Falcon3-1B-Base.json b/data/models/tiiuae_Falcon3-1B-Base.json similarity index 100% rename from data/tiiuae_Falcon3-1B-Base.json rename to data/models/tiiuae_Falcon3-1B-Base.json diff --git a/data/tiiuae_Falcon3-1B-Instruct.json b/data/models/tiiuae_Falcon3-1B-Instruct.json similarity index 100% rename from data/tiiuae_Falcon3-1B-Instruct.json rename to data/models/tiiuae_Falcon3-1B-Instruct.json diff --git a/data/tiiuae_Falcon3-3B-Base.json b/data/models/tiiuae_Falcon3-3B-Base.json similarity index 100% rename from data/tiiuae_Falcon3-3B-Base.json rename to data/models/tiiuae_Falcon3-3B-Base.json diff --git a/data/tiiuae_Falcon3-3B-Instruct.json b/data/models/tiiuae_Falcon3-3B-Instruct.json similarity index 100% rename from data/tiiuae_Falcon3-3B-Instruct.json rename to data/models/tiiuae_Falcon3-3B-Instruct.json diff --git a/data/tiiuae_Falcon3-7B-Base.json b/data/models/tiiuae_Falcon3-7B-Base.json similarity index 100% rename from data/tiiuae_Falcon3-7B-Base.json rename to data/models/tiiuae_Falcon3-7B-Base.json diff --git a/data/tiiuae_Falcon3-7B-Instruct.json b/data/models/tiiuae_Falcon3-7B-Instruct.json similarity index 100% rename from data/tiiuae_Falcon3-7B-Instruct.json rename to data/models/tiiuae_Falcon3-7B-Instruct.json diff --git a/data/tiiuae_Falcon3-Mamba-7B-Base.json b/data/models/tiiuae_Falcon3-Mamba-7B-Base.json similarity index 100% rename from data/tiiuae_Falcon3-Mamba-7B-Base.json rename to data/models/tiiuae_Falcon3-Mamba-7B-Base.json diff --git a/data/tiiuae_Falcon3-Mamba-7B-Instruct.json b/data/models/tiiuae_Falcon3-Mamba-7B-Instruct.json similarity index 100% rename from data/tiiuae_Falcon3-Mamba-7B-Instruct.json rename to data/models/tiiuae_Falcon3-Mamba-7B-Instruct.json diff --git a/data/tiiuae_falcon-11B.json b/data/models/tiiuae_falcon-11B.json similarity index 100% rename from data/tiiuae_falcon-11B.json rename to data/models/tiiuae_falcon-11B.json diff --git a/data/tiiuae_falcon-40b-instruct.json b/data/models/tiiuae_falcon-40b-instruct.json similarity index 100% rename from data/tiiuae_falcon-40b-instruct.json rename to data/models/tiiuae_falcon-40b-instruct.json diff --git a/data/tiiuae_falcon-40b.json b/data/models/tiiuae_falcon-40b.json similarity index 100% rename from data/tiiuae_falcon-40b.json rename to data/models/tiiuae_falcon-40b.json diff --git a/data/tiiuae_falcon-7b-instruct.json b/data/models/tiiuae_falcon-7b-instruct.json similarity index 100% rename from data/tiiuae_falcon-7b-instruct.json rename to data/models/tiiuae_falcon-7b-instruct.json diff --git a/data/tiiuae_falcon-7b.json b/data/models/tiiuae_falcon-7b.json similarity index 100% rename from data/tiiuae_falcon-7b.json rename to data/models/tiiuae_falcon-7b.json diff --git a/data/tiiuae_falcon-mamba-7b.json b/data/models/tiiuae_falcon-mamba-7b.json similarity index 100% rename from data/tiiuae_falcon-mamba-7b.json rename to data/models/tiiuae_falcon-mamba-7b.json diff --git a/data/tinycompany_BiBo-v0.3.json b/data/models/tinycompany_BiBo-v0.3.json similarity index 100% rename from data/tinycompany_BiBo-v0.3.json rename to data/models/tinycompany_BiBo-v0.3.json diff --git a/data/tinycompany_BiBo-v0.7.json b/data/models/tinycompany_BiBo-v0.7.json similarity index 100% rename from data/tinycompany_BiBo-v0.7.json rename to data/models/tinycompany_BiBo-v0.7.json diff --git a/data/tinycompany_ShawtyIsBad-bgem3.json b/data/models/tinycompany_ShawtyIsBad-bgem3.json similarity index 100% rename from data/tinycompany_ShawtyIsBad-bgem3.json rename to data/models/tinycompany_ShawtyIsBad-bgem3.json diff --git a/data/tinycompany_ShawtyIsBad-e5-large.json b/data/models/tinycompany_ShawtyIsBad-e5-large.json similarity index 100% rename from data/tinycompany_ShawtyIsBad-e5-large.json rename to data/models/tinycompany_ShawtyIsBad-e5-large.json diff --git a/data/tinycompany_ShawtyIsBad-ib.json b/data/models/tinycompany_ShawtyIsBad-ib.json similarity index 100% rename from data/tinycompany_ShawtyIsBad-ib.json rename to data/models/tinycompany_ShawtyIsBad-ib.json diff --git a/data/tinycompany_ShawtyIsBad-nomic-moe.json b/data/models/tinycompany_ShawtyIsBad-nomic-moe.json similarity index 100% rename from data/tinycompany_ShawtyIsBad-nomic-moe.json rename to data/models/tinycompany_ShawtyIsBad-nomic-moe.json diff --git a/data/tinycompany_ShawtyIsBad-nomic1.5.json b/data/models/tinycompany_ShawtyIsBad-nomic1.5.json similarity index 100% rename from data/tinycompany_ShawtyIsBad-nomic1.5.json rename to data/models/tinycompany_ShawtyIsBad-nomic1.5.json diff --git a/data/tinycompany_SigmaBoi-base.json b/data/models/tinycompany_SigmaBoi-base.json similarity index 100% rename from data/tinycompany_SigmaBoi-base.json rename to data/models/tinycompany_SigmaBoi-base.json diff --git a/data/tinycompany_SigmaBoi-bge-m3.json b/data/models/tinycompany_SigmaBoi-bge-m3.json similarity index 100% rename from data/tinycompany_SigmaBoi-bge-m3.json rename to data/models/tinycompany_SigmaBoi-bge-m3.json diff --git a/data/tinycompany_SigmaBoi-bgem3.json b/data/models/tinycompany_SigmaBoi-bgem3.json similarity index 100% rename from data/tinycompany_SigmaBoi-bgem3.json rename to data/models/tinycompany_SigmaBoi-bgem3.json diff --git a/data/tinycompany_SigmaBoi-ib.json b/data/models/tinycompany_SigmaBoi-ib.json similarity index 100% rename from data/tinycompany_SigmaBoi-ib.json rename to data/models/tinycompany_SigmaBoi-ib.json diff --git a/data/tinycompany_SigmaBoi-nomic-moe.json b/data/models/tinycompany_SigmaBoi-nomic-moe.json similarity index 100% rename from data/tinycompany_SigmaBoi-nomic-moe.json rename to data/models/tinycompany_SigmaBoi-nomic-moe.json diff --git a/data/tinycompany_SigmaBoi-nomic1.5-fp32.json b/data/models/tinycompany_SigmaBoi-nomic1.5-fp32.json similarity index 100% rename from data/tinycompany_SigmaBoi-nomic1.5-fp32.json rename to data/models/tinycompany_SigmaBoi-nomic1.5-fp32.json diff --git a/data/tinycompany_SigmaBoi-nomic1.5.json b/data/models/tinycompany_SigmaBoi-nomic1.5.json similarity index 100% rename from data/tinycompany_SigmaBoi-nomic1.5.json rename to data/models/tinycompany_SigmaBoi-nomic1.5.json diff --git a/data/tinycompany_Tamed-Shawty.json b/data/models/tinycompany_Tamed-Shawty.json similarity index 100% rename from data/tinycompany_Tamed-Shawty.json rename to data/models/tinycompany_Tamed-Shawty.json diff --git a/data/tklohj_WindyFloLLM.json b/data/models/tklohj_WindyFloLLM.json similarity index 100% rename from data/tklohj_WindyFloLLM.json rename to data/models/tklohj_WindyFloLLM.json diff --git a/data/together_RedPajama-INCITE-Base-7B.json b/data/models/together_RedPajama-INCITE-Base-7B.json similarity index 100% rename from data/together_RedPajama-INCITE-Base-7B.json rename to data/models/together_RedPajama-INCITE-Base-7B.json diff --git a/data/together_RedPajama-INCITE-Base-v1-3B.json b/data/models/together_RedPajama-INCITE-Base-v1-3B.json similarity index 100% rename from data/together_RedPajama-INCITE-Base-v1-3B.json rename to data/models/together_RedPajama-INCITE-Base-v1-3B.json diff --git a/data/together_RedPajama-INCITE-Instruct-7B.json b/data/models/together_RedPajama-INCITE-Instruct-7B.json similarity index 100% rename from data/together_RedPajama-INCITE-Instruct-7B.json rename to data/models/together_RedPajama-INCITE-Instruct-7B.json diff --git a/data/together_RedPajama-INCITE-Instruct-v1-3B.json b/data/models/together_RedPajama-INCITE-Instruct-v1-3B.json similarity index 100% rename from data/together_RedPajama-INCITE-Instruct-v1-3B.json rename to data/models/together_RedPajama-INCITE-Instruct-v1-3B.json diff --git a/data/togethercomputer_GPT-JT-6B-v1.json b/data/models/togethercomputer_GPT-JT-6B-v1.json similarity index 100% rename from data/togethercomputer_GPT-JT-6B-v1.json rename to data/models/togethercomputer_GPT-JT-6B-v1.json diff --git a/data/togethercomputer_GPT-NeoXT-Chat-Base-20B.json b/data/models/togethercomputer_GPT-NeoXT-Chat-Base-20B.json similarity index 100% rename from data/togethercomputer_GPT-NeoXT-Chat-Base-20B.json rename to data/models/togethercomputer_GPT-NeoXT-Chat-Base-20B.json diff --git a/data/togethercomputer_LLaMA-2-7B-32K.json b/data/models/togethercomputer_LLaMA-2-7B-32K.json similarity index 100% rename from data/togethercomputer_LLaMA-2-7B-32K.json rename to data/models/togethercomputer_LLaMA-2-7B-32K.json diff --git a/data/togethercomputer_Llama-2-7B-32K-Instruct.json b/data/models/togethercomputer_Llama-2-7B-32K-Instruct.json similarity index 100% rename from data/togethercomputer_Llama-2-7B-32K-Instruct.json rename to data/models/togethercomputer_Llama-2-7B-32K-Instruct.json diff --git a/data/togethercomputer_RedPajama-INCITE-7B-Base.json b/data/models/togethercomputer_RedPajama-INCITE-7B-Base.json similarity index 100% rename from data/togethercomputer_RedPajama-INCITE-7B-Base.json rename to data/models/togethercomputer_RedPajama-INCITE-7B-Base.json diff --git a/data/togethercomputer_RedPajama-INCITE-7B-Chat.json b/data/models/togethercomputer_RedPajama-INCITE-7B-Chat.json similarity index 100% rename from data/togethercomputer_RedPajama-INCITE-7B-Chat.json rename to data/models/togethercomputer_RedPajama-INCITE-7B-Chat.json diff --git a/data/togethercomputer_RedPajama-INCITE-7B-Instruct.json b/data/models/togethercomputer_RedPajama-INCITE-7B-Instruct.json similarity index 100% rename from data/togethercomputer_RedPajama-INCITE-7B-Instruct.json rename to data/models/togethercomputer_RedPajama-INCITE-7B-Instruct.json diff --git a/data/togethercomputer_RedPajama-INCITE-Base-3B-v1.json b/data/models/togethercomputer_RedPajama-INCITE-Base-3B-v1.json similarity index 100% rename from data/togethercomputer_RedPajama-INCITE-Base-3B-v1.json rename to data/models/togethercomputer_RedPajama-INCITE-Base-3B-v1.json diff --git a/data/togethercomputer_RedPajama-INCITE-Chat-3B-v1.json b/data/models/togethercomputer_RedPajama-INCITE-Chat-3B-v1.json similarity index 100% rename from data/togethercomputer_RedPajama-INCITE-Chat-3B-v1.json rename to data/models/togethercomputer_RedPajama-INCITE-Chat-3B-v1.json diff --git a/data/togethercomputer_RedPajama-INCITE-Instruct-3B-v1.json b/data/models/togethercomputer_RedPajama-INCITE-Instruct-3B-v1.json similarity index 100% rename from data/togethercomputer_RedPajama-INCITE-Instruct-3B-v1.json rename to data/models/togethercomputer_RedPajama-INCITE-Instruct-3B-v1.json diff --git a/data/tokyotech-llm_Llama-3-Swallow-8B-Instruct-v0.1.json b/data/models/tokyotech-llm_Llama-3-Swallow-8B-Instruct-v0.1.json similarity index 100% rename from data/tokyotech-llm_Llama-3-Swallow-8B-Instruct-v0.1.json rename to data/models/tokyotech-llm_Llama-3-Swallow-8B-Instruct-v0.1.json diff --git a/data/tomasmcm_sky-t1-coder-32b-flash.json b/data/models/tomasmcm_sky-t1-coder-32b-flash.json similarity index 100% rename from data/tomasmcm_sky-t1-coder-32b-flash.json rename to data/models/tomasmcm_sky-t1-coder-32b-flash.json diff --git a/data/trthminh1112_autotrain-llama32-1b-finetune.json b/data/models/trthminh1112_autotrain-llama32-1b-finetune.json similarity index 100% rename from data/trthminh1112_autotrain-llama32-1b-finetune.json rename to data/models/trthminh1112_autotrain-llama32-1b-finetune.json diff --git a/data/tugstugi_Qwen2.5-7B-Instruct-QwQ-v0.1.json b/data/models/tugstugi_Qwen2.5-7B-Instruct-QwQ-v0.1.json similarity index 100% rename from data/tugstugi_Qwen2.5-7B-Instruct-QwQ-v0.1.json rename to data/models/tugstugi_Qwen2.5-7B-Instruct-QwQ-v0.1.json diff --git a/data/universalml_NepaliGPT-2.0.json b/data/models/universalml_NepaliGPT-2.0.json similarity index 100% rename from data/universalml_NepaliGPT-2.0.json rename to data/models/universalml_NepaliGPT-2.0.json diff --git a/data/unknown_aya-expanse-32b.json b/data/models/unknown_aya-expanse-32b.json similarity index 100% rename from data/unknown_aya-expanse-32b.json rename to data/models/unknown_aya-expanse-32b.json diff --git a/data/unknown_granite-4.0-h-small.json b/data/models/unknown_granite-4.0-h-small.json similarity index 100% rename from data/unknown_granite-4.0-h-small.json rename to data/models/unknown_granite-4.0-h-small.json diff --git a/data/unknown_o4-mini-2025-04-16.json b/data/models/unknown_o4-mini-2025-04-16.json similarity index 100% rename from data/unknown_o4-mini-2025-04-16.json rename to data/models/unknown_o4-mini-2025-04-16.json diff --git a/data/unsloth_Llama-3.2-1B-Instruct-no-system-message.json b/data/models/unsloth_Llama-3.2-1B-Instruct-no-system-message.json similarity index 100% rename from data/unsloth_Llama-3.2-1B-Instruct-no-system-message.json rename to data/models/unsloth_Llama-3.2-1B-Instruct-no-system-message.json diff --git a/data/unsloth_Llama-3.2-1B-Instruct.json b/data/models/unsloth_Llama-3.2-1B-Instruct.json similarity index 100% rename from data/unsloth_Llama-3.2-1B-Instruct.json rename to data/models/unsloth_Llama-3.2-1B-Instruct.json diff --git a/data/unsloth_Phi-3-mini-4k-instruct.json b/data/models/unsloth_Phi-3-mini-4k-instruct.json similarity index 100% rename from data/unsloth_Phi-3-mini-4k-instruct.json rename to data/models/unsloth_Phi-3-mini-4k-instruct.json diff --git a/data/unsloth_phi-4-bnb-4bit.json b/data/models/unsloth_phi-4-bnb-4bit.json similarity index 100% rename from data/unsloth_phi-4-bnb-4bit.json rename to data/models/unsloth_phi-4-bnb-4bit.json diff --git a/data/unsloth_phi-4-unsloth-bnb-4bit.json b/data/models/unsloth_phi-4-unsloth-bnb-4bit.json similarity index 100% rename from data/unsloth_phi-4-unsloth-bnb-4bit.json rename to data/models/unsloth_phi-4-unsloth-bnb-4bit.json diff --git a/data/unsloth_phi-4.json b/data/models/unsloth_phi-4.json similarity index 100% rename from data/unsloth_phi-4.json rename to data/models/unsloth_phi-4.json diff --git a/data/upstage_SOLAR-10.7B-Instruct-v1.0.json b/data/models/upstage_SOLAR-10.7B-Instruct-v1.0.json similarity index 100% rename from data/upstage_SOLAR-10.7B-Instruct-v1.0.json rename to data/models/upstage_SOLAR-10.7B-Instruct-v1.0.json diff --git a/data/upstage_SOLAR-10.7B-v1.0.json b/data/models/upstage_SOLAR-10.7B-v1.0.json similarity index 100% rename from data/upstage_SOLAR-10.7B-v1.0.json rename to data/models/upstage_SOLAR-10.7B-v1.0.json diff --git a/data/upstage_solar-pro-241126.json b/data/models/upstage_solar-pro-241126.json similarity index 100% rename from data/upstage_solar-pro-241126.json rename to data/models/upstage_solar-pro-241126.json diff --git a/data/upstage_solar-pro-preview-instruct.json b/data/models/upstage_solar-pro-preview-instruct.json similarity index 100% rename from data/upstage_solar-pro-preview-instruct.json rename to data/models/upstage_solar-pro-preview-instruct.json diff --git a/data/utkmst_chimera-beta-test2-lora-merged.json b/data/models/utkmst_chimera-beta-test2-lora-merged.json similarity index 100% rename from data/utkmst_chimera-beta-test2-lora-merged.json rename to data/models/utkmst_chimera-beta-test2-lora-merged.json diff --git a/data/uukuguy_speechless-code-mistral-7b-v1.0.json b/data/models/uukuguy_speechless-code-mistral-7b-v1.0.json similarity index 100% rename from data/uukuguy_speechless-code-mistral-7b-v1.0.json rename to data/models/uukuguy_speechless-code-mistral-7b-v1.0.json diff --git a/data/uukuguy_speechless-codellama-34b-v2.0.json b/data/models/uukuguy_speechless-codellama-34b-v2.0.json similarity index 100% rename from data/uukuguy_speechless-codellama-34b-v2.0.json rename to data/models/uukuguy_speechless-codellama-34b-v2.0.json diff --git a/data/uukuguy_speechless-coder-ds-6.7b.json b/data/models/uukuguy_speechless-coder-ds-6.7b.json similarity index 100% rename from data/uukuguy_speechless-coder-ds-6.7b.json rename to data/models/uukuguy_speechless-coder-ds-6.7b.json diff --git a/data/uukuguy_speechless-instruct-mistral-7b-v0.2.json b/data/models/uukuguy_speechless-instruct-mistral-7b-v0.2.json similarity index 100% rename from data/uukuguy_speechless-instruct-mistral-7b-v0.2.json rename to data/models/uukuguy_speechless-instruct-mistral-7b-v0.2.json diff --git a/data/uukuguy_speechless-llama2-hermes-orca-platypus-wizardlm-13b.json b/data/models/uukuguy_speechless-llama2-hermes-orca-platypus-wizardlm-13b.json similarity index 100% rename from data/uukuguy_speechless-llama2-hermes-orca-platypus-wizardlm-13b.json rename to data/models/uukuguy_speechless-llama2-hermes-orca-platypus-wizardlm-13b.json diff --git a/data/uukuguy_speechless-mistral-dolphin-orca-platypus-samantha-7b.json b/data/models/uukuguy_speechless-mistral-dolphin-orca-platypus-samantha-7b.json similarity index 100% rename from data/uukuguy_speechless-mistral-dolphin-orca-platypus-samantha-7b.json rename to data/models/uukuguy_speechless-mistral-dolphin-orca-platypus-samantha-7b.json diff --git a/data/uukuguy_speechless-zephyr-code-functionary-7b.json b/data/models/uukuguy_speechless-zephyr-code-functionary-7b.json similarity index 100% rename from data/uukuguy_speechless-zephyr-code-functionary-7b.json rename to data/models/uukuguy_speechless-zephyr-code-functionary-7b.json diff --git a/data/v000000_L3-8B-Stheno-v3.2-abliterated.json b/data/models/v000000_L3-8B-Stheno-v3.2-abliterated.json similarity index 100% rename from data/v000000_L3-8B-Stheno-v3.2-abliterated.json rename to data/models/v000000_L3-8B-Stheno-v3.2-abliterated.json diff --git a/data/v000000_L3.1-Niitorm-8B-DPO-t0.0001.json b/data/models/v000000_L3.1-Niitorm-8B-DPO-t0.0001.json similarity index 100% rename from data/v000000_L3.1-Niitorm-8B-DPO-t0.0001.json rename to data/models/v000000_L3.1-Niitorm-8B-DPO-t0.0001.json diff --git a/data/v000000_L3.1-Storniitova-8B.json b/data/models/v000000_L3.1-Storniitova-8B.json similarity index 100% rename from data/v000000_L3.1-Storniitova-8B.json rename to data/models/v000000_L3.1-Storniitova-8B.json diff --git a/data/v000000_Qwen2.5-14B-Gutenberg-1e-Delta.json b/data/models/v000000_Qwen2.5-14B-Gutenberg-1e-Delta.json similarity index 100% rename from data/v000000_Qwen2.5-14B-Gutenberg-1e-Delta.json rename to data/models/v000000_Qwen2.5-14B-Gutenberg-1e-Delta.json diff --git a/data/v000000_Qwen2.5-14B-Gutenberg-Instruct-Slerpeno.json b/data/models/v000000_Qwen2.5-14B-Gutenberg-Instruct-Slerpeno.json similarity index 100% rename from data/v000000_Qwen2.5-14B-Gutenberg-Instruct-Slerpeno.json rename to data/models/v000000_Qwen2.5-14B-Gutenberg-Instruct-Slerpeno.json diff --git a/data/v000000_Qwen2.5-Lumen-14B.json b/data/models/v000000_Qwen2.5-Lumen-14B.json similarity index 100% rename from data/v000000_Qwen2.5-Lumen-14B.json rename to data/models/v000000_Qwen2.5-Lumen-14B.json diff --git a/data/vhab10_Llama-3.1-8B-Base-Instruct-SLERP.json b/data/models/vhab10_Llama-3.1-8B-Base-Instruct-SLERP.json similarity index 100% rename from data/vhab10_Llama-3.1-8B-Base-Instruct-SLERP.json rename to data/models/vhab10_Llama-3.1-8B-Base-Instruct-SLERP.json diff --git a/data/vhab10_Llama-3.2-Instruct-3B-TIES.json b/data/models/vhab10_Llama-3.2-Instruct-3B-TIES.json similarity index 100% rename from data/vhab10_Llama-3.2-Instruct-3B-TIES.json rename to data/models/vhab10_Llama-3.2-Instruct-3B-TIES.json diff --git a/data/vhab10_llama-3-8b-merged-linear.json b/data/models/vhab10_llama-3-8b-merged-linear.json similarity index 100% rename from data/vhab10_llama-3-8b-merged-linear.json rename to data/models/vhab10_llama-3-8b-merged-linear.json diff --git a/data/vicgalle_CarbonBeagle-11B-truthy.json b/data/models/vicgalle_CarbonBeagle-11B-truthy.json similarity index 100% rename from data/vicgalle_CarbonBeagle-11B-truthy.json rename to data/models/vicgalle_CarbonBeagle-11B-truthy.json diff --git a/data/vicgalle_CarbonBeagle-11B.json b/data/models/vicgalle_CarbonBeagle-11B.json similarity index 100% rename from data/vicgalle_CarbonBeagle-11B.json rename to data/models/vicgalle_CarbonBeagle-11B.json diff --git a/data/vicgalle_Configurable-Hermes-2-Pro-Llama-3-8B.json b/data/models/vicgalle_Configurable-Hermes-2-Pro-Llama-3-8B.json similarity index 100% rename from data/vicgalle_Configurable-Hermes-2-Pro-Llama-3-8B.json rename to data/models/vicgalle_Configurable-Hermes-2-Pro-Llama-3-8B.json diff --git a/data/vicgalle_Configurable-Llama-3.1-8B-Instruct.json b/data/models/vicgalle_Configurable-Llama-3.1-8B-Instruct.json similarity index 100% rename from data/vicgalle_Configurable-Llama-3.1-8B-Instruct.json rename to data/models/vicgalle_Configurable-Llama-3.1-8B-Instruct.json diff --git a/data/vicgalle_Configurable-Yi-1.5-9B-Chat.json b/data/models/vicgalle_Configurable-Yi-1.5-9B-Chat.json similarity index 100% rename from data/vicgalle_Configurable-Yi-1.5-9B-Chat.json rename to data/models/vicgalle_Configurable-Yi-1.5-9B-Chat.json diff --git a/data/vicgalle_ConfigurableBeagle-11B.json b/data/models/vicgalle_ConfigurableBeagle-11B.json similarity index 100% rename from data/vicgalle_ConfigurableBeagle-11B.json rename to data/models/vicgalle_ConfigurableBeagle-11B.json diff --git a/data/vicgalle_ConfigurableHermes-7B.json b/data/models/vicgalle_ConfigurableHermes-7B.json similarity index 100% rename from data/vicgalle_ConfigurableHermes-7B.json rename to data/models/vicgalle_ConfigurableHermes-7B.json diff --git a/data/vicgalle_ConfigurableSOLAR-10.7B.json b/data/models/vicgalle_ConfigurableSOLAR-10.7B.json similarity index 100% rename from data/vicgalle_ConfigurableSOLAR-10.7B.json rename to data/models/vicgalle_ConfigurableSOLAR-10.7B.json diff --git a/data/vicgalle_Humanish-RP-Llama-3.1-8B.json b/data/models/vicgalle_Humanish-RP-Llama-3.1-8B.json similarity index 100% rename from data/vicgalle_Humanish-RP-Llama-3.1-8B.json rename to data/models/vicgalle_Humanish-RP-Llama-3.1-8B.json diff --git a/data/vicgalle_Merge-Mistral-Prometheus-7B.json b/data/models/vicgalle_Merge-Mistral-Prometheus-7B.json similarity index 100% rename from data/vicgalle_Merge-Mistral-Prometheus-7B.json rename to data/models/vicgalle_Merge-Mistral-Prometheus-7B.json diff --git a/data/vicgalle_Merge-Mixtral-Prometheus-8x7B.json b/data/models/vicgalle_Merge-Mixtral-Prometheus-8x7B.json similarity index 100% rename from data/vicgalle_Merge-Mixtral-Prometheus-8x7B.json rename to data/models/vicgalle_Merge-Mixtral-Prometheus-8x7B.json diff --git a/data/vicgalle_Roleplay-Llama-3-8B.json b/data/models/vicgalle_Roleplay-Llama-3-8B.json similarity index 100% rename from data/vicgalle_Roleplay-Llama-3-8B.json rename to data/models/vicgalle_Roleplay-Llama-3-8B.json diff --git a/data/viettelsecurity-ai_security-llama3.2-3b.json b/data/models/viettelsecurity-ai_security-llama3.2-3b.json similarity index 100% rename from data/viettelsecurity-ai_security-llama3.2-3b.json rename to data/models/viettelsecurity-ai_security-llama3.2-3b.json diff --git a/data/vihangd_smart-dan-sft-v0.1.json b/data/models/vihangd_smart-dan-sft-v0.1.json similarity index 100% rename from data/vihangd_smart-dan-sft-v0.1.json rename to data/models/vihangd_smart-dan-sft-v0.1.json diff --git a/data/voidful_smol-360m-ft.json b/data/models/voidful_smol-360m-ft.json similarity index 100% rename from data/voidful_smol-360m-ft.json rename to data/models/voidful_smol-360m-ft.json diff --git a/data/vonjack_MobileLLM-125M-HF.json b/data/models/vonjack_MobileLLM-125M-HF.json similarity index 100% rename from data/vonjack_MobileLLM-125M-HF.json rename to data/models/vonjack_MobileLLM-125M-HF.json diff --git a/data/vonjack_Phi-3-mini-4k-instruct-LLaMAfied.json b/data/models/vonjack_Phi-3-mini-4k-instruct-LLaMAfied.json similarity index 100% rename from data/vonjack_Phi-3-mini-4k-instruct-LLaMAfied.json rename to data/models/vonjack_Phi-3-mini-4k-instruct-LLaMAfied.json diff --git a/data/vonjack_Phi-3.5-mini-instruct-hermes-fc-json.json b/data/models/vonjack_Phi-3.5-mini-instruct-hermes-fc-json.json similarity index 100% rename from data/vonjack_Phi-3.5-mini-instruct-hermes-fc-json.json rename to data/models/vonjack_Phi-3.5-mini-instruct-hermes-fc-json.json diff --git a/data/vonjack_Qwen2.5-Coder-0.5B-Merged.json b/data/models/vonjack_Qwen2.5-Coder-0.5B-Merged.json similarity index 100% rename from data/vonjack_Qwen2.5-Coder-0.5B-Merged.json rename to data/models/vonjack_Qwen2.5-Coder-0.5B-Merged.json diff --git a/data/vonjack_SmolLM2-1.7B-Merged.json b/data/models/vonjack_SmolLM2-1.7B-Merged.json similarity index 100% rename from data/vonjack_SmolLM2-1.7B-Merged.json rename to data/models/vonjack_SmolLM2-1.7B-Merged.json diff --git a/data/vonjack_SmolLM2-135M-Merged.json b/data/models/vonjack_SmolLM2-135M-Merged.json similarity index 100% rename from data/vonjack_SmolLM2-135M-Merged.json rename to data/models/vonjack_SmolLM2-135M-Merged.json diff --git a/data/vonjack_SmolLM2-360M-Merged.json b/data/models/vonjack_SmolLM2-360M-Merged.json similarity index 100% rename from data/vonjack_SmolLM2-360M-Merged.json rename to data/models/vonjack_SmolLM2-360M-Merged.json diff --git a/data/w4r10ck_SOLAR-10.7B-Instruct-v1.0-uncensored.json b/data/models/w4r10ck_SOLAR-10.7B-Instruct-v1.0-uncensored.json similarity index 100% rename from data/w4r10ck_SOLAR-10.7B-Instruct-v1.0-uncensored.json rename to data/models/w4r10ck_SOLAR-10.7B-Instruct-v1.0-uncensored.json diff --git a/data/wanlige_li-14b-v0.4-slerp.json b/data/models/wanlige_li-14b-v0.4-slerp.json similarity index 100% rename from data/wanlige_li-14b-v0.4-slerp.json rename to data/models/wanlige_li-14b-v0.4-slerp.json diff --git a/data/wanlige_li-14b-v0.4-slerp0.1.json b/data/models/wanlige_li-14b-v0.4-slerp0.1.json similarity index 100% rename from data/wanlige_li-14b-v0.4-slerp0.1.json rename to data/models/wanlige_li-14b-v0.4-slerp0.1.json diff --git a/data/wanlige_li-14b-v0.4.json b/data/models/wanlige_li-14b-v0.4.json similarity index 100% rename from data/wanlige_li-14b-v0.4.json rename to data/models/wanlige_li-14b-v0.4.json diff --git a/data/wannaphong_KhanomTanLLM-Instruct.json b/data/models/wannaphong_KhanomTanLLM-Instruct.json similarity index 100% rename from data/wannaphong_KhanomTanLLM-Instruct.json rename to data/models/wannaphong_KhanomTanLLM-Instruct.json diff --git a/data/waqasali1707_Beast-Soul-new.json b/data/models/waqasali1707_Beast-Soul-new.json similarity index 100% rename from data/waqasali1707_Beast-Soul-new.json rename to data/models/waqasali1707_Beast-Soul-new.json diff --git a/data/wave-on-discord_qwent-7b.json b/data/models/wave-on-discord_qwent-7b.json similarity index 100% rename from data/wave-on-discord_qwent-7b.json rename to data/models/wave-on-discord_qwent-7b.json diff --git a/data/weathermanj_Menda-3B-500.json b/data/models/weathermanj_Menda-3B-500.json similarity index 100% rename from data/weathermanj_Menda-3B-500.json rename to data/models/weathermanj_Menda-3B-500.json diff --git a/data/weathermanj_Menda-3b-750.json b/data/models/weathermanj_Menda-3b-750.json similarity index 100% rename from data/weathermanj_Menda-3b-750.json rename to data/models/weathermanj_Menda-3b-750.json diff --git a/data/weathermanj_Menda-3b-Optim-100.json b/data/models/weathermanj_Menda-3b-Optim-100.json similarity index 100% rename from data/weathermanj_Menda-3b-Optim-100.json rename to data/models/weathermanj_Menda-3b-Optim-100.json diff --git a/data/weathermanj_Menda-3b-Optim-200.json b/data/models/weathermanj_Menda-3b-Optim-200.json similarity index 100% rename from data/weathermanj_Menda-3b-Optim-200.json rename to data/models/weathermanj_Menda-3b-Optim-200.json diff --git a/data/wenbopan_Faro-Yi-9B-DPO.json b/data/models/wenbopan_Faro-Yi-9B-DPO.json similarity index 100% rename from data/wenbopan_Faro-Yi-9B-DPO.json rename to data/models/wenbopan_Faro-Yi-9B-DPO.json diff --git a/data/weqweasdas_RM-Gemma-2B.json b/data/models/weqweasdas_RM-Gemma-2B.json similarity index 100% rename from data/weqweasdas_RM-Gemma-2B.json rename to data/models/weqweasdas_RM-Gemma-2B.json diff --git a/data/weqweasdas_RM-Gemma-7B-4096.json b/data/models/weqweasdas_RM-Gemma-7B-4096.json similarity index 100% rename from data/weqweasdas_RM-Gemma-7B-4096.json rename to data/models/weqweasdas_RM-Gemma-7B-4096.json diff --git a/data/weqweasdas_RM-Gemma-7B.json b/data/models/weqweasdas_RM-Gemma-7B.json similarity index 100% rename from data/weqweasdas_RM-Gemma-7B.json rename to data/models/weqweasdas_RM-Gemma-7B.json diff --git a/data/weqweasdas_RM-Mistral-7B.json b/data/models/weqweasdas_RM-Mistral-7B.json similarity index 100% rename from data/weqweasdas_RM-Mistral-7B.json rename to data/models/weqweasdas_RM-Mistral-7B.json index 014b8589e308fa2571a3ca85971364da616341ae..2c0c95b4657b4530753b94c6c05b68b220f49072 100644 --- a/data/weqweasdas_RM-Mistral-7B.json +++ b/data/models/weqweasdas_RM-Mistral-7B.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/weqweasdas_RM-Mistral-7B/1766412838.146816", + "evaluation_id": "reward-bench-2/weqweasdas_RM-Mistral-7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7982 + "score": 0.596 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9665 + "score": 0.5937 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6053 + "score": 0.3438 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5956 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8703 + "score": 0.6911 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7736 + "score": 0.7293 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.753 + "score": 0.6226 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/weqweasdas_RM-Mistral-7B/1766412838.146816", + "evaluation_id": "reward-bench/weqweasdas_RM-Mistral-7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5937 + "score": 0.7982 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3438 + "score": 0.9665 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5956 + "score": 0.6053 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6911 + "score": 0.8703 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7293 + "score": 0.7736 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6226 + "score": 0.753 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/weqweasdas_hh_rlhf_rm_open_llama_3b.json b/data/models/weqweasdas_hh_rlhf_rm_open_llama_3b.json similarity index 100% rename from data/weqweasdas_hh_rlhf_rm_open_llama_3b.json rename to data/models/weqweasdas_hh_rlhf_rm_open_llama_3b.json index f9d2b14f0a0f79f11e39957c0f38b89aa1e78ac9..564c2ceeb0768d947ec7e8507c351558f76e3907 100644 --- a/data/weqweasdas_hh_rlhf_rm_open_llama_3b.json +++ b/data/models/weqweasdas_hh_rlhf_rm_open_llama_3b.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816", + "evaluation_id": "reward-bench-2/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5027 + "score": 0.2498 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8184 + "score": 0.3642 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3728 + "score": 0.275 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3497 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4149 + "score": 0.24 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3281 + "score": 0.2384 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6564 + "score": 0.0315 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816", + "evaluation_id": "reward-bench/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2498 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3642 + "score": 0.5027 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.275 + "score": 0.8184 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3497 + "score": 0.3728 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.24 + "score": 0.4149 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2384 + "score": 0.3281 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0315 + "score": 0.6564 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/win10_ArliAI-RPMax-v1.3-merge-13.3B.json b/data/models/win10_ArliAI-RPMax-v1.3-merge-13.3B.json similarity index 100% rename from data/win10_ArliAI-RPMax-v1.3-merge-13.3B.json rename to data/models/win10_ArliAI-RPMax-v1.3-merge-13.3B.json diff --git a/data/win10_Breeze-13B-32k-Instruct-v1_0.json b/data/models/win10_Breeze-13B-32k-Instruct-v1_0.json similarity index 100% rename from data/win10_Breeze-13B-32k-Instruct-v1_0.json rename to data/models/win10_Breeze-13B-32k-Instruct-v1_0.json diff --git a/data/win10_EVA-Norns-Qwen2.5-v0.1.json b/data/models/win10_EVA-Norns-Qwen2.5-v0.1.json similarity index 100% rename from data/win10_EVA-Norns-Qwen2.5-v0.1.json rename to data/models/win10_EVA-Norns-Qwen2.5-v0.1.json diff --git a/data/win10_Llama-3.2-3B-Instruct-24-9-29.json b/data/models/win10_Llama-3.2-3B-Instruct-24-9-29.json similarity index 100% rename from data/win10_Llama-3.2-3B-Instruct-24-9-29.json rename to data/models/win10_Llama-3.2-3B-Instruct-24-9-29.json diff --git a/data/win10_Norns-Qwen2.5-12B.json b/data/models/win10_Norns-Qwen2.5-12B.json similarity index 100% rename from data/win10_Norns-Qwen2.5-12B.json rename to data/models/win10_Norns-Qwen2.5-12B.json diff --git a/data/win10_Norns-Qwen2.5-7B.json b/data/models/win10_Norns-Qwen2.5-7B.json similarity index 100% rename from data/win10_Norns-Qwen2.5-7B.json rename to data/models/win10_Norns-Qwen2.5-7B.json diff --git a/data/win10_Qwen2.5-2B-Instruct.json b/data/models/win10_Qwen2.5-2B-Instruct.json similarity index 100% rename from data/win10_Qwen2.5-2B-Instruct.json rename to data/models/win10_Qwen2.5-2B-Instruct.json diff --git a/data/win10_llama3-13.45b-Instruct.json b/data/models/win10_llama3-13.45b-Instruct.json similarity index 100% rename from data/win10_llama3-13.45b-Instruct.json rename to data/models/win10_llama3-13.45b-Instruct.json diff --git a/data/win10_miscii-14b-1M-0128.json b/data/models/win10_miscii-14b-1M-0128.json similarity index 100% rename from data/win10_miscii-14b-1M-0128.json rename to data/models/win10_miscii-14b-1M-0128.json diff --git a/data/winglian_Llama-3-8b-64k-PoSE.json b/data/models/winglian_Llama-3-8b-64k-PoSE.json similarity index 100% rename from data/winglian_Llama-3-8b-64k-PoSE.json rename to data/models/winglian_Llama-3-8b-64k-PoSE.json diff --git a/data/winglian_llama-3-8b-256k-PoSE.json b/data/models/winglian_llama-3-8b-256k-PoSE.json similarity index 100% rename from data/winglian_llama-3-8b-256k-PoSE.json rename to data/models/winglian_llama-3-8b-256k-PoSE.json diff --git a/data/writer_InstructPalmyra-30B.json b/data/models/writer_InstructPalmyra-30B.json similarity index 100% rename from data/writer_InstructPalmyra-30B.json rename to data/models/writer_InstructPalmyra-30B.json diff --git a/data/writer_palmyra-fin.json b/data/models/writer_palmyra-fin.json similarity index 100% rename from data/writer_palmyra-fin.json rename to data/models/writer_palmyra-fin.json diff --git a/data/writer_palmyra-med.json b/data/models/writer_palmyra-med.json similarity index 100% rename from data/writer_palmyra-med.json rename to data/models/writer_palmyra-med.json diff --git a/data/writer_palmyra-x-004.json b/data/models/writer_palmyra-x-004.json similarity index 100% rename from data/writer_palmyra-x-004.json rename to data/models/writer_palmyra-x-004.json diff --git a/data/writer_palmyra-x-v2.json b/data/models/writer_palmyra-x-v2.json similarity index 100% rename from data/writer_palmyra-x-v2.json rename to data/models/writer_palmyra-x-v2.json diff --git a/data/writer_palmyra-x-v3.json b/data/models/writer_palmyra-x-v3.json similarity index 100% rename from data/writer_palmyra-x-v3.json rename to data/models/writer_palmyra-x-v3.json diff --git a/data/writer_palmyra-x5.json b/data/models/writer_palmyra-x5.json similarity index 100% rename from data/writer_palmyra-x5.json rename to data/models/writer_palmyra-x5.json diff --git a/data/wzhouad_gemma-2-9b-it-WPO-HB.json b/data/models/wzhouad_gemma-2-9b-it-WPO-HB.json similarity index 100% rename from data/wzhouad_gemma-2-9b-it-WPO-HB.json rename to data/models/wzhouad_gemma-2-9b-it-WPO-HB.json diff --git a/data/x0000001_Deepseek-Lumen-R1-Qwen2.5-14B.json b/data/models/x0000001_Deepseek-Lumen-R1-Qwen2.5-14B.json similarity index 100% rename from data/x0000001_Deepseek-Lumen-R1-Qwen2.5-14B.json rename to data/models/x0000001_Deepseek-Lumen-R1-Qwen2.5-14B.json diff --git a/data/xMaulana_FinMatcha-3B-Instruct.json b/data/models/xMaulana_FinMatcha-3B-Instruct.json similarity index 100% rename from data/xMaulana_FinMatcha-3B-Instruct.json rename to data/models/xMaulana_FinMatcha-3B-Instruct.json diff --git a/data/xai_Grok_4.json b/data/models/xai_Grok_4.json similarity index 100% rename from data/xai_Grok_4.json rename to data/models/xai_Grok_4.json diff --git a/data/xai_grok-3-beta.json b/data/models/xai_grok-3-beta.json similarity index 100% rename from data/xai_grok-3-beta.json rename to data/models/xai_grok-3-beta.json diff --git a/data/xai_grok-3-mini-beta.json b/data/models/xai_grok-3-mini-beta.json similarity index 100% rename from data/xai_grok-3-mini-beta.json rename to data/models/xai_grok-3-mini-beta.json diff --git a/data/xai_grok-3-mini.json b/data/models/xai_grok-3-mini.json similarity index 100% rename from data/xai_grok-3-mini.json rename to data/models/xai_grok-3-mini.json index 3b47d5d8a39c7b750bf72a3cc9e7d3b9930e0c45..cc49a8d8e62fa9947f05c56d3c235f6c00fb5c43 100644 --- a/data/xai_grok-3-mini.json +++ b/data/models/xai_grok-3-mini.json @@ -10,8 +10,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/xai_grok-3-mini/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/xai_grok-3-mini/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -525,8 +525,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/xai_grok-3-mini/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/xai_grok-3-mini/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/xai_grok-4-0709.json b/data/models/xai_grok-4-0709.json similarity index 100% rename from data/xai_grok-4-0709.json rename to data/models/xai_grok-4-0709.json index 838e8a4c652b167f0ca9e369c9a16e0d914d31c1..17df1c3c71c8bde63dde1258e2fa6f3d90db167c 100644 --- a/data/xai_grok-4-0709.json +++ b/data/models/xai_grok-4-0709.json @@ -7,8 +7,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/xai_grok-4-0709/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/xai_grok-4-0709/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -522,8 +522,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/xai_grok-4-0709/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/xai_grok-4-0709/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/xai_grok-4.json b/data/models/xai_grok-4.json similarity index 98% rename from data/xai_grok-4.json rename to data/models/xai_grok-4.json index f874bd900d3ef43e4eee8abe6e14222905fb7f59..cecbb7f7fed16e61050c3064764dc9d378c04607 100644 --- a/data/xai_grok-4.json +++ b/data/models/xai_grok-4.json @@ -4,13 +4,13 @@ "id": "xai/grok-4", "developer": "xAI", "additional_details": { - "agent_name": "Mini-SWE-Agent", - "agent_organization": "Princeton" + "agent_name": "Terminus 2", + "agent_organization": "Terminal Bench" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/openhands__grok-4/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__grok-4/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 27.2, + "score": 25.4, "uncertainty": { "standard_error": { - "value": 3.1 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Grok 4\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok 4\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Grok 4\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok 4\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__grok-4/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__grok-4/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 23.1, + "score": 27.2, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 3.1 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok 4\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Grok 4\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok 4\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Grok 4\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__grok-4/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__grok-4/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,7 +191,7 @@ "max_score": 100.0 }, "score_details": { - "score": 25.4, + "score": 23.1, "uncertainty": { "standard_error": { "value": 2.9 @@ -201,7 +201,7 @@ }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok 4\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok 4\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok 4\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok 4\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/xai_grok-code-fast-1.json b/data/models/xai_grok-code-fast-1.json similarity index 100% rename from data/xai_grok-code-fast-1.json rename to data/models/xai_grok-code-fast-1.json diff --git a/data/xinchen9_Llama3.1_8B_Instruct_CoT.json b/data/models/xinchen9_Llama3.1_8B_Instruct_CoT.json similarity index 100% rename from data/xinchen9_Llama3.1_8B_Instruct_CoT.json rename to data/models/xinchen9_Llama3.1_8B_Instruct_CoT.json diff --git a/data/xinchen9_Llama3.1_CoT.json b/data/models/xinchen9_Llama3.1_CoT.json similarity index 100% rename from data/xinchen9_Llama3.1_CoT.json rename to data/models/xinchen9_Llama3.1_CoT.json diff --git a/data/xinchen9_Llama3.1_CoT_V1.json b/data/models/xinchen9_Llama3.1_CoT_V1.json similarity index 100% rename from data/xinchen9_Llama3.1_CoT_V1.json rename to data/models/xinchen9_Llama3.1_CoT_V1.json diff --git a/data/xinchen9_Mistral-7B-CoT.json b/data/models/xinchen9_Mistral-7B-CoT.json similarity index 100% rename from data/xinchen9_Mistral-7B-CoT.json rename to data/models/xinchen9_Mistral-7B-CoT.json diff --git a/data/xinchen9_llama3-b8-ft-dis.json b/data/models/xinchen9_llama3-b8-ft-dis.json similarity index 100% rename from data/xinchen9_llama3-b8-ft-dis.json rename to data/models/xinchen9_llama3-b8-ft-dis.json diff --git a/data/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table.json b/data/models/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table.json similarity index 100% rename from data/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table.json rename to data/models/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table.json diff --git a/data/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table.json b/data/models/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table.json similarity index 100% rename from data/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table.json rename to data/models/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table.json diff --git a/data/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table.json b/data/models/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table.json similarity index 100% rename from data/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table.json rename to data/models/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table.json diff --git a/data/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table.json b/data/models/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table.json similarity index 100% rename from data/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table.json rename to data/models/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table.json diff --git a/data/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001.json b/data/models/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001.json similarity index 100% rename from data/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001.json rename to data/models/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001.json diff --git a/data/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002.json b/data/models/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002.json similarity index 100% rename from data/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002.json rename to data/models/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002.json diff --git a/data/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001.json b/data/models/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001.json similarity index 100% rename from data/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001.json rename to data/models/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001.json diff --git a/data/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002.json b/data/models/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002.json similarity index 100% rename from data/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002.json rename to data/models/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002.json diff --git a/data/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table.json b/data/models/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table.json similarity index 100% rename from data/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table.json rename to data/models/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table.json diff --git a/data/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table.json b/data/models/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table.json similarity index 100% rename from data/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table.json rename to data/models/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table.json diff --git a/data/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table.json b/data/models/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table.json similarity index 100% rename from data/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table.json rename to data/models/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table.json diff --git a/data/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table.json b/data/models/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table.json similarity index 100% rename from data/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table.json rename to data/models/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table.json diff --git a/data/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001.json b/data/models/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001.json similarity index 100% rename from data/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001.json rename to data/models/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001.json diff --git a/data/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002.json b/data/models/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002.json similarity index 100% rename from data/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002.json rename to data/models/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002.json diff --git a/data/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001.json b/data/models/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001.json similarity index 100% rename from data/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001.json rename to data/models/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001.json diff --git a/data/xukp20_llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table.json b/data/models/xukp20_llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table.json similarity index 100% rename from data/xukp20_llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table.json rename to data/models/xukp20_llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table.json diff --git a/data/xwen-team_Xwen-7B-Chat.json b/data/models/xwen-team_Xwen-7B-Chat.json similarity index 100% rename from data/xwen-team_Xwen-7B-Chat.json rename to data/models/xwen-team_Xwen-7B-Chat.json diff --git a/data/xxx777xxxASD_L3.1-ClaudeMaid-4x8B.json b/data/models/xxx777xxxASD_L3.1-ClaudeMaid-4x8B.json similarity index 100% rename from data/xxx777xxxASD_L3.1-ClaudeMaid-4x8B.json rename to data/models/xxx777xxxASD_L3.1-ClaudeMaid-4x8B.json diff --git a/data/yam-peleg_Hebrew-Gemma-11B-Instruct.json b/data/models/yam-peleg_Hebrew-Gemma-11B-Instruct.json similarity index 100% rename from data/yam-peleg_Hebrew-Gemma-11B-Instruct.json rename to data/models/yam-peleg_Hebrew-Gemma-11B-Instruct.json diff --git a/data/yam-peleg_Hebrew-Mistral-7B-200K.json b/data/models/yam-peleg_Hebrew-Mistral-7B-200K.json similarity index 100% rename from data/yam-peleg_Hebrew-Mistral-7B-200K.json rename to data/models/yam-peleg_Hebrew-Mistral-7B-200K.json diff --git a/data/yam-peleg_Hebrew-Mistral-7B.json b/data/models/yam-peleg_Hebrew-Mistral-7B.json similarity index 100% rename from data/yam-peleg_Hebrew-Mistral-7B.json rename to data/models/yam-peleg_Hebrew-Mistral-7B.json diff --git a/data/yandex_YaLM-100B.json b/data/models/yandex_YaLM-100B.json similarity index 100% rename from data/yandex_YaLM-100B.json rename to data/models/yandex_YaLM-100B.json diff --git a/data/yanng1242_Marcoro14-7B-slerp.json b/data/models/yanng1242_Marcoro14-7B-slerp.json similarity index 100% rename from data/yanng1242_Marcoro14-7B-slerp.json rename to data/models/yanng1242_Marcoro14-7B-slerp.json diff --git a/data/yasserrmd_Coder-GRPO-3B.json b/data/models/yasserrmd_Coder-GRPO-3B.json similarity index 100% rename from data/yasserrmd_Coder-GRPO-3B.json rename to data/models/yasserrmd_Coder-GRPO-3B.json diff --git a/data/yasserrmd_Text2SQL-1.5B.json b/data/models/yasserrmd_Text2SQL-1.5B.json similarity index 100% rename from data/yasserrmd_Text2SQL-1.5B.json rename to data/models/yasserrmd_Text2SQL-1.5B.json diff --git a/data/ycros_BagelMIsteryTour-v2-8x7B.json b/data/models/ycros_BagelMIsteryTour-v2-8x7B.json similarity index 99% rename from data/ycros_BagelMIsteryTour-v2-8x7B.json rename to data/models/ycros_BagelMIsteryTour-v2-8x7B.json index 2bcd010c6e295398dfc31aaa7ad3c8c43e7bf892..d6c7a18430be1a19c6fdc4567a70f9d23dddc0f4 100644 --- a/data/ycros_BagelMIsteryTour-v2-8x7B.json +++ b/data/models/ycros_BagelMIsteryTour-v2-8x7B.json @@ -5,7 +5,7 @@ "developer": "ycros", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "MixtralForCausalLM", "params_billions": "46.703" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5994 + "score": 0.6262 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5159 + "score": 0.5142 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0785 + "score": 0.0937 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3045 + "score": 0.3079 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4203 + "score": 0.4138 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3473 + "score": 0.3481 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6262 + "score": 0.5994 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5142 + "score": 0.5159 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0937 + "score": 0.0785 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3079 + "score": 0.3045 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4138 + "score": 0.4203 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3481 + "score": 0.3473 } } ], diff --git a/data/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table.json b/data/models/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table.json similarity index 100% rename from data/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table.json rename to data/models/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table.json diff --git a/data/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table.json b/data/models/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table.json similarity index 100% rename from data/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table.json rename to data/models/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table.json diff --git a/data/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table.json b/data/models/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table.json similarity index 100% rename from data/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table.json rename to data/models/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table.json diff --git a/data/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table.json b/data/models/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table.json similarity index 100% rename from data/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table.json rename to data/models/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table.json diff --git a/data/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001.json b/data/models/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001.json similarity index 100% rename from data/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001.json rename to data/models/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001.json diff --git a/data/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002.json b/data/models/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002.json similarity index 100% rename from data/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002.json rename to data/models/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002.json diff --git a/data/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001.json b/data/models/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001.json similarity index 100% rename from data/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001.json rename to data/models/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001.json diff --git a/data/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002.json b/data/models/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002.json similarity index 100% rename from data/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002.json rename to data/models/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002.json diff --git a/data/yifAI_Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002.json b/data/models/yifAI_Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002.json similarity index 100% rename from data/yifAI_Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002.json rename to data/models/yifAI_Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002.json diff --git a/data/ylalain_ECE-PRYMMAL-YL-1B-SLERP-V8.json b/data/models/ylalain_ECE-PRYMMAL-YL-1B-SLERP-V8.json similarity index 100% rename from data/ylalain_ECE-PRYMMAL-YL-1B-SLERP-V8.json rename to data/models/ylalain_ECE-PRYMMAL-YL-1B-SLERP-V8.json diff --git a/data/ymcki_Llama-3.1-8B-GRPO-Instruct.json b/data/models/ymcki_Llama-3.1-8B-GRPO-Instruct.json similarity index 100% rename from data/ymcki_Llama-3.1-8B-GRPO-Instruct.json rename to data/models/ymcki_Llama-3.1-8B-GRPO-Instruct.json diff --git a/data/ymcki_Llama-3.1-8B-SFT-GRPO-Instruct.json b/data/models/ymcki_Llama-3.1-8B-SFT-GRPO-Instruct.json similarity index 100% rename from data/ymcki_Llama-3.1-8B-SFT-GRPO-Instruct.json rename to data/models/ymcki_Llama-3.1-8B-SFT-GRPO-Instruct.json diff --git a/data/ymcki_gemma-2-2b-ORPO-jpn-it-abliterated-18-merge.json b/data/models/ymcki_gemma-2-2b-ORPO-jpn-it-abliterated-18-merge.json similarity index 100% rename from data/ymcki_gemma-2-2b-ORPO-jpn-it-abliterated-18-merge.json rename to data/models/ymcki_gemma-2-2b-ORPO-jpn-it-abliterated-18-merge.json diff --git a/data/ymcki_gemma-2-2b-ORPO-jpn-it-abliterated-18.json b/data/models/ymcki_gemma-2-2b-ORPO-jpn-it-abliterated-18.json similarity index 100% rename from data/ymcki_gemma-2-2b-ORPO-jpn-it-abliterated-18.json rename to data/models/ymcki_gemma-2-2b-ORPO-jpn-it-abliterated-18.json diff --git a/data/ymcki_gemma-2-2b-jpn-it-abliterated-17-18-24.json b/data/models/ymcki_gemma-2-2b-jpn-it-abliterated-17-18-24.json similarity index 100% rename from data/ymcki_gemma-2-2b-jpn-it-abliterated-17-18-24.json rename to data/models/ymcki_gemma-2-2b-jpn-it-abliterated-17-18-24.json diff --git a/data/ymcki_gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca.json b/data/models/ymcki_gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca.json similarity index 100% rename from data/ymcki_gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca.json rename to data/models/ymcki_gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca.json diff --git a/data/ymcki_gemma-2-2b-jpn-it-abliterated-17-ORPO.json b/data/models/ymcki_gemma-2-2b-jpn-it-abliterated-17-ORPO.json similarity index 100% rename from data/ymcki_gemma-2-2b-jpn-it-abliterated-17-ORPO.json rename to data/models/ymcki_gemma-2-2b-jpn-it-abliterated-17-ORPO.json diff --git a/data/ymcki_gemma-2-2b-jpn-it-abliterated-17.json b/data/models/ymcki_gemma-2-2b-jpn-it-abliterated-17.json similarity index 100% rename from data/ymcki_gemma-2-2b-jpn-it-abliterated-17.json rename to data/models/ymcki_gemma-2-2b-jpn-it-abliterated-17.json diff --git a/data/ymcki_gemma-2-2b-jpn-it-abliterated-18-ORPO.json b/data/models/ymcki_gemma-2-2b-jpn-it-abliterated-18-ORPO.json similarity index 100% rename from data/ymcki_gemma-2-2b-jpn-it-abliterated-18-ORPO.json rename to data/models/ymcki_gemma-2-2b-jpn-it-abliterated-18-ORPO.json diff --git a/data/ymcki_gemma-2-2b-jpn-it-abliterated-18.json b/data/models/ymcki_gemma-2-2b-jpn-it-abliterated-18.json similarity index 100% rename from data/ymcki_gemma-2-2b-jpn-it-abliterated-18.json rename to data/models/ymcki_gemma-2-2b-jpn-it-abliterated-18.json diff --git a/data/ymcki_gemma-2-2b-jpn-it-abliterated-24.json b/data/models/ymcki_gemma-2-2b-jpn-it-abliterated-24.json similarity index 100% rename from data/ymcki_gemma-2-2b-jpn-it-abliterated-24.json rename to data/models/ymcki_gemma-2-2b-jpn-it-abliterated-24.json diff --git a/data/yuchenxie_ArlowGPT-3B-Multilingual.json b/data/models/yuchenxie_ArlowGPT-3B-Multilingual.json similarity index 100% rename from data/yuchenxie_ArlowGPT-3B-Multilingual.json rename to data/models/yuchenxie_ArlowGPT-3B-Multilingual.json diff --git a/data/yuchenxie_ArlowGPT-8B.json b/data/models/yuchenxie_ArlowGPT-8B.json similarity index 100% rename from data/yuchenxie_ArlowGPT-8B.json rename to data/models/yuchenxie_ArlowGPT-8B.json diff --git a/data/yuvraj17_Llama3-8B-SuperNova-Spectrum-Hermes-DPO.json b/data/models/yuvraj17_Llama3-8B-SuperNova-Spectrum-Hermes-DPO.json similarity index 100% rename from data/yuvraj17_Llama3-8B-SuperNova-Spectrum-Hermes-DPO.json rename to data/models/yuvraj17_Llama3-8B-SuperNova-Spectrum-Hermes-DPO.json diff --git a/data/yuvraj17_Llama3-8B-SuperNova-Spectrum-dare_ties.json b/data/models/yuvraj17_Llama3-8B-SuperNova-Spectrum-dare_ties.json similarity index 100% rename from data/yuvraj17_Llama3-8B-SuperNova-Spectrum-dare_ties.json rename to data/models/yuvraj17_Llama3-8B-SuperNova-Spectrum-dare_ties.json diff --git a/data/yuvraj17_Llama3-8B-abliterated-Spectrum-slerp.json b/data/models/yuvraj17_Llama3-8B-abliterated-Spectrum-slerp.json similarity index 100% rename from data/yuvraj17_Llama3-8B-abliterated-Spectrum-slerp.json rename to data/models/yuvraj17_Llama3-8B-abliterated-Spectrum-slerp.json diff --git a/data/z-ai_glm-4.5.json b/data/models/z-ai_glm-4.5.json similarity index 100% rename from data/z-ai_glm-4.5.json rename to data/models/z-ai_glm-4.5.json diff --git a/data/zai-org_glm-4.5-air-fp8.json b/data/models/zai-org_glm-4.5-air-fp8.json similarity index 100% rename from data/zai-org_glm-4.5-air-fp8.json rename to data/models/zai-org_glm-4.5-air-fp8.json diff --git a/data/zake7749_gemma-2-2b-it-chinese-kyara-dpo.json b/data/models/zake7749_gemma-2-2b-it-chinese-kyara-dpo.json similarity index 100% rename from data/zake7749_gemma-2-2b-it-chinese-kyara-dpo.json rename to data/models/zake7749_gemma-2-2b-it-chinese-kyara-dpo.json diff --git a/data/zake7749_gemma-2-9b-it-chinese-kyara.json b/data/models/zake7749_gemma-2-9b-it-chinese-kyara.json similarity index 100% rename from data/zake7749_gemma-2-9b-it-chinese-kyara.json rename to data/models/zake7749_gemma-2-9b-it-chinese-kyara.json diff --git a/data/zelk12_Gemma-2-TM-9B.json b/data/models/zelk12_Gemma-2-TM-9B.json similarity index 100% rename from data/zelk12_Gemma-2-TM-9B.json rename to data/models/zelk12_Gemma-2-TM-9B.json diff --git a/data/zelk12_MT-Gen1-gemma-2-9B.json b/data/models/zelk12_MT-Gen1-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Gen1-gemma-2-9B.json rename to data/models/zelk12_MT-Gen1-gemma-2-9B.json diff --git a/data/zelk12_MT-Gen2-GI-gemma-2-9B.json b/data/models/zelk12_MT-Gen2-GI-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Gen2-GI-gemma-2-9B.json rename to data/models/zelk12_MT-Gen2-GI-gemma-2-9B.json diff --git a/data/zelk12_MT-Gen2-gemma-2-9B.json b/data/models/zelk12_MT-Gen2-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Gen2-gemma-2-9B.json rename to data/models/zelk12_MT-Gen2-gemma-2-9B.json diff --git a/data/zelk12_MT-Gen3-gemma-2-9B.json b/data/models/zelk12_MT-Gen3-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Gen3-gemma-2-9B.json rename to data/models/zelk12_MT-Gen3-gemma-2-9B.json diff --git a/data/zelk12_MT-Gen4-gemma-2-9B.json b/data/models/zelk12_MT-Gen4-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Gen4-gemma-2-9B.json rename to data/models/zelk12_MT-Gen4-gemma-2-9B.json diff --git a/data/zelk12_MT-Gen5-gemma-2-9B.json b/data/models/zelk12_MT-Gen5-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Gen5-gemma-2-9B.json rename to data/models/zelk12_MT-Gen5-gemma-2-9B.json diff --git a/data/zelk12_MT-Gen6-gemma-2-9B.json b/data/models/zelk12_MT-Gen6-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Gen6-gemma-2-9B.json rename to data/models/zelk12_MT-Gen6-gemma-2-9B.json diff --git a/data/zelk12_MT-Gen6fix-gemma-2-9B.json b/data/models/zelk12_MT-Gen6fix-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Gen6fix-gemma-2-9B.json rename to data/models/zelk12_MT-Gen6fix-gemma-2-9B.json diff --git a/data/zelk12_MT-Gen7-gemma-2-9B.json b/data/models/zelk12_MT-Gen7-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Gen7-gemma-2-9B.json rename to data/models/zelk12_MT-Gen7-gemma-2-9B.json diff --git a/data/zelk12_MT-Max-Merge_02012025163610-gemma-2-9B.json b/data/models/zelk12_MT-Max-Merge_02012025163610-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Max-Merge_02012025163610-gemma-2-9B.json rename to data/models/zelk12_MT-Max-Merge_02012025163610-gemma-2-9B.json diff --git a/data/zelk12_MT-Merge-gemma-2-9B.json b/data/models/zelk12_MT-Merge-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Merge-gemma-2-9B.json rename to data/models/zelk12_MT-Merge-gemma-2-9B.json diff --git a/data/zelk12_MT-Merge1-gemma-2-9B.json b/data/models/zelk12_MT-Merge1-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Merge1-gemma-2-9B.json rename to data/models/zelk12_MT-Merge1-gemma-2-9B.json diff --git a/data/zelk12_MT-Merge2-MU-gemma-2-MTg2MT1g2-9B.json b/data/models/zelk12_MT-Merge2-MU-gemma-2-MTg2MT1g2-9B.json similarity index 100% rename from data/zelk12_MT-Merge2-MU-gemma-2-MTg2MT1g2-9B.json rename to data/models/zelk12_MT-Merge2-MU-gemma-2-MTg2MT1g2-9B.json diff --git a/data/zelk12_MT-Merge2-gemma-2-9B.json b/data/models/zelk12_MT-Merge2-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Merge2-gemma-2-9B.json rename to data/models/zelk12_MT-Merge2-gemma-2-9B.json diff --git a/data/zelk12_MT-Merge3-gemma-2-9B.json b/data/models/zelk12_MT-Merge3-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Merge3-gemma-2-9B.json rename to data/models/zelk12_MT-Merge3-gemma-2-9B.json diff --git a/data/zelk12_MT-Merge4-gemma-2-9B.json b/data/models/zelk12_MT-Merge4-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Merge4-gemma-2-9B.json rename to data/models/zelk12_MT-Merge4-gemma-2-9B.json diff --git a/data/zelk12_MT-Merge5-gemma-2-9B.json b/data/models/zelk12_MT-Merge5-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Merge5-gemma-2-9B.json rename to data/models/zelk12_MT-Merge5-gemma-2-9B.json diff --git a/data/zelk12_MT-Merge6-gemma-2-9B.json b/data/models/zelk12_MT-Merge6-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-Merge6-gemma-2-9B.json rename to data/models/zelk12_MT-Merge6-gemma-2-9B.json diff --git a/data/zelk12_MT-gemma-2-9B.json b/data/models/zelk12_MT-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT-gemma-2-9B.json rename to data/models/zelk12_MT-gemma-2-9B.json diff --git a/data/zelk12_MT1-Gen1-gemma-2-9B.json b/data/models/zelk12_MT1-Gen1-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT1-Gen1-gemma-2-9B.json rename to data/models/zelk12_MT1-Gen1-gemma-2-9B.json diff --git a/data/zelk12_MT1-Gen2-gemma-2-9B.json b/data/models/zelk12_MT1-Gen2-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT1-Gen2-gemma-2-9B.json rename to data/models/zelk12_MT1-Gen2-gemma-2-9B.json diff --git a/data/zelk12_MT1-Gen3-gemma-2-9B.json b/data/models/zelk12_MT1-Gen3-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT1-Gen3-gemma-2-9B.json rename to data/models/zelk12_MT1-Gen3-gemma-2-9B.json diff --git a/data/zelk12_MT1-Gen4-gemma-2-9B.json b/data/models/zelk12_MT1-Gen4-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT1-Gen4-gemma-2-9B.json rename to data/models/zelk12_MT1-Gen4-gemma-2-9B.json diff --git a/data/zelk12_MT1-Gen5-IF-gemma-2-S2DMv1-9B.json b/data/models/zelk12_MT1-Gen5-IF-gemma-2-S2DMv1-9B.json similarity index 100% rename from data/zelk12_MT1-Gen5-IF-gemma-2-S2DMv1-9B.json rename to data/models/zelk12_MT1-Gen5-IF-gemma-2-S2DMv1-9B.json diff --git a/data/zelk12_MT1-Gen5-gemma-2-9B.json b/data/models/zelk12_MT1-Gen5-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT1-Gen5-gemma-2-9B.json rename to data/models/zelk12_MT1-Gen5-gemma-2-9B.json diff --git a/data/zelk12_MT1-Gen6-gemma-2-9B.json b/data/models/zelk12_MT1-Gen6-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT1-Gen6-gemma-2-9B.json rename to data/models/zelk12_MT1-Gen6-gemma-2-9B.json diff --git a/data/zelk12_MT1-Gen7-gemma-2-9B.json b/data/models/zelk12_MT1-Gen7-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT1-Gen7-gemma-2-9B.json rename to data/models/zelk12_MT1-Gen7-gemma-2-9B.json diff --git a/data/zelk12_MT1-Max-Merge_02012025163610-gemma-2-9B.json b/data/models/zelk12_MT1-Max-Merge_02012025163610-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT1-Max-Merge_02012025163610-gemma-2-9B.json rename to data/models/zelk12_MT1-Max-Merge_02012025163610-gemma-2-9B.json diff --git a/data/zelk12_MT1-gemma-2-9B.json b/data/models/zelk12_MT1-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT1-gemma-2-9B.json rename to data/models/zelk12_MT1-gemma-2-9B.json diff --git a/data/zelk12_MT2-Gen1-gemma-2-9B.json b/data/models/zelk12_MT2-Gen1-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT2-Gen1-gemma-2-9B.json rename to data/models/zelk12_MT2-Gen1-gemma-2-9B.json diff --git a/data/zelk12_MT2-Gen2-gemma-2-9B.json b/data/models/zelk12_MT2-Gen2-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT2-Gen2-gemma-2-9B.json rename to data/models/zelk12_MT2-Gen2-gemma-2-9B.json diff --git a/data/zelk12_MT2-Gen3-gemma-2-9B.json b/data/models/zelk12_MT2-Gen3-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT2-Gen3-gemma-2-9B.json rename to data/models/zelk12_MT2-Gen3-gemma-2-9B.json diff --git a/data/zelk12_MT2-Gen4-gemma-2-9B.json b/data/models/zelk12_MT2-Gen4-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT2-Gen4-gemma-2-9B.json rename to data/models/zelk12_MT2-Gen4-gemma-2-9B.json diff --git a/data/zelk12_MT2-Gen5-gemma-2-9B.json b/data/models/zelk12_MT2-Gen5-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT2-Gen5-gemma-2-9B.json rename to data/models/zelk12_MT2-Gen5-gemma-2-9B.json diff --git a/data/zelk12_MT2-Gen6-gemma-2-9B.json b/data/models/zelk12_MT2-Gen6-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT2-Gen6-gemma-2-9B.json rename to data/models/zelk12_MT2-Gen6-gemma-2-9B.json diff --git a/data/zelk12_MT2-Gen7-gemma-2-9B.json b/data/models/zelk12_MT2-Gen7-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT2-Gen7-gemma-2-9B.json rename to data/models/zelk12_MT2-Gen7-gemma-2-9B.json diff --git a/data/zelk12_MT2-Max-Merge_02012025163610-gemma-2-9B.json b/data/models/zelk12_MT2-Max-Merge_02012025163610-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT2-Max-Merge_02012025163610-gemma-2-9B.json rename to data/models/zelk12_MT2-Max-Merge_02012025163610-gemma-2-9B.json diff --git a/data/zelk12_MT2-gemma-2-9B.json b/data/models/zelk12_MT2-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT2-gemma-2-9B.json rename to data/models/zelk12_MT2-gemma-2-9B.json diff --git a/data/zelk12_MT3-Gen1-gemma-2-9B.json b/data/models/zelk12_MT3-Gen1-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT3-Gen1-gemma-2-9B.json rename to data/models/zelk12_MT3-Gen1-gemma-2-9B.json diff --git a/data/zelk12_MT3-Gen2-gemma-2-9B.json b/data/models/zelk12_MT3-Gen2-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT3-Gen2-gemma-2-9B.json rename to data/models/zelk12_MT3-Gen2-gemma-2-9B.json diff --git a/data/zelk12_MT3-Gen3-gemma-2-9B.json b/data/models/zelk12_MT3-Gen3-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT3-Gen3-gemma-2-9B.json rename to data/models/zelk12_MT3-Gen3-gemma-2-9B.json diff --git a/data/zelk12_MT3-Gen4-gemma-2-9B.json b/data/models/zelk12_MT3-Gen4-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT3-Gen4-gemma-2-9B.json rename to data/models/zelk12_MT3-Gen4-gemma-2-9B.json diff --git a/data/zelk12_MT3-Gen5-gemma-2-9B.json b/data/models/zelk12_MT3-Gen5-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT3-Gen5-gemma-2-9B.json rename to data/models/zelk12_MT3-Gen5-gemma-2-9B.json diff --git a/data/zelk12_MT3-Gen5-gemma-2-9B_v1.json b/data/models/zelk12_MT3-Gen5-gemma-2-9B_v1.json similarity index 100% rename from data/zelk12_MT3-Gen5-gemma-2-9B_v1.json rename to data/models/zelk12_MT3-Gen5-gemma-2-9B_v1.json diff --git a/data/zelk12_MT3-Gen6-gemma-2-9B.json b/data/models/zelk12_MT3-Gen6-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT3-Gen6-gemma-2-9B.json rename to data/models/zelk12_MT3-Gen6-gemma-2-9B.json diff --git a/data/zelk12_MT3-Max-Merge_02012025163610-gemma-2-9B.json b/data/models/zelk12_MT3-Max-Merge_02012025163610-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT3-Max-Merge_02012025163610-gemma-2-9B.json rename to data/models/zelk12_MT3-Max-Merge_02012025163610-gemma-2-9B.json diff --git a/data/zelk12_MT3-gemma-2-9B.json b/data/models/zelk12_MT3-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT3-gemma-2-9B.json rename to data/models/zelk12_MT3-gemma-2-9B.json diff --git a/data/zelk12_MT4-Gen1-gemma-2-9B.json b/data/models/zelk12_MT4-Gen1-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT4-Gen1-gemma-2-9B.json rename to data/models/zelk12_MT4-Gen1-gemma-2-9B.json diff --git a/data/zelk12_MT4-Gen2-gemma-2-9B.json b/data/models/zelk12_MT4-Gen2-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT4-Gen2-gemma-2-9B.json rename to data/models/zelk12_MT4-Gen2-gemma-2-9B.json diff --git a/data/zelk12_MT4-Gen3-gemma-2-9B.json b/data/models/zelk12_MT4-Gen3-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT4-Gen3-gemma-2-9B.json rename to data/models/zelk12_MT4-Gen3-gemma-2-9B.json diff --git a/data/zelk12_MT4-Gen4-gemma-2-9B.json b/data/models/zelk12_MT4-Gen4-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT4-Gen4-gemma-2-9B.json rename to data/models/zelk12_MT4-Gen4-gemma-2-9B.json diff --git a/data/zelk12_MT4-Gen5-gemma-2-9B.json b/data/models/zelk12_MT4-Gen5-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT4-Gen5-gemma-2-9B.json rename to data/models/zelk12_MT4-Gen5-gemma-2-9B.json diff --git a/data/zelk12_MT4-Max-Merge_02012025163610-gemma-2-9B.json b/data/models/zelk12_MT4-Max-Merge_02012025163610-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT4-Max-Merge_02012025163610-gemma-2-9B.json rename to data/models/zelk12_MT4-Max-Merge_02012025163610-gemma-2-9B.json diff --git a/data/zelk12_MT4-gemma-2-9B.json b/data/models/zelk12_MT4-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT4-gemma-2-9B.json rename to data/models/zelk12_MT4-gemma-2-9B.json diff --git a/data/zelk12_MT5-Gen1-gemma-2-9B.json b/data/models/zelk12_MT5-Gen1-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT5-Gen1-gemma-2-9B.json rename to data/models/zelk12_MT5-Gen1-gemma-2-9B.json diff --git a/data/zelk12_MT5-Gen2-gemma-2-9B.json b/data/models/zelk12_MT5-Gen2-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT5-Gen2-gemma-2-9B.json rename to data/models/zelk12_MT5-Gen2-gemma-2-9B.json diff --git a/data/zelk12_MT5-Gen3-gemma-2-9B.json b/data/models/zelk12_MT5-Gen3-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT5-Gen3-gemma-2-9B.json rename to data/models/zelk12_MT5-Gen3-gemma-2-9B.json diff --git a/data/zelk12_MT5-Gen4-gemma-2-9B.json b/data/models/zelk12_MT5-Gen4-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT5-Gen4-gemma-2-9B.json rename to data/models/zelk12_MT5-Gen4-gemma-2-9B.json diff --git a/data/zelk12_MT5-Gen5-gemma-2-9B.json b/data/models/zelk12_MT5-Gen5-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT5-Gen5-gemma-2-9B.json rename to data/models/zelk12_MT5-Gen5-gemma-2-9B.json diff --git a/data/zelk12_MT5-Max-Merge_02012025163610-gemma-2-9B.json b/data/models/zelk12_MT5-Max-Merge_02012025163610-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT5-Max-Merge_02012025163610-gemma-2-9B.json rename to data/models/zelk12_MT5-Max-Merge_02012025163610-gemma-2-9B.json diff --git a/data/zelk12_MT5-gemma-2-9B.json b/data/models/zelk12_MT5-gemma-2-9B.json similarity index 100% rename from data/zelk12_MT5-gemma-2-9B.json rename to data/models/zelk12_MT5-gemma-2-9B.json diff --git a/data/zelk12_MTM-Merge-gemma-2-9B.json b/data/models/zelk12_MTM-Merge-gemma-2-9B.json similarity index 100% rename from data/zelk12_MTM-Merge-gemma-2-9B.json rename to data/models/zelk12_MTM-Merge-gemma-2-9B.json diff --git a/data/zelk12_MTMaMe-Merge_02012025163610-gemma-2-9B.json b/data/models/zelk12_MTMaMe-Merge_02012025163610-gemma-2-9B.json similarity index 100% rename from data/zelk12_MTMaMe-Merge_02012025163610-gemma-2-9B.json rename to data/models/zelk12_MTMaMe-Merge_02012025163610-gemma-2-9B.json diff --git a/data/zelk12_Rv0.4DMv1t0.25-gemma-2-9B.json b/data/models/zelk12_Rv0.4DMv1t0.25-gemma-2-9B.json similarity index 100% rename from data/zelk12_Rv0.4DMv1t0.25-gemma-2-9B.json rename to data/models/zelk12_Rv0.4DMv1t0.25-gemma-2-9B.json diff --git a/data/zelk12_Rv0.4DMv1t0.25Tt0.25-gemma-2-9B.json b/data/models/zelk12_Rv0.4DMv1t0.25Tt0.25-gemma-2-9B.json similarity index 100% rename from data/zelk12_Rv0.4DMv1t0.25Tt0.25-gemma-2-9B.json rename to data/models/zelk12_Rv0.4DMv1t0.25Tt0.25-gemma-2-9B.json diff --git a/data/zelk12_Rv0.4MT4g2-gemma-2-9B.json b/data/models/zelk12_Rv0.4MT4g2-gemma-2-9B.json similarity index 100% rename from data/zelk12_Rv0.4MT4g2-gemma-2-9B.json rename to data/models/zelk12_Rv0.4MT4g2-gemma-2-9B.json diff --git a/data/zelk12_T31122024203920-gemma-2-9B.json b/data/models/zelk12_T31122024203920-gemma-2-9B.json similarity index 100% rename from data/zelk12_T31122024203920-gemma-2-9B.json rename to data/models/zelk12_T31122024203920-gemma-2-9B.json diff --git a/data/zelk12_Test01012025155054.json b/data/models/zelk12_Test01012025155054.json similarity index 100% rename from data/zelk12_Test01012025155054.json rename to data/models/zelk12_Test01012025155054.json diff --git a/data/zelk12_Test01012025155054t0.5_gemma-2.json b/data/models/zelk12_Test01012025155054t0.5_gemma-2.json similarity index 100% rename from data/zelk12_Test01012025155054t0.5_gemma-2.json rename to data/models/zelk12_Test01012025155054t0.5_gemma-2.json diff --git a/data/zelk12_gemma-2-S2MTM-9B.json b/data/models/zelk12_gemma-2-S2MTM-9B.json similarity index 100% rename from data/zelk12_gemma-2-S2MTM-9B.json rename to data/models/zelk12_gemma-2-S2MTM-9B.json diff --git a/data/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25.json b/data/models/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25.json similarity index 100% rename from data/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25.json rename to data/models/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25.json diff --git a/data/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75.json b/data/models/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75.json similarity index 100% rename from data/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75.json rename to data/models/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75.json diff --git a/data/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1.json b/data/models/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1.json similarity index 100% rename from data/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1.json rename to data/models/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1.json diff --git a/data/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.2.json b/data/models/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.2.json similarity index 100% rename from data/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.2.json rename to data/models/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.2.json diff --git a/data/zelk12_recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1.json b/data/models/zelk12_recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1.json similarity index 100% rename from data/zelk12_recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1.json rename to data/models/zelk12_recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1.json diff --git a/data/zelk12_recoilme-gemma-2-Ifable-9B-v0.1.json b/data/models/zelk12_recoilme-gemma-2-Ifable-9B-v0.1.json similarity index 100% rename from data/zelk12_recoilme-gemma-2-Ifable-9B-v0.1.json rename to data/models/zelk12_recoilme-gemma-2-Ifable-9B-v0.1.json diff --git a/data/zelk12_recoilme-gemma-2-psy10k-mental_healt-9B-v0.1.json b/data/models/zelk12_recoilme-gemma-2-psy10k-mental_healt-9B-v0.1.json similarity index 100% rename from data/zelk12_recoilme-gemma-2-psy10k-mental_healt-9B-v0.1.json rename to data/models/zelk12_recoilme-gemma-2-psy10k-mental_healt-9B-v0.1.json diff --git a/data/zetasepic_Qwen2.5-32B-Instruct-abliterated-v2.json b/data/models/zetasepic_Qwen2.5-32B-Instruct-abliterated-v2.json similarity index 100% rename from data/zetasepic_Qwen2.5-32B-Instruct-abliterated-v2.json rename to data/models/zetasepic_Qwen2.5-32B-Instruct-abliterated-v2.json diff --git a/data/zetasepic_Qwen2.5-72B-Instruct-abliterated.json b/data/models/zetasepic_Qwen2.5-72B-Instruct-abliterated.json similarity index 100% rename from data/zetasepic_Qwen2.5-72B-Instruct-abliterated.json rename to data/models/zetasepic_Qwen2.5-72B-Instruct-abliterated.json diff --git a/data/zhengr_MixTAO-7Bx2-MoE-v8.1.json b/data/models/zhengr_MixTAO-7Bx2-MoE-v8.1.json similarity index 100% rename from data/zhengr_MixTAO-7Bx2-MoE-v8.1.json rename to data/models/zhengr_MixTAO-7Bx2-MoE-v8.1.json diff --git a/data/zhipu-ai_GLM-130B.json b/data/models/zhipu-ai_GLM-130B.json similarity index 100% rename from data/zhipu-ai_GLM-130B.json rename to data/models/zhipu-ai_GLM-130B.json diff --git a/data/zhipu-ai_glm-4.6.json b/data/models/zhipu-ai_glm-4.6.json similarity index 100% rename from data/zhipu-ai_glm-4.6.json rename to data/models/zhipu-ai_glm-4.6.json diff --git a/data/zhipu-ai_glm-4.7.json b/data/models/zhipu-ai_glm-4.7.json similarity index 100% rename from data/zhipu-ai_glm-4.7.json rename to data/models/zhipu-ai_glm-4.7.json diff --git a/data/zhipu-ai_glm-5.json b/data/models/zhipu-ai_glm-5.json similarity index 100% rename from data/zhipu-ai_glm-5.json rename to data/models/zhipu-ai_glm-5.json diff --git a/data/zhipu_GLM_4.6.json b/data/models/zhipu_GLM_4.6.json similarity index 100% rename from data/zhipu_GLM_4.6.json rename to data/models/zhipu_GLM_4.6.json diff --git a/data/zhipu_GLM_4.7.json b/data/models/zhipu_GLM_4.7.json similarity index 100% rename from data/zhipu_GLM_4.7.json rename to data/models/zhipu_GLM_4.7.json