Spaces:

juanmackie
/

YourBench

Build error

App Files Files Community

tfrere HF Staff committed on Mar 28, 2025

Commit

83d60af

1 Parent(s): debda0e

update evaluationTask to remove local storage

Browse files

Files changed (29) hide show

backend/data/lighteval_results/lighteval_results.json +0 -30
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-55-33.911206.json +0 -121
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-57-38.809317.json +0 -121
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-59-28.405916.json +0 -121
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-02-34.148676.json +0 -121
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-16-04.060789.json +0 -121
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-18-55.849741.json +0 -121
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-20-35.234042.json +0 -121
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-29-08.177301.json +0 -121
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-31-41.485559.json +0 -121
backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-35-26.288328.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T11-35-01.155436.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-11-27.855994.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-59-46.530720.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-55-05.717421.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-57-18.796730.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-59-16.518904.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-02-14.751585.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-15-49.697950.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-18-46.125749.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-20-17.925045.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-28-55.776035.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-31-25.397360.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-35-22.226092.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-11-45.632754.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-14-14.765643.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-17-34.971563.json +0 -121
backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-28-57.341922.json +0 -121
backend/tasks/evaluationTask.py +46 -18

backend/data/lighteval_results/lighteval_results.json DELETED Viewed

@@ -1,30 +0,0 @@
-[
-  {
-    "model": "Qwen/QwQ-32B",
-    "provider": "sambanova",
-    "accuracy": 1.0,
-    "execution_time": 21.59078598022461,
-    "status": "success"
-  },
-  {
-    "model": "Qwen/Qwen2.5-72B-Instruct",
-    "provider": "sambanova",
-    "accuracy": 1.0,
-    "execution_time": 14.694424152374268,
-    "status": "success"
-  },
-  {
-    "model": "deepseek-ai/DeepSeek-V3-0324",
-    "provider": "novita",
-    "accuracy": 1.0,
-    "execution_time": 24.018408060073853,
-    "status": "success"
-  },
-  {
-    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
-    "provider": "sambanova",
-    "accuracy": 1.0,
-    "execution_time": 16.271580934524536,
-    "status": "success"
-  }
-]

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-55-33.911206.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 186274.866411583,
-    "end_time": 186322.987643416,
-    "total_evaluation_time_secondes": "48.12123183300719",
-    "model_name": "Qwen/Qwen2.5-72B-Instruct",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-57-38.809317.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 186407.701185,
-    "end_time": 186447.883386625,
-    "total_evaluation_time_secondes": "40.18220162499347",
-    "model_name": "Qwen/Qwen2.5-72B-Instruct",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T10-59-28.405916.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 186521.763833833,
-    "end_time": 186557.476439666,
-    "total_evaluation_time_secondes": "35.71260583298863",
-    "model_name": "Qwen/Qwen2.5-72B-Instruct",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-02-34.148676.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 186704.883209333,
-    "end_time": 186743.215716791,
-    "total_evaluation_time_secondes": "38.332507457991596",
-    "model_name": "Qwen/Qwen2.5-72B-Instruct",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-16-04.060789.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 187518.49620975,
-    "end_time": 187553.120908083,
-    "total_evaluation_time_secondes": "34.62469833297655",
-    "model_name": "Qwen/Qwen2.5-72B-Instruct",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-18-55.849741.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 187690.771319041,
-    "end_time": 187724.908132583,
-    "total_evaluation_time_secondes": "34.136813541990705",
-    "model_name": "Qwen/Qwen2.5-72B-Instruct",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-20-35.234042.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 187785.492066916,
-    "end_time": 187824.287589375,
-    "total_evaluation_time_secondes": "38.79552245899686",
-    "model_name": "Qwen/Qwen2.5-72B-Instruct",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-29-08.177301.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 188300.087538958,
-    "end_time": 188337.230208583,
-    "total_evaluation_time_secondes": "37.142669624998234",
-    "model_name": "Qwen/Qwen2.5-72B-Instruct",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_e1d7e6f5-b28f-4966-ba2f-531b1b1e5cb8",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "7e34d82512ce6dfc",
-        "hash_full_prompts": "af7c42c6f40964e1",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "7cdb142c3142312a",
-      "hash_full_prompts": "a2e47b0b68e57792",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-31-41.485559.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 188452.784089458,
-    "end_time": 188490.538178958,
-    "total_evaluation_time_secondes": "37.75408949999837",
-    "model_name": "Qwen/Qwen2.5-72B-Instruct",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/Qwen/Qwen2.5-72B-Instruct/results_2025-03-28T11-35-26.288328.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 15,
-    "job_id": 0,
-    "start_time": 188674.734532375,
-    "end_time": 188715.337919458,
-    "total_evaluation_time_secondes": "40.60338708298514",
-    "model_name": "Qwen/Qwen2.5-72B-Instruct",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 15,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "35f5eef8199d4521",
-        "hash_full_prompts": "5590bc220414fefb",
-        "hash_input_tokens": "58ec870775e406f3",
-        "hash_cont_tokens": "58ec870775e406f3"
-      },
-      "truncated": 0,
-      "non_truncated": 15,
-      "padded": 0,
-      "non_padded": 15,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "bc7dfdffc5e53476",
-      "hash_full_prompts": "712fd00df902d786",
-      "hash_input_tokens": "544d800a25dfd777",
-      "hash_cont_tokens": "544d800a25dfd777"
-    },
-    "truncated": 0,
-    "non_truncated": 15,
-    "padded": 0,
-    "non_padded": 15,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T11-35-01.155436.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 15,
-    "job_id": 0,
-    "start_time": 188674.734510208,
-    "end_time": 188690.205653,
-    "total_evaluation_time_secondes": "15.471142791997408",
-    "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 15,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "35f5eef8199d4521",
-        "hash_full_prompts": "5590bc220414fefb",
-        "hash_input_tokens": "58ec870775e406f3",
-        "hash_cont_tokens": "58ec870775e406f3"
-      },
-      "truncated": 0,
-      "non_truncated": 15,
-      "padded": 0,
-      "non_padded": 15,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "bc7dfdffc5e53476",
-      "hash_full_prompts": "712fd00df902d786",
-      "hash_input_tokens": "544d800a25dfd777",
-      "hash_cont_tokens": "544d800a25dfd777"
-    },
-    "truncated": 0,
-    "non_truncated": 15,
-    "padded": 0,
-    "non_padded": 15,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-11-27.855994.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 15,
-    "job_id": 0,
-    "start_time": 190861.972782125,
-    "end_time": 190876.962226916,
-    "total_evaluation_time_secondes": "14.989444790990092",
-    "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
-      "hf_subset": "multi_hop_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 34,
-      "effective_num_docs": 15,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "97803694d4430d2d",
-        "hash_full_prompts": "3125bcda69618d2b",
-        "hash_input_tokens": "58ec870775e406f3",
-        "hash_cont_tokens": "58ec870775e406f3"
-      },
-      "truncated": 0,
-      "non_truncated": 15,
-      "padded": 0,
-      "non_padded": 15,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "13a4051f728a0e87",
-      "hash_full_prompts": "e18b288370ab6ae2",
-      "hash_input_tokens": "544d800a25dfd777",
-      "hash_cont_tokens": "544d800a25dfd777"
-    },
-    "truncated": 0,
-    "non_truncated": 15,
-    "padded": 0,
-    "non_padded": 15,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/results_2025-03-28T12-59-46.530720.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 30,
-    "job_id": 0,
-    "start_time": 193754.29830825,
-    "end_time": 193775.660671041,
-    "total_evaluation_time_secondes": "21.362362790998304",
-    "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_d0766aeb-d261-4f0f-870c-537432fd8584",
-      "hf_subset": "multi_hop_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 34,
-      "effective_num_docs": 30,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "8deb6ee598efe642",
-        "hash_full_prompts": "ee276216c7fba0dc",
-        "hash_input_tokens": "79ab129e9a18c6d6",
-        "hash_cont_tokens": "79ab129e9a18c6d6"
-      },
-      "truncated": 0,
-      "non_truncated": 30,
-      "padded": 0,
-      "non_padded": 30,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "134194cd9d247350",
-      "hash_full_prompts": "59b03121730720e8",
-      "hash_input_tokens": "05a66e44e190c178",
-      "hash_cont_tokens": "05a66e44e190c178"
-    },
-    "truncated": 0,
-    "non_truncated": 30,
-    "padded": 0,
-    "non_padded": 30,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-55-05.717421.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 186274.866369916,
-    "end_time": 186294.792813083,
-    "total_evaluation_time_secondes": "19.926443167001707",
-    "model_name": "deepseek-ai/DeepSeek-V3-0324",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-57-18.796730.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 186407.701222875,
-    "end_time": 186427.871588083,
-    "total_evaluation_time_secondes": "20.170365208003204",
-    "model_name": "deepseek-ai/DeepSeek-V3-0324",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T10-59-16.518904.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 186521.763754958,
-    "end_time": 186545.585271583,
-    "total_evaluation_time_secondes": "23.821516625001095",
-    "model_name": "deepseek-ai/DeepSeek-V3-0324",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-02-14.751585.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 186704.882684291,
-    "end_time": 186723.820615833,
-    "total_evaluation_time_secondes": "18.937931542022852",
-    "model_name": "deepseek-ai/DeepSeek-V3-0324",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-15-49.697950.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 187518.496174916,
-    "end_time": 187538.752125166,
-    "total_evaluation_time_secondes": "20.255950249993475",
-    "model_name": "deepseek-ai/DeepSeek-V3-0324",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-18-46.125749.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 187690.771119125,
-    "end_time": 187715.172306583,
-    "total_evaluation_time_secondes": "24.40118745798827",
-    "model_name": "deepseek-ai/DeepSeek-V3-0324",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-20-17.925045.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 187785.49207775,
-    "end_time": 187806.982701541,
-    "total_evaluation_time_secondes": "21.4906237910036",
-    "model_name": "deepseek-ai/DeepSeek-V3-0324",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-28-55.776035.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 188300.087685291,
-    "end_time": 188324.829042291,
-    "total_evaluation_time_secondes": "24.7413570000208",
-    "model_name": "deepseek-ai/DeepSeek-V3-0324",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_e1d7e6f5-b28f-4966-ba2f-531b1b1e5cb8",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "7e34d82512ce6dfc",
-        "hash_full_prompts": "af7c42c6f40964e1",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "7cdb142c3142312a",
-      "hash_full_prompts": "a2e47b0b68e57792",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-31-25.397360.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 5,
-    "job_id": 0,
-    "start_time": 188452.784059833,
-    "end_time": 188474.450274291,
-    "total_evaluation_time_secondes": "21.666214458004106",
-    "model_name": "deepseek-ai/DeepSeek-V3-0324",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 5,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "abaa6ef1f9715482",
-        "hash_full_prompts": "0b5eb6607b419659",
-        "hash_input_tokens": "bf9d9e969418cff7",
-        "hash_cont_tokens": "bf9d9e969418cff7"
-      },
-      "truncated": 0,
-      "non_truncated": 5,
-      "padded": 0,
-      "non_padded": 5,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b1bf475c2319e3b2",
-      "hash_full_prompts": "d860f90cd7291b63",
-      "hash_input_tokens": "5882dac673b9f859",
-      "hash_cont_tokens": "5882dac673b9f859"
-    },
-    "truncated": 0,
-    "non_truncated": 5,
-    "padded": 0,
-    "non_padded": 5,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T11-35-22.226092.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 15,
-    "job_id": 0,
-    "start_time": 188674.734458958,
-    "end_time": 188711.276019958,
-    "total_evaluation_time_secondes": "36.54156099999091",
-    "model_name": "deepseek-ai/DeepSeek-V3-0324",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_c7a3635d-7272-47d6-b74c-8dd6184145af",
-      "hf_subset": "single_shot_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 15,
-      "effective_num_docs": 15,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "35f5eef8199d4521",
-        "hash_full_prompts": "5590bc220414fefb",
-        "hash_input_tokens": "58ec870775e406f3",
-        "hash_cont_tokens": "58ec870775e406f3"
-      },
-      "truncated": 0,
-      "non_truncated": 15,
-      "padded": 0,
-      "non_padded": 15,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "bc7dfdffc5e53476",
-      "hash_full_prompts": "712fd00df902d786",
-      "hash_input_tokens": "544d800a25dfd777",
-      "hash_cont_tokens": "544d800a25dfd777"
-    },
-    "truncated": 0,
-    "non_truncated": 15,
-    "padded": 0,
-    "non_padded": 15,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-11-45.632754.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 15,
-    "job_id": 0,
-    "start_time": 190861.972804458,
-    "end_time": 190894.739973125,
-    "total_evaluation_time_secondes": "32.7671686669928",
-    "model_name": "deepseek-ai/DeepSeek-V3-0324",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
-      "hf_subset": "multi_hop_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 34,
-      "effective_num_docs": 15,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "97803694d4430d2d",
-        "hash_full_prompts": "3125bcda69618d2b",
-        "hash_input_tokens": "58ec870775e406f3",
-        "hash_cont_tokens": "58ec870775e406f3"
-      },
-      "truncated": 0,
-      "non_truncated": 15,
-      "padded": 0,
-      "non_padded": 15,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "13a4051f728a0e87",
-      "hash_full_prompts": "e18b288370ab6ae2",
-      "hash_input_tokens": "544d800a25dfd777",
-      "hash_cont_tokens": "544d800a25dfd777"
-    },
-    "truncated": 0,
-    "non_truncated": 15,
-    "padded": 0,
-    "non_padded": 15,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-14-14.765643.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 30,
-    "job_id": 0,
-    "start_time": 190994.241279791,
-    "end_time": 191043.871577458,
-    "total_evaluation_time_secondes": "49.63029766699765",
-    "model_name": "deepseek-ai/DeepSeek-V3-0324",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
-      "hf_subset": "multi_hop_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 34,
-      "effective_num_docs": 30,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "1b5afc5f13827f79",
-        "hash_full_prompts": "cd8c39c007643835",
-        "hash_input_tokens": "79ab129e9a18c6d6",
-        "hash_cont_tokens": "79ab129e9a18c6d6"
-      },
-      "truncated": 0,
-      "non_truncated": 30,
-      "padded": 0,
-      "non_padded": 30,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b18e19e266a5bc51",
-      "hash_full_prompts": "1eaa15cbc4a17d04",
-      "hash_input_tokens": "05a66e44e190c178",
-      "hash_cont_tokens": "05a66e44e190c178"
-    },
-    "truncated": 0,
-    "non_truncated": 30,
-    "padded": 0,
-    "non_padded": 30,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-17-34.971563.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 30,
-    "job_id": 0,
-    "start_time": 191195.945968041,
-    "end_time": 191244.057571,
-    "total_evaluation_time_secondes": "48.111602959019365",
-    "model_name": "deepseek-ai/DeepSeek-V3-0324",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
-      "hf_subset": "multi_hop_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 34,
-      "effective_num_docs": 30,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "1b5afc5f13827f79",
-        "hash_full_prompts": "cd8c39c007643835",
-        "hash_input_tokens": "79ab129e9a18c6d6",
-        "hash_cont_tokens": "79ab129e9a18c6d6"
-      },
-      "truncated": 0,
-      "non_truncated": 30,
-      "padded": 0,
-      "non_padded": 30,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b18e19e266a5bc51",
-      "hash_full_prompts": "1eaa15cbc4a17d04",
-      "hash_input_tokens": "05a66e44e190c178",
-      "hash_cont_tokens": "05a66e44e190c178"
-    },
-    "truncated": 0,
-    "non_truncated": 30,
-    "padded": 0,
-    "non_padded": 30,
-    "num_truncated_few_shots": 0
-  }
-}

backend/data/lighteval_results/results/deepseek-ai/DeepSeek-V3-0324/results_2025-03-28T12-28-57.341922.json DELETED Viewed

@@ -1,121 +0,0 @@
-{
-  "config_general": {
-    "lighteval_sha": "?",
-    "num_fewshot_seeds": 1,
-    "override_batch_size": null,
-    "max_samples": 30,
-    "job_id": 0,
-    "start_time": 191865.098197958,
-    "end_time": 191926.425937958,
-    "total_evaluation_time_secondes": "61.32774000000791",
-    "model_name": "deepseek-ai/DeepSeek-V3-0324",
-    "model_sha": "",
-    "model_dtype": null,
-    "model_size": "",
-    "generation_parameters": {
-      "early_stopping": null,
-      "repetition_penalty": null,
-      "frequency_penalty": null,
-      "length_penalty": null,
-      "presence_penalty": null,
-      "max_new_tokens": null,
-      "min_new_tokens": null,
-      "seed": null,
-      "stop_tokens": null,
-      "temperature": null,
-      "top_k": null,
-      "min_p": null,
-      "top_p": null,
-      "truncate_prompt": null,
-      "response_format": null
-    }
-  },
-  "results": {
-    "custom|yourbench|0": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    },
-    "all": {
-      "accuracy": 1.0,
-      "accuracy_stderr": 0.0
-    }
-  },
-  "versions": {
-    "custom|yourbench|0": 0
-  },
-  "config_tasks": {
-    "custom|yourbench": {
-      "name": "yourbench",
-      "prompt_function": "yourbench_prompt",
-      "hf_repo": "yourbench/yourbench_d61a6289-9f2e-4138-b01a-63c43c5daf0b",
-      "hf_subset": "multi_hop_questions",
-      "metric": [
-        {
-          "metric_name": [
-            "accuracy"
-          ],
-          "higher_is_better": {
-            "accuracy": true
-          },
-          "category": "7",
-          "use_case": "1",
-          "sample_level_fn": "compute",
-          "corpus_level_fn": {
-            "accuracy": "mean"
-          }
-        }
-      ],
-      "hf_revision": null,
-      "hf_filter": null,
-      "hf_avail_splits": [
-        "train"
-      ],
-      "trust_dataset": true,
-      "evaluation_splits": [
-        "train"
-      ],
-      "few_shots_split": null,
-      "few_shots_select": null,
-      "generation_size": 8192,
-      "generation_grammar": null,
-      "stop_sequence": [],
-      "num_samples": null,
-      "suite": [
-        "custom"
-      ],
-      "original_num_docs": 34,
-      "effective_num_docs": 30,
-      "must_remove_duplicate_docs": false,
-      "version": 0
-    }
-  },
-  "summary_tasks": {
-    "custom|yourbench|0": {
-      "hashes": {
-        "hash_examples": "1b5afc5f13827f79",
-        "hash_full_prompts": "cd8c39c007643835",
-        "hash_input_tokens": "79ab129e9a18c6d6",
-        "hash_cont_tokens": "79ab129e9a18c6d6"
-      },
-      "truncated": 0,
-      "non_truncated": 30,
-      "padded": 0,
-      "non_padded": 30,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
-    }
-  },
-  "summary_general": {
-    "hashes": {
-      "hash_examples": "b18e19e266a5bc51",
-      "hash_full_prompts": "1eaa15cbc4a17d04",
-      "hash_input_tokens": "05a66e44e190c178",
-      "hash_cont_tokens": "05a66e44e190c178"
-    },
-    "truncated": 0,
-    "non_truncated": 30,
-    "padded": 0,
-    "non_padded": 30,
-    "num_truncated_few_shots": 0
-  }
-}

backend/tasks/evaluationTask.py CHANGED Viewed

@@ -36,21 +36,17 @@ class EvaluationTask:
     def _save_results_to_hub(self) -> None:
         """
-        Save evaluation results to the dataset on the Hub
         """
         try:
-            # Create results directory if it doesn't exist
-            results_dir = Path("data/lighteval_results")
-            results_dir.mkdir(parents=True, exist_ok=True)
-            # Save results to JSON file
-            results_file = results_dir / "lighteval_results.json"
-            with open(results_file, "w") as f:
-                json.dump(self.results, f, indent=2)
             # Push to Hub
             self.hf_api.upload_file(
-                path_or_fileobj=str(results_file),
                 path_in_repo="lighteval_results.json",
                 repo_id=self.dataset_name,
                 repo_type="dataset",
@@ -58,6 +54,9 @@ class EvaluationTask:
             )
             print(f"[{datetime.now().strftime('%H:%M:%S')}] Results saved to Hub at {self.dataset_name}/lighteval_results.json")
         except Exception as e:
             print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to save results to Hub: {str(e)}")
@@ -78,6 +77,9 @@ yourbench = create_yourbench_task("{dataset_name}", "multi_hop_questions")
 TASKS_TABLE = [yourbench]
 """)
         # LightEval command
         cmd_args = [
             "lighteval",
@@ -88,7 +90,7 @@ TASKS_TABLE = [yourbench]
             "--custom-tasks",
             temp_file_path,
             "--max-samples", "30",
-            "--output-dir", "data/lighteval_results",
             "--no-push-to-hub"
         ]
@@ -106,6 +108,12 @@ TASKS_TABLE = [yourbench]
             except asyncio.TimeoutError:
                 process.kill()
                 print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
                 return {
                     "model": model_name,
                     "provider": provider,
@@ -115,6 +123,12 @@ TASKS_TABLE = [yourbench]
                 }
         except Exception as e:
             print(f"[{datetime.now().strftime('%H:%M:%S')}] Error running evaluation for {model_name}: {str(e)}")
             return {
                 "model": model_name,
                 "provider": provider,
@@ -127,19 +141,16 @@ TASKS_TABLE = [yourbench]
         execution_time = time.time() - start_time
         print(f"[{datetime.now().strftime('%H:%M:%S')}] Finished evaluation for {model_name} in {execution_time:.2f}s")
-        # Clean up
-        os.unlink(temp_file_path)
         try:
             # Get results from the output file
-            results_dir = Path("data/lighteval_results/results") / model_name.replace("/", "/")
             results_file = next(results_dir.glob("results_*.json"))
             with open(results_file) as f:
                 results = json.load(f)
                 accuracy = results["results"]["all"]["accuracy"]
-            return {
                 "model": model_name,
                 "provider": provider,
                 "accuracy": accuracy,
@@ -148,13 +159,20 @@ TASKS_TABLE = [yourbench]
             }
         except Exception as e:
             print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to parse results for {model_name} after {execution_time:.2f}s: {str(e)}")
-            return {
                 "model": model_name,
                 "provider": provider,
                 "accuracy": 0.0,
                 "execution_time": execution_time,
                 "status": "parse_error"
             }
     async def run(self) -> None:
         """
@@ -191,7 +209,17 @@ TASKS_TABLE = [yourbench]
         total_time = time.time() - script_start_time
         print(f"[{datetime.now().strftime('%H:%M:%S')}] All evaluations completed in {total_time:.2f}s")
-        # Save results to Hub
         self._save_results_to_hub()
         # Mark the task as completed

     def _save_results_to_hub(self) -> None:
         """
+        Save evaluation results directly to the dataset on the Hub without persisting locally
         """
         try:
+            # Créer un fichier temporaire pour les résultats
+            with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
+                json.dump(self.results, temp_file, indent=2)
+                temp_file_path = temp_file.name
             # Push to Hub
             self.hf_api.upload_file(
+                path_or_fileobj=temp_file_path,
                 path_in_repo="lighteval_results.json",
                 repo_id=self.dataset_name,
                 repo_type="dataset",
             )
             print(f"[{datetime.now().strftime('%H:%M:%S')}] Results saved to Hub at {self.dataset_name}/lighteval_results.json")
+            # Supprimer le fichier temporaire
+            os.unlink(temp_file_path)
         except Exception as e:
             print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to save results to Hub: {str(e)}")
 TASKS_TABLE = [yourbench]
 """)
+        # Create temporary output directory
+        temp_output_dir = tempfile.mkdtemp(prefix="lighteval_")
         # LightEval command
         cmd_args = [
             "lighteval",
             "--custom-tasks",
             temp_file_path,
             "--max-samples", "30",
+            "--output-dir", temp_output_dir,
             "--no-push-to-hub"
         ]
             except asyncio.TimeoutError:
                 process.kill()
                 print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
+                # Clean up temporary files and directories
+                os.unlink(temp_file_path)
+                import shutil
+                shutil.rmtree(temp_output_dir, ignore_errors=True)
                 return {
                     "model": model_name,
                     "provider": provider,
                 }
         except Exception as e:
             print(f"[{datetime.now().strftime('%H:%M:%S')}] Error running evaluation for {model_name}: {str(e)}")
+            # Clean up temporary files and directories
+            os.unlink(temp_file_path)
+            import shutil
+            shutil.rmtree(temp_output_dir, ignore_errors=True)
             return {
                 "model": model_name,
                 "provider": provider,
         execution_time = time.time() - start_time
         print(f"[{datetime.now().strftime('%H:%M:%S')}] Finished evaluation for {model_name} in {execution_time:.2f}s")
         try:
             # Get results from the output file
+            results_dir = Path(temp_output_dir) / "results" / model_name.replace("/", "/")
             results_file = next(results_dir.glob("results_*.json"))
             with open(results_file) as f:
                 results = json.load(f)
                 accuracy = results["results"]["all"]["accuracy"]
+            result_data = {
                 "model": model_name,
                 "provider": provider,
                 "accuracy": accuracy,
             }
         except Exception as e:
             print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to parse results for {model_name} after {execution_time:.2f}s: {str(e)}")
+            result_data = {
                 "model": model_name,
                 "provider": provider,
                 "accuracy": 0.0,
                 "execution_time": execution_time,
                 "status": "parse_error"
             }
+        # Clean up temporary files and directories
+        os.unlink(temp_file_path)
+        import shutil
+        shutil.rmtree(temp_output_dir, ignore_errors=True)
+        return result_data
     async def run(self) -> None:
         """
         total_time = time.time() - script_start_time
         print(f"[{datetime.now().strftime('%H:%M:%S')}] All evaluations completed in {total_time:.2f}s")
+        # Cleanup intermediate results if they exist
+        if os.path.exists("data/lighteval_results"):
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] Cleaning up intermediate results")
+            try:
+                # Recursively delete intermediate results
+                import shutil
+                shutil.rmtree("data/lighteval_results", ignore_errors=True)
+            except Exception as e:
+                print(f"[{datetime.now().strftime('%H:%M:%S')}] Warning: Failed to clean up intermediate results: {str(e)}")
+        # Save final results to Hub (only once)
         self._save_results_to_hub()
         # Mark the task as completed