{ "$schema": "http://json-schema.org/draft-07/schema#", "version": "0.0.1", "type": "object", "description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics", "required": [ "schema_version", "evaluation_id", "evaluation_source", "retrieved_timestamp", "source_data", "source_metadata", "model_info", "evaluation_results" ], "properties": { "schema_version": { "type": "string", "description": "Version of the schema used for this evaluation data" }, "evaluation_id": { "type": "string", "description": "Unique identifier for this specific evaluation run. Use org_name/eval_name/retrieved_timestamp format" }, "retrieved_timestamp": { "type": "string", "description": "Timestamp for when this record was created" }, "source_data": { "type": "array", "description": "URLs for the source of the evaluation data", "items": { "type": "string" } }, "evaluation_source": { "type": "object", "description": "Details about evaluation origin. There are options that evaluations come from leaderboards (e.g. Live Code Bench Pro) or evaluation platforms (e.g. lm-eval, inspect ai, HELM...).", "required": [ "evaluation_source_name", "evaluation_source_type" ], "properties": { "evaluation_source_name": { "type": "string", "description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation." }, "evaluation_source_type": { "type": "string", "enum": [ "leaderboard", "evaluation_platform" ], "description": "Type of evaluation source, e.g., leaderboard or evaluation platform" } } }, "source_metadata": { "type": "object", "description": "Metadata about the source of the leaderboard data", "required": [ "source_organization_name", "evaluator_relationship" ], "properties": { "source_organization_name": { "type": "string", "description": "Name of the organization that provides the data" }, "source_organization_url": { "type": "string", "description": "URL for the organization that provides the data" }, "source_organization_logo_url": { "type": "string", "description": "URL for the Logo for the organization that provides the data" }, "evaluator_relationship": { "type": "string", "description": "Relationship between the evaluator and the model", "enum": [ "first_party", "third_party", "collaborative", "other" ] } } }, "model_info": { "type": "object", "description": "Complete model specification including basic information, technical configuration and inference settings", "required": [ "name", "id" ], "properties": { "name": { "type": "string", "description": "Model name provided by evaluation source" }, "id": { "type": "string", "description": "Model name standarized to HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)" }, "developer": { "type": "string", "description": "Name of organization that provides the model (e.g. 'OpenAI')" }, "inference_platform": { "type": "string", "description": "Description of platform used to run the evaluations (e.g. 
local machine, Bedrock)" } } }, "evaluation_results": { "type": "array", "description": "Array of evaluation results", "items": { "type": "object", "required": [ "evaluation_name", "metric_config", "score_details" ], "properties": { "evaluation_name": { "type": "string", "description": "Name of the evaluation" }, "evaluation_timestamp": { "type": "string", "description": "Timestamp for when the evaluations were run" }, "metric_config": { "type": "object", "description": "Details about the metric", "required": [ "lower_is_better" ], "properties": { "evaluation_description": { "type": "string", "description": "Description of the evaluation" }, "lower_is_better": { "type": "boolean", "description": "Whether a lower score is better" }, "score_type": { "type": "string", "description": "Type of score", "enum": [ "binary", "continuous", "levels" ] }, "level_names": { "type": "array", "description": "Names of the score levels", "items": { "type": "string" } }, "level_metadata": { "type": "array", "description": "Additional Description for each Score Level", "items": { "type": "string" } }, "has_unknown_level": { "type": "boolean", "description": "Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown" }, "min_score": { "type": "number", "description": "Minimum possible score for continuous metric" }, "max_score": { "type": "number", "description": "Maximum possible score for continuous metric" } }, "if": { "properties": { "score_type": { "const": "levels" } } }, "then": { "required": [ "level_names", "has_unknown_level" ] }, "else": { "if": { "properties": { "score_type": { "const": "continuous" } } }, "then": { "required": [ "min_score", "max_score" ] } } }, "score_details": { "type": "object", "description": "The score for the evaluation and related details", "required": [ "score" ], "properties": { "score": { "type": "number", "description": "The score for the evaluation" }, "details": { "type": "object", "description": "Any additional details about the score", "additionalProperties": true } } }, "detailed_evaluation_results_url": { "type": "string", "description": "Link to detailed evaluation data" }, "generation_config": { "type": "object", "generation_args": { "type": "object", "description": "Parameters used to generate results - properties may vary by model type", "properties": { "temperature": { "type": [ "null", "number" ], "description": "Sampling temperature" }, "top_p": { "type": [ "null", "number" ], "description": "Nucleus sampling parameter" }, "top_k": { "type": [ "null", "number" ], "description": "Top-k sampling parameter" }, "max_tokens": { "type": "integer", "minimum": 1, "description": "Maximum number of tokens to generate" } }, "additionalProperties": true }, "additional_details": { "type": "string", "description": "Additional details about how the results for this metric were generated." } } } } } } }
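A minimal sketch of how a record could be checked against this schema, assuming the schema is saved as evaluation_schema.json (a hypothetical filename) and validated with the Python jsonschema package; the record below is illustrative, not real evaluation data.

# Validate an example evaluation record against the schema above.
# Assumes: `pip install jsonschema` and the schema stored in evaluation_schema.json.
import json

from jsonschema import Draft7Validator

with open("evaluation_schema.json") as f:
    schema = json.load(f)

# Illustrative record covering all required fields; the continuous score_type
# triggers the schema's conditional requirement for min_score/max_score.
example_record = {
    "schema_version": "0.0.1",
    "evaluation_id": "example-org/example-eval/2024-01-01T00:00:00Z",
    "retrieved_timestamp": "2024-01-01T00:00:00Z",
    "source_data": ["https://example.org/leaderboard"],
    "evaluation_source": {
        "evaluation_source_name": "Example Leaderboard",
        "evaluation_source_type": "leaderboard",
    },
    "source_metadata": {
        "source_organization_name": "Example Org",
        "evaluator_relationship": "third_party",
    },
    "model_info": {
        "name": "Llama-3.1-8B-Instruct",
        "id": "meta-llama/Llama-3.1-8B-Instruct",
    },
    "evaluation_results": [
        {
            "evaluation_name": "example_benchmark",
            "metric_config": {
                "lower_is_better": False,
                "score_type": "continuous",
                "min_score": 0,
                "max_score": 1,
            },
            "score_details": {"score": 0.42},
        }
    ],
}

# Raises jsonschema.ValidationError if the record does not conform.
Draft7Validator(schema).validate(example_record)
print("record is valid")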