[ { "id": "eee_eval:source_metadata.evaluator_relationship", "source": "eee_eval", "section": "source_metadata", "field": "evaluator_relationship", "schemaPath": "source_metadata.evaluator_relationship", "fullPath": "eee_eval.source_metadata.evaluator_relationship", "type": "string", "description": "Relationship between the evaluator and the model developer (e.g., first-party, third-party, independent).", "required": "required" }, { "id": "eee_eval:source_metadata.source_organization_name", "source": "eee_eval", "section": "source_metadata", "field": "source_organization_name", "schemaPath": "source_metadata.source_organization_name", "fullPath": "eee_eval.source_metadata.source_organization_name", "type": "string", "description": "Name of the organization that produced or published the evaluation results.", "required": "required" }, { "id": "eee_eval:source_metadata.source_url", "source": "eee_eval", "section": "source_metadata", "field": "source_url", "schemaPath": "source_metadata.source_url", "fullPath": "eee_eval.source_metadata.source_url", "type": "string", "description": "URL pointing to the original source of the evaluation results.", "required": "optional" }, { "id": "eee_eval:source_metadata.publication_date", "source": "eee_eval", "section": "source_metadata", "field": "publication_date", "schemaPath": "source_metadata.publication_date", "fullPath": "eee_eval.source_metadata.publication_date", "type": "string", "description": "Date when the evaluation results were published or made publicly available.", "required": "optional" }, { "id": "eee_eval:retrieved_timestamp", "source": "eee_eval", "section": "root", "field": "retrieved_timestamp", "schemaPath": "retrieved_timestamp", "fullPath": "eee_eval.retrieved_timestamp", "type": "string", "description": "ISO 8601 timestamp indicating when the evaluation data was retrieved or ingested.", "required": "required" }, { "id": "eee_eval:eval_library.name", "source": "eee_eval", "section": "eval_library", "field": "name", "schemaPath": "eval_library.name", "fullPath": "eee_eval.eval_library.name", "type": "string", "description": "Name of the evaluation library or harness used to run the evaluation (e.g., lm-evaluation-harness, HELM).", "required": "required" }, { "id": "eee_eval:eval_library.version", "source": "eee_eval", "section": "eval_library", "field": "version", "schemaPath": "eval_library.version", "fullPath": "eee_eval.eval_library.version", "type": "string", "description": "Version string of the evaluation library used, enabling reproducibility checks.", "required": "required" }, { "id": "eee_eval:eval_library.url", "source": "eee_eval", "section": "eval_library", "field": "url", "schemaPath": "eval_library.url", "fullPath": "eee_eval.eval_library.url", "type": "string", "description": "Repository or documentation URL for the evaluation library.", "required": "optional" }, { "id": "eee_eval:model_info.model_id", "source": "eee_eval", "section": "model_info", "field": "model_id", "schemaPath": "model_info.model_id", "fullPath": "eee_eval.model_info.model_id", "type": "string", "description": "Unique identifier for the model being evaluated (e.g., HuggingFace model ID).", "required": "required" }, { "id": "eee_eval:model_info.model_revision", "source": "eee_eval", "section": "model_info", "field": "model_revision", "schemaPath": "model_info.model_revision", "fullPath": "eee_eval.model_info.model_revision", "type": "string", "description": "Git revision or checkpoint hash of the model weights used during evaluation.", "required": "optional" }, { "id": "eee_eval:model_info.model_type", "source": "eee_eval", "section": "model_info", "field": "model_type", "schemaPath": "model_info.model_type", "fullPath": "eee_eval.model_info.model_type", "type": "string", "description": "Type or architecture category of the model (e.g., decoder-only, encoder-decoder).", "required": "optional" }, { "id": "eee_eval:evaluation_results.generation_config", "source": "eee_eval", "section": "evaluation_results", "field": "generation_config", "schemaPath": "evaluation_results.generation_config", "fullPath": "eee_eval.evaluation_results.generation_config", "type": "object", "description": "Generation configuration used during evaluation, including temperature, top-p, max tokens, and other sampling parameters.", "required": "required" }, { "id": "eee_eval:evaluation_results.scores", "source": "eee_eval", "section": "evaluation_results", "field": "scores", "schemaPath": "evaluation_results.scores", "fullPath": "eee_eval.evaluation_results.scores", "type": "object", "description": "Aggregate scores across benchmarks, keyed by benchmark name.", "required": "required" }, { "id": "eee_eval:evaluation_results.num_few_shot", "source": "eee_eval", "section": "evaluation_results", "field": "num_few_shot", "schemaPath": "evaluation_results.num_few_shot", "fullPath": "eee_eval.evaluation_results.num_few_shot", "type": "integer", "description": "Number of few-shot examples provided in the prompt during evaluation.", "required": "optional" }, { "id": "eee_eval:detailed_evaluation_results.file_path", "source": "eee_eval", "section": "detailed_evaluation_results", "field": "file_path", "schemaPath": "detailed_evaluation_results.file_path", "fullPath": "eee_eval.detailed_evaluation_results.file_path", "type": "string", "description": "Path or URL to files containing per-sample evaluation results for detailed analysis.", "required": "optional" }, { "id": "eee_eval:detailed_evaluation_results.format", "source": "eee_eval", "section": "detailed_evaluation_results", "field": "format", "schemaPath": "detailed_evaluation_results.format", "fullPath": "eee_eval.detailed_evaluation_results.format", "type": "string", "description": "File format of the detailed evaluation results (e.g., jsonl, parquet, csv).", "required": "optional" }, { "id": "eee_eval:hardware_info.gpu_type", "source": "eee_eval", "section": "hardware_info", "field": "gpu_type", "schemaPath": "hardware_info.gpu_type", "fullPath": "eee_eval.hardware_info.gpu_type", "type": "string", "description": "Type and model of GPU hardware used during evaluation.", "required": "optional" }, { "id": "eee_eval:hardware_info.num_gpus", "source": "eee_eval", "section": "hardware_info", "field": "num_gpus", "schemaPath": "hardware_info.num_gpus", "fullPath": "eee_eval.hardware_info.num_gpus", "type": "integer", "description": "Number of GPUs used during evaluation.", "required": "optional" }, { "id": "autobenchmarkcard:benchmark_details.overview", "source": "autobenchmarkcard", "section": "benchmark_details", "field": "overview", "schemaPath": "benchmark_details.overview", "fullPath": "autobenchmarkcard.benchmark_details.overview", "type": "string", "description": "High-level summary of the benchmark, its purpose, and the capabilities it is designed to measure.", "required": "required" }, { "id": "autobenchmarkcard:benchmark_details.name", "source": "autobenchmarkcard", "section": "benchmark_details", "field": "name", "schemaPath": "benchmark_details.name", "fullPath": "autobenchmarkcard.benchmark_details.name", "type": "string", "description": "Official name of the benchmark.", "required": "required" }, { "id": "autobenchmarkcard:benchmark_details.version", "source": "autobenchmarkcard", "section": "benchmark_details", "field": "version", "schemaPath": "benchmark_details.version", "fullPath": "autobenchmarkcard.benchmark_details.version", "type": "string", "description": "Version of the benchmark dataset or task specification.", "required": "required" }, { "id": "autobenchmarkcard:benchmark_details.release_date", "source": "autobenchmarkcard", "section": "benchmark_details", "field": "release_date", "schemaPath": "benchmark_details.release_date", "fullPath": "autobenchmarkcard.benchmark_details.release_date", "type": "string", "description": "Date the benchmark was publicly released.", "required": "optional" }, { "id": "autobenchmarkcard:benchmark_details.citation", "source": "autobenchmarkcard", "section": "benchmark_details", "field": "citation", "schemaPath": "benchmark_details.citation", "fullPath": "autobenchmarkcard.benchmark_details.citation", "type": "string", "description": "BibTeX or APA citation for the benchmark paper or dataset.", "required": "optional" }, { "id": "autobenchmarkcard:purpose_and_intended_users.intended_use", "source": "autobenchmarkcard", "section": "purpose_and_intended_users", "field": "intended_use", "schemaPath": "purpose_and_intended_users.intended_use", "fullPath": "autobenchmarkcard.purpose_and_intended_users.intended_use", "type": "string", "description": "Description of the intended use cases and audiences for this benchmark.", "required": "required" }, { "id": "autobenchmarkcard:purpose_and_intended_users.limitations", "source": "autobenchmarkcard", "section": "purpose_and_intended_users", "field": "limitations", "schemaPath": "purpose_and_intended_users.limitations", "fullPath": "autobenchmarkcard.purpose_and_intended_users.limitations", "type": "string", "description": "Known limitations of the benchmark, including scope restrictions, population coverage gaps, or validity concerns.", "required": "required" }, { "id": "autobenchmarkcard:purpose_and_intended_users.out_of_scope", "source": "autobenchmarkcard", "section": "purpose_and_intended_users", "field": "out_of_scope", "schemaPath": "purpose_and_intended_users.out_of_scope", "fullPath": "autobenchmarkcard.purpose_and_intended_users.out_of_scope", "type": "string", "description": "Explicit description of use cases or capabilities the benchmark is not designed to evaluate.", "required": "optional" }, { "id": "autobenchmarkcard:methodology.metrics", "source": "autobenchmarkcard", "section": "methodology", "field": "metrics", "schemaPath": "methodology.metrics", "fullPath": "autobenchmarkcard.methodology.metrics", "type": "array", "description": "List of evaluation metrics used (e.g., accuracy, F1, BLEU), including their definitions and how they are computed.", "required": "required" }, { "id": "autobenchmarkcard:methodology.validation", "source": "autobenchmarkcard", "section": "methodology", "field": "validation", "schemaPath": "methodology.validation", "fullPath": "autobenchmarkcard.methodology.validation", "type": "string", "description": "Description of validation procedures used to ensure benchmark quality, including human review, pilot studies, or inter-annotator agreement.", "required": "required" }, { "id": "autobenchmarkcard:methodology.interpretation", "source": "autobenchmarkcard", "section": "methodology", "field": "interpretation", "schemaPath": "methodology.interpretation", "fullPath": "autobenchmarkcard.methodology.interpretation", "type": "string", "description": "Guidance on how to interpret benchmark scores, including what constitutes a meaningful difference and known confounds.", "required": "required" }, { "id": "autobenchmarkcard:methodology.data_collection", "source": "autobenchmarkcard", "section": "methodology", "field": "data_collection", "schemaPath": "methodology.data_collection", "fullPath": "autobenchmarkcard.methodology.data_collection", "type": "string", "description": "Description of how benchmark data was collected, curated, or generated.", "required": "optional" }, { "id": "autobenchmarkcard:methodology.prompt_format", "source": "autobenchmarkcard", "section": "methodology", "field": "prompt_format", "schemaPath": "methodology.prompt_format", "fullPath": "autobenchmarkcard.methodology.prompt_format", "type": "string", "description": "Specification of the prompt template or format used when querying models.", "required": "optional" }, { "id": "autobenchmarkcard:ethical_and_legal_considerations.compliance_with_regulations", "source": "autobenchmarkcard", "section": "ethical_and_legal_considerations", "field": "compliance_with_regulations", "schemaPath": "ethical_and_legal_considerations.compliance_with_regulations", "fullPath": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations", "type": "string", "description": "Statement on compliance with relevant regulations or legal frameworks (e.g., GDPR, EU AI Act, NIST RMF).", "required": "required" }, { "id": "autobenchmarkcard:ethical_and_legal_considerations.data_privacy", "source": "autobenchmarkcard", "section": "ethical_and_legal_considerations", "field": "data_privacy", "schemaPath": "ethical_and_legal_considerations.data_privacy", "fullPath": "autobenchmarkcard.ethical_and_legal_considerations.data_privacy", "type": "string", "description": "Description of how personal data or sensitive information is handled in the benchmark.", "required": "optional" }, { "id": "autobenchmarkcard:ethical_and_legal_considerations.consent", "source": "autobenchmarkcard", "section": "ethical_and_legal_considerations", "field": "consent", "schemaPath": "ethical_and_legal_considerations.consent", "fullPath": "autobenchmarkcard.ethical_and_legal_considerations.consent", "type": "string", "description": "Information about consent obtained from data subjects or annotators.", "required": "optional" }, { "id": "autobenchmarkcard:possible_risks.category", "source": "autobenchmarkcard", "section": "possible_risks", "field": "category", "schemaPath": "possible_risks.category", "fullPath": "autobenchmarkcard.possible_risks.category", "type": "array", "description": "Categorized list of potential risks associated with misuse or misinterpretation of benchmark results (e.g., gaming, overfitting, contamination).", "required": "required" }, { "id": "autobenchmarkcard:possible_risks.mitigation", "source": "autobenchmarkcard", "section": "possible_risks", "field": "mitigation", "schemaPath": "possible_risks.mitigation", "fullPath": "autobenchmarkcard.possible_risks.mitigation", "type": "string", "description": "Recommended mitigations or safeguards to reduce identified risks.", "required": "optional" }, { "id": "autobenchmarkcard:dataset_details.size", "source": "autobenchmarkcard", "section": "dataset_details", "field": "size", "schemaPath": "dataset_details.size", "fullPath": "autobenchmarkcard.dataset_details.size", "type": "integer", "description": "Total number of examples or items in the benchmark dataset.", "required": "optional" }, { "id": "autobenchmarkcard:dataset_details.languages", "source": "autobenchmarkcard", "section": "dataset_details", "field": "languages", "schemaPath": "dataset_details.languages", "fullPath": "autobenchmarkcard.dataset_details.languages", "type": "array", "description": "Languages represented in the benchmark dataset (ISO 639-1 codes).", "required": "optional" }, { "id": "autobenchmarkcard:dataset_details.domains", "source": "autobenchmarkcard", "section": "dataset_details", "field": "domains", "schemaPath": "dataset_details.domains", "fullPath": "autobenchmarkcard.dataset_details.domains", "type": "array", "description": "Subject domains covered by the benchmark (e.g., medicine, law, mathematics, code).", "required": "optional" }, { "id": "autobenchmarkcard:dataset_details.license", "source": "autobenchmarkcard", "section": "dataset_details", "field": "license", "schemaPath": "dataset_details.license", "fullPath": "autobenchmarkcard.dataset_details.license", "type": "string", "description": "License under which the benchmark dataset is distributed.", "required": "optional" }, { "id": "autobenchmarkcard:leaderboard_info.url", "source": "autobenchmarkcard", "section": "leaderboard_info", "field": "url", "schemaPath": "leaderboard_info.url", "fullPath": "autobenchmarkcard.leaderboard_info.url", "type": "string", "description": "URL of the official leaderboard or results page for this benchmark.", "required": "optional" }, { "id": "autobenchmarkcard:leaderboard_info.submission_requirements", "source": "autobenchmarkcard", "section": "leaderboard_info", "field": "submission_requirements", "schemaPath": "leaderboard_info.submission_requirements", "fullPath": "autobenchmarkcard.leaderboard_info.submission_requirements", "type": "string", "description": "Requirements for submitting model results to the benchmark leaderboard.", "required": "optional" }, { "id": "eee_instance_level_eval:instance_id", "source": "eee_instance_level_eval", "section": "root", "field": "instance_id", "schemaPath": "instance_id", "fullPath": "eee_instance_level_eval.instance_id", "type": "string", "description": "Unique identifier for a single evaluation instance or example.", "required": "required" }, { "id": "eee_instance_level_eval:model_output", "source": "eee_instance_level_eval", "section": "root", "field": "model_output", "schemaPath": "model_output", "fullPath": "eee_instance_level_eval.model_output", "type": "string", "description": "Raw text output generated by the model for this instance.", "required": "required" }, { "id": "eee_instance_level_eval:ground_truth", "source": "eee_instance_level_eval", "section": "root", "field": "ground_truth", "schemaPath": "ground_truth", "fullPath": "eee_instance_level_eval.ground_truth", "type": "string", "description": "Reference answer or ground truth label for this instance.", "required": "optional" }, { "id": "eee_instance_level_eval:score", "source": "eee_instance_level_eval", "section": "root", "field": "score", "schemaPath": "score", "fullPath": "eee_instance_level_eval.score", "type": "number", "description": "Numeric score assigned to this instance by the evaluation metric.", "required": "required" }, { "id": "eee_instance_level_eval:prompt", "source": "eee_instance_level_eval", "section": "root", "field": "prompt", "schemaPath": "prompt", "fullPath": "eee_instance_level_eval.prompt", "type": "string", "description": "Full prompt text as presented to the model for this instance.", "required": "optional" }, { "id": "eee_instance_level_eval:task_name", "source": "eee_instance_level_eval", "section": "root", "field": "task_name", "schemaPath": "task_name", "fullPath": "eee_instance_level_eval.task_name", "type": "string", "description": "Name of the task or benchmark this instance belongs to.", "required": "required" }, { "id": "eee_instance_level_eval:metadata.difficulty", "source": "eee_instance_level_eval", "section": "metadata", "field": "difficulty", "schemaPath": "metadata.difficulty", "fullPath": "eee_instance_level_eval.metadata.difficulty", "type": "string", "description": "Difficulty level or category of this instance (e.g., easy, medium, hard).", "required": "optional" }, { "id": "eee_instance_level_eval:metadata.subject", "source": "eee_instance_level_eval", "section": "metadata", "field": "subject", "schemaPath": "metadata.subject", "fullPath": "eee_instance_level_eval.metadata.subject", "type": "string", "description": "Subject or topic area of this instance.", "required": "optional" }, { "id": "eee_instance_level_eval:metadata.source_dataset", "source": "eee_instance_level_eval", "section": "metadata", "field": "source_dataset", "schemaPath": "metadata.source_dataset", "fullPath": "eee_instance_level_eval.metadata.source_dataset", "type": "string", "description": "Original dataset this instance was sourced from.", "required": "optional" } ]