[
{
"id": "eee_eval:source_metadata.evaluator_relationship",
"source": "eee_eval",
"section": "source_metadata",
"field": "evaluator_relationship",
"schemaPath": "source_metadata.evaluator_relationship",
"fullPath": "eee_eval.source_metadata.evaluator_relationship",
"type": "string",
"description": "Relationship between the evaluator and the model developer (e.g., first-party, third-party, independent).",
"required": "required"
},
{
"id": "eee_eval:source_metadata.source_organization_name",
"source": "eee_eval",
"section": "source_metadata",
"field": "source_organization_name",
"schemaPath": "source_metadata.source_organization_name",
"fullPath": "eee_eval.source_metadata.source_organization_name",
"type": "string",
"description": "Name of the organization that produced or published the evaluation results.",
"required": "required"
},
{
"id": "eee_eval:source_metadata.source_url",
"source": "eee_eval",
"section": "source_metadata",
"field": "source_url",
"schemaPath": "source_metadata.source_url",
"fullPath": "eee_eval.source_metadata.source_url",
"type": "string",
"description": "URL pointing to the original source of the evaluation results.",
"required": "optional"
},
{
"id": "eee_eval:source_metadata.publication_date",
"source": "eee_eval",
"section": "source_metadata",
"field": "publication_date",
"schemaPath": "source_metadata.publication_date",
"fullPath": "eee_eval.source_metadata.publication_date",
"type": "string",
"description": "Date when the evaluation results were published or made publicly available.",
"required": "optional"
},
{
"id": "eee_eval:retrieved_timestamp",
"source": "eee_eval",
"section": "root",
"field": "retrieved_timestamp",
"schemaPath": "retrieved_timestamp",
"fullPath": "eee_eval.retrieved_timestamp",
"type": "string",
"description": "ISO 8601 timestamp indicating when the evaluation data was retrieved or ingested.",
"required": "required"
},
{
"id": "eee_eval:eval_library.name",
"source": "eee_eval",
"section": "eval_library",
"field": "name",
"schemaPath": "eval_library.name",
"fullPath": "eee_eval.eval_library.name",
"type": "string",
"description": "Name of the evaluation library or harness used to run the evaluation (e.g., lm-evaluation-harness, HELM).",
"required": "required"
},
{
"id": "eee_eval:eval_library.version",
"source": "eee_eval",
"section": "eval_library",
"field": "version",
"schemaPath": "eval_library.version",
"fullPath": "eee_eval.eval_library.version",
"type": "string",
"description": "Version string of the evaluation library used, enabling reproducibility checks.",
"required": "required"
},
{
"id": "eee_eval:eval_library.url",
"source": "eee_eval",
"section": "eval_library",
"field": "url",
"schemaPath": "eval_library.url",
"fullPath": "eee_eval.eval_library.url",
"type": "string",
"description": "Repository or documentation URL for the evaluation library.",
"required": "optional"
},
{
"id": "eee_eval:model_info.model_id",
"source": "eee_eval",
"section": "model_info",
"field": "model_id",
"schemaPath": "model_info.model_id",
"fullPath": "eee_eval.model_info.model_id",
"type": "string",
"description": "Unique identifier for the model being evaluated (e.g., Hugging Face model ID).",
"required": "required"
},
{
"id": "eee_eval:model_info.model_revision",
"source": "eee_eval",
"section": "model_info",
"field": "model_revision",
"schemaPath": "model_info.model_revision",
"fullPath": "eee_eval.model_info.model_revision",
"type": "string",
"description": "Git revision or checkpoint hash of the model weights used during evaluation.",
"required": "optional"
},
{
"id": "eee_eval:model_info.model_type",
"source": "eee_eval",
"section": "model_info",
"field": "model_type",
"schemaPath": "model_info.model_type",
"fullPath": "eee_eval.model_info.model_type",
"type": "string",
"description": "Type or architecture category of the model (e.g., decoder-only, encoder-decoder).",
"required": "optional"
},
{
"id": "eee_eval:evaluation_results.generation_config",
"source": "eee_eval",
"section": "evaluation_results",
"field": "generation_config",
"schemaPath": "evaluation_results.generation_config",
"fullPath": "eee_eval.evaluation_results.generation_config",
"type": "object",
"description": "Generation configuration used during evaluation, including temperature, top-p, max tokens, and other sampling parameters.",
"required": "required"
},
{
"id": "eee_eval:evaluation_results.scores",
"source": "eee_eval",
"section": "evaluation_results",
"field": "scores",
"schemaPath": "evaluation_results.scores",
"fullPath": "eee_eval.evaluation_results.scores",
"type": "object",
"description": "Aggregate scores across benchmarks, keyed by benchmark name.",
"required": "required"
},
{
"id": "eee_eval:evaluation_results.num_few_shot",
"source": "eee_eval",
"section": "evaluation_results",
"field": "num_few_shot",
"schemaPath": "evaluation_results.num_few_shot",
"fullPath": "eee_eval.evaluation_results.num_few_shot",
"type": "integer",
"description": "Number of few-shot examples provided in the prompt during evaluation.",
"required": "optional"
},
{
"id": "eee_eval:detailed_evaluation_results.file_path",
"source": "eee_eval",
"section": "detailed_evaluation_results",
"field": "file_path",
"schemaPath": "detailed_evaluation_results.file_path",
"fullPath": "eee_eval.detailed_evaluation_results.file_path",
"type": "string",
"description": "Path or URL to the file containing per-sample evaluation results for detailed analysis.",
"required": "optional"
},
{
"id": "eee_eval:detailed_evaluation_results.format",
"source": "eee_eval",
"section": "detailed_evaluation_results",
"field": "format",
"schemaPath": "detailed_evaluation_results.format",
"fullPath": "eee_eval.detailed_evaluation_results.format",
"type": "string",
"description": "File format of the detailed evaluation results (e.g., jsonl, parquet, csv).",
"required": "optional"
},
{
"id": "eee_eval:hardware_info.gpu_type",
"source": "eee_eval",
"section": "hardware_info",
"field": "gpu_type",
"schemaPath": "hardware_info.gpu_type",
"fullPath": "eee_eval.hardware_info.gpu_type",
"type": "string",
"description": "Type and model of GPU hardware used during evaluation.",
"required": "optional"
},
{
"id": "eee_eval:hardware_info.num_gpus",
"source": "eee_eval",
"section": "hardware_info",
"field": "num_gpus",
"schemaPath": "hardware_info.num_gpus",
"fullPath": "eee_eval.hardware_info.num_gpus",
"type": "integer",
"description": "Number of GPUs used during evaluation.",
"required": "optional"
},
{
"id": "autobenchmarkcard:benchmark_details.overview",
"source": "autobenchmarkcard",
"section": "benchmark_details",
"field": "overview",
"schemaPath": "benchmark_details.overview",
"fullPath": "autobenchmarkcard.benchmark_details.overview",
"type": "string",
"description": "High-level summary of the benchmark, its purpose, and the capabilities it is designed to measure.",
"required": "required"
},
{
"id": "autobenchmarkcard:benchmark_details.name",
"source": "autobenchmarkcard",
"section": "benchmark_details",
"field": "name",
"schemaPath": "benchmark_details.name",
"fullPath": "autobenchmarkcard.benchmark_details.name",
"type": "string",
"description": "Official name of the benchmark.",
"required": "required"
},
{
"id": "autobenchmarkcard:benchmark_details.version",
"source": "autobenchmarkcard",
"section": "benchmark_details",
"field": "version",
"schemaPath": "benchmark_details.version",
"fullPath": "autobenchmarkcard.benchmark_details.version",
"type": "string",
"description": "Version of the benchmark dataset or task specification.",
"required": "required"
},
{
"id": "autobenchmarkcard:benchmark_details.release_date",
"source": "autobenchmarkcard",
"section": "benchmark_details",
"field": "release_date",
"schemaPath": "benchmark_details.release_date",
"fullPath": "autobenchmarkcard.benchmark_details.release_date",
"type": "string",
"description": "Date the benchmark was publicly released.",
"required": "optional"
},
{
"id": "autobenchmarkcard:benchmark_details.citation",
"source": "autobenchmarkcard",
"section": "benchmark_details",
"field": "citation",
"schemaPath": "benchmark_details.citation",
"fullPath": "autobenchmarkcard.benchmark_details.citation",
"type": "string",
"description": "BibTeX or APA citation for the benchmark paper or dataset.",
"required": "optional"
},
{
"id": "autobenchmarkcard:purpose_and_intended_users.intended_use",
"source": "autobenchmarkcard",
"section": "purpose_and_intended_users",
"field": "intended_use",
"schemaPath": "purpose_and_intended_users.intended_use",
"fullPath": "autobenchmarkcard.purpose_and_intended_users.intended_use",
"type": "string",
"description": "Description of the intended use cases and audiences for this benchmark.",
"required": "required"
},
{
"id": "autobenchmarkcard:purpose_and_intended_users.limitations",
"source": "autobenchmarkcard",
"section": "purpose_and_intended_users",
"field": "limitations",
"schemaPath": "purpose_and_intended_users.limitations",
"fullPath": "autobenchmarkcard.purpose_and_intended_users.limitations",
"type": "string",
"description": "Known limitations of the benchmark, including scope restrictions, population coverage gaps, or validity concerns.",
"required": "required"
},
{
"id": "autobenchmarkcard:purpose_and_intended_users.out_of_scope",
"source": "autobenchmarkcard",
"section": "purpose_and_intended_users",
"field": "out_of_scope",
"schemaPath": "purpose_and_intended_users.out_of_scope",
"fullPath": "autobenchmarkcard.purpose_and_intended_users.out_of_scope",
"type": "string",
"description": "Explicit description of use cases or capabilities the benchmark is not designed to evaluate.",
"required": "optional"
},
{
"id": "autobenchmarkcard:methodology.metrics",
"source": "autobenchmarkcard",
"section": "methodology",
"field": "metrics",
"schemaPath": "methodology.metrics",
"fullPath": "autobenchmarkcard.methodology.metrics",
"type": "array",
"description": "List of evaluation metrics used (e.g., accuracy, F1, BLEU), including their definitions and how they are computed.",
"required": "required"
},
{
"id": "autobenchmarkcard:methodology.validation",
"source": "autobenchmarkcard",
"section": "methodology",
"field": "validation",
"schemaPath": "methodology.validation",
"fullPath": "autobenchmarkcard.methodology.validation",
"type": "string",
"description": "Description of validation procedures used to ensure benchmark quality, including human review, pilot studies, or inter-annotator agreement.",
"required": "required"
},
{
"id": "autobenchmarkcard:methodology.interpretation",
"source": "autobenchmarkcard",
"section": "methodology",
"field": "interpretation",
"schemaPath": "methodology.interpretation",
"fullPath": "autobenchmarkcard.methodology.interpretation",
"type": "string",
"description": "Guidance on how to interpret benchmark scores, including what constitutes a meaningful difference and known confounds.",
"required": "required"
},
{
"id": "autobenchmarkcard:methodology.data_collection",
"source": "autobenchmarkcard",
"section": "methodology",
"field": "data_collection",
"schemaPath": "methodology.data_collection",
"fullPath": "autobenchmarkcard.methodology.data_collection",
"type": "string",
"description": "Description of how benchmark data was collected, curated, or generated.",
"required": "optional"
},
{
"id": "autobenchmarkcard:methodology.prompt_format",
"source": "autobenchmarkcard",
"section": "methodology",
"field": "prompt_format",
"schemaPath": "methodology.prompt_format",
"fullPath": "autobenchmarkcard.methodology.prompt_format",
"type": "string",
"description": "Specification of the prompt template or format used when querying models.",
"required": "optional"
},
{
"id": "autobenchmarkcard:ethical_and_legal_considerations.compliance_with_regulations",
"source": "autobenchmarkcard",
"section": "ethical_and_legal_considerations",
"field": "compliance_with_regulations",
"schemaPath": "ethical_and_legal_considerations.compliance_with_regulations",
"fullPath": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
"type": "string",
"description": "Statement on compliance with relevant regulations or legal frameworks (e.g., GDPR, EU AI Act, NIST RMF).",
"required": "required"
},
{
"id": "autobenchmarkcard:ethical_and_legal_considerations.data_privacy",
"source": "autobenchmarkcard",
"section": "ethical_and_legal_considerations",
"field": "data_privacy",
"schemaPath": "ethical_and_legal_considerations.data_privacy",
"fullPath": "autobenchmarkcard.ethical_and_legal_considerations.data_privacy",
"type": "string",
"description": "Description of how personal data or sensitive information is handled in the benchmark.",
"required": "optional"
},
{
"id": "autobenchmarkcard:ethical_and_legal_considerations.consent",
"source": "autobenchmarkcard",
"section": "ethical_and_legal_considerations",
"field": "consent",
"schemaPath": "ethical_and_legal_considerations.consent",
"fullPath": "autobenchmarkcard.ethical_and_legal_considerations.consent",
"type": "string",
"description": "Information about consent obtained from data subjects or annotators.",
"required": "optional"
},
{
"id": "autobenchmarkcard:possible_risks.category",
"source": "autobenchmarkcard",
"section": "possible_risks",
"field": "category",
"schemaPath": "possible_risks.category",
"fullPath": "autobenchmarkcard.possible_risks.category",
"type": "array",
"description": "Categorized list of potential risks associated with misuse or misinterpretation of benchmark results (e.g., gaming, overfitting, contamination).",
"required": "required"
},
{
"id": "autobenchmarkcard:possible_risks.mitigation",
"source": "autobenchmarkcard",
"section": "possible_risks",
"field": "mitigation",
"schemaPath": "possible_risks.mitigation",
"fullPath": "autobenchmarkcard.possible_risks.mitigation",
"type": "string",
"description": "Recommended mitigations or safeguards to reduce identified risks.",
"required": "optional"
},
{
"id": "autobenchmarkcard:dataset_details.size",
"source": "autobenchmarkcard",
"section": "dataset_details",
"field": "size",
"schemaPath": "dataset_details.size",
"fullPath": "autobenchmarkcard.dataset_details.size",
"type": "integer",
"description": "Total number of examples or items in the benchmark dataset.",
"required": "optional"
},
{
"id": "autobenchmarkcard:dataset_details.languages",
"source": "autobenchmarkcard",
"section": "dataset_details",
"field": "languages",
"schemaPath": "dataset_details.languages",
"fullPath": "autobenchmarkcard.dataset_details.languages",
"type": "array",
"description": "Languages represented in the benchmark dataset (ISO 639-1 codes).",
"required": "optional"
},
{
"id": "autobenchmarkcard:dataset_details.domains",
"source": "autobenchmarkcard",
"section": "dataset_details",
"field": "domains",
"schemaPath": "dataset_details.domains",
"fullPath": "autobenchmarkcard.dataset_details.domains",
"type": "array",
"description": "Subject domains covered by the benchmark (e.g., medicine, law, mathematics, code).",
"required": "optional"
},
{
"id": "autobenchmarkcard:dataset_details.license",
"source": "autobenchmarkcard",
"section": "dataset_details",
"field": "license",
"schemaPath": "dataset_details.license",
"fullPath": "autobenchmarkcard.dataset_details.license",
"type": "string",
"description": "License under which the benchmark dataset is distributed.",
"required": "optional"
},
{
"id": "autobenchmarkcard:leaderboard_info.url",
"source": "autobenchmarkcard",
"section": "leaderboard_info",
"field": "url",
"schemaPath": "leaderboard_info.url",
"fullPath": "autobenchmarkcard.leaderboard_info.url",
"type": "string",
"description": "URL of the official leaderboard or results page for this benchmark.",
"required": "optional"
},
{
"id": "autobenchmarkcard:leaderboard_info.submission_requirements",
"source": "autobenchmarkcard",
"section": "leaderboard_info",
"field": "submission_requirements",
"schemaPath": "leaderboard_info.submission_requirements",
"fullPath": "autobenchmarkcard.leaderboard_info.submission_requirements",
"type": "string",
"description": "Requirements for submitting model results to the benchmark leaderboard.",
"required": "optional"
},
{
"id": "eee_instance_level_eval:instance_id",
"source": "eee_instance_level_eval",
"section": "root",
"field": "instance_id",
"schemaPath": "instance_id",
"fullPath": "eee_instance_level_eval.instance_id",
"type": "string",
"description": "Unique identifier for a single evaluation instance or example.",
"required": "required"
},
{
"id": "eee_instance_level_eval:model_output",
"source": "eee_instance_level_eval",
"section": "root",
"field": "model_output",
"schemaPath": "model_output",
"fullPath": "eee_instance_level_eval.model_output",
"type": "string",
"description": "Raw text output generated by the model for this instance.",
"required": "required"
},
{
"id": "eee_instance_level_eval:ground_truth",
"source": "eee_instance_level_eval",
"section": "root",
"field": "ground_truth",
"schemaPath": "ground_truth",
"fullPath": "eee_instance_level_eval.ground_truth",
"type": "string",
"description": "Reference answer or ground truth label for this instance.",
"required": "optional"
},
{
"id": "eee_instance_level_eval:score",
"source": "eee_instance_level_eval",
"section": "root",
"field": "score",
"schemaPath": "score",
"fullPath": "eee_instance_level_eval.score",
"type": "number",
"description": "Numeric score assigned to this instance by the evaluation metric.",
"required": "required"
},
{
"id": "eee_instance_level_eval:prompt",
"source": "eee_instance_level_eval",
"section": "root",
"field": "prompt",
"schemaPath": "prompt",
"fullPath": "eee_instance_level_eval.prompt",
"type": "string",
"description": "Full prompt text as presented to the model for this instance.",
"required": "optional"
},
{
"id": "eee_instance_level_eval:task_name",
"source": "eee_instance_level_eval",
"section": "root",
"field": "task_name",
"schemaPath": "task_name",
"fullPath": "eee_instance_level_eval.task_name",
"type": "string",
"description": "Name of the task or benchmark this instance belongs to.",
"required": "required"
},
{
"id": "eee_instance_level_eval:metadata.difficulty",
"source": "eee_instance_level_eval",
"section": "metadata",
"field": "difficulty",
"schemaPath": "metadata.difficulty",
"fullPath": "eee_instance_level_eval.metadata.difficulty",
"type": "string",
"description": "Difficulty level or category of this instance (e.g., easy, medium, hard).",
"required": "optional"
},
{
"id": "eee_instance_level_eval:metadata.subject",
"source": "eee_instance_level_eval",
"section": "metadata",
"field": "subject",
"schemaPath": "metadata.subject",
"fullPath": "eee_instance_level_eval.metadata.subject",
"type": "string",
"description": "Subject or topic area of this instance.",
"required": "optional"
},
{
"id": "eee_instance_level_eval:metadata.source_dataset",
"source": "eee_instance_level_eval",
"section": "metadata",
"field": "source_dataset",
"schemaPath": "metadata.source_dataset",
"fullPath": "eee_instance_level_eval.metadata.source_dataset",
"type": "string",
"description": "Original dataset this instance was sourced from.",
"required": "optional"
}
]