Spaces:

evaleval
/

general-eval-card

Running

App Files Community

general-eval-card/data/survey/eval-schema-fields.json

evijit HF Staff

fix bugs

04b4cff about 2 months ago

raw

history blame contribute delete

21.1 kB

	[
	{
	"id": "eee_eval:source_metadata.evaluator_relationship",
	"source": "eee_eval",
	"section": "source_metadata",
	"field": "evaluator_relationship",
	"schemaPath": "source_metadata.evaluator_relationship",
	"fullPath": "eee_eval.source_metadata.evaluator_relationship",
	"type": "string",
	"description": "Relationship between the evaluator and the model developer (e.g., first-party, third-party, independent).",
	"required": "required"
	},
	{
	"id": "eee_eval:source_metadata.source_organization_name",
	"source": "eee_eval",
	"section": "source_metadata",
	"field": "source_organization_name",
	"schemaPath": "source_metadata.source_organization_name",
	"fullPath": "eee_eval.source_metadata.source_organization_name",
	"type": "string",
	"description": "Name of the organization that produced or published the evaluation results.",
	"required": "required"
	},
	{
	"id": "eee_eval:source_metadata.source_url",
	"source": "eee_eval",
	"section": "source_metadata",
	"field": "source_url",
	"schemaPath": "source_metadata.source_url",
	"fullPath": "eee_eval.source_metadata.source_url",
	"type": "string",
	"description": "URL pointing to the original source of the evaluation results.",
	"required": "optional"
	},
	{
	"id": "eee_eval:source_metadata.publication_date",
	"source": "eee_eval",
	"section": "source_metadata",
	"field": "publication_date",
	"schemaPath": "source_metadata.publication_date",
	"fullPath": "eee_eval.source_metadata.publication_date",
	"type": "string",
	"description": "Date when the evaluation results were published or made publicly available.",
	"required": "optional"
	},
	{
	"id": "eee_eval:retrieved_timestamp",
	"source": "eee_eval",
	"section": "root",
	"field": "retrieved_timestamp",
	"schemaPath": "retrieved_timestamp",
	"fullPath": "eee_eval.retrieved_timestamp",
	"type": "string",
	"description": "ISO 8601 timestamp indicating when the evaluation data was retrieved or ingested.",
	"required": "required"
	},
	{
	"id": "eee_eval:eval_library.name",
	"source": "eee_eval",
	"section": "eval_library",
	"field": "name",
	"schemaPath": "eval_library.name",
	"fullPath": "eee_eval.eval_library.name",
	"type": "string",
	"description": "Name of the evaluation library or harness used to run the evaluation (e.g., lm-evaluation-harness, HELM).",
	"required": "required"
	},
	{
	"id": "eee_eval:eval_library.version",
	"source": "eee_eval",
	"section": "eval_library",
	"field": "version",
	"schemaPath": "eval_library.version",
	"fullPath": "eee_eval.eval_library.version",
	"type": "string",
	"description": "Version string of the evaluation library used, enabling reproducibility checks.",
	"required": "required"
	},
	{
	"id": "eee_eval:eval_library.url",
	"source": "eee_eval",
	"section": "eval_library",
	"field": "url",
	"schemaPath": "eval_library.url",
	"fullPath": "eee_eval.eval_library.url",
	"type": "string",
	"description": "Repository or documentation URL for the evaluation library.",
	"required": "optional"
	},
	{
	"id": "eee_eval:model_info.model_id",
	"source": "eee_eval",
	"section": "model_info",
	"field": "model_id",
	"schemaPath": "model_info.model_id",
	"fullPath": "eee_eval.model_info.model_id",
	"type": "string",
	"description": "Unique identifier for the model being evaluated (e.g., HuggingFace model ID).",
	"required": "required"
	},
	{
	"id": "eee_eval:model_info.model_revision",
	"source": "eee_eval",
	"section": "model_info",
	"field": "model_revision",
	"schemaPath": "model_info.model_revision",
	"fullPath": "eee_eval.model_info.model_revision",
	"type": "string",
	"description": "Git revision or checkpoint hash of the model weights used during evaluation.",
	"required": "optional"
	},
	{
	"id": "eee_eval:model_info.model_type",
	"source": "eee_eval",
	"section": "model_info",
	"field": "model_type",
	"schemaPath": "model_info.model_type",
	"fullPath": "eee_eval.model_info.model_type",
	"type": "string",
	"description": "Type or architecture category of the model (e.g., decoder-only, encoder-decoder).",
	"required": "optional"
	},
	{
	"id": "eee_eval:evaluation_results.generation_config",
	"source": "eee_eval",
	"section": "evaluation_results",
	"field": "generation_config",
	"schemaPath": "evaluation_results.generation_config",
	"fullPath": "eee_eval.evaluation_results.generation_config",
	"type": "object",
	"description": "Generation configuration used during evaluation, including temperature, top-p, max tokens, and other sampling parameters.",
	"required": "required"
	},
	{
	"id": "eee_eval:evaluation_results.scores",
	"source": "eee_eval",
	"section": "evaluation_results",
	"field": "scores",
	"schemaPath": "evaluation_results.scores",
	"fullPath": "eee_eval.evaluation_results.scores",
	"type": "object",
	"description": "Aggregate scores across benchmarks, keyed by benchmark name.",
	"required": "required"
	},
	{
	"id": "eee_eval:evaluation_results.num_few_shot",
	"source": "eee_eval",
	"section": "evaluation_results",
	"field": "num_few_shot",
	"schemaPath": "evaluation_results.num_few_shot",
	"fullPath": "eee_eval.evaluation_results.num_few_shot",
	"type": "integer",
	"description": "Number of few-shot examples provided in the prompt during evaluation.",
	"required": "optional"
	},
	{
	"id": "eee_eval:detailed_evaluation_results.file_path",
	"source": "eee_eval",
	"section": "detailed_evaluation_results",
	"field": "file_path",
	"schemaPath": "detailed_evaluation_results.file_path",
	"fullPath": "eee_eval.detailed_evaluation_results.file_path",
	"type": "string",
	"description": "Path or URL to files containing per-sample evaluation results for detailed analysis.",
	"required": "optional"
	},
	{
	"id": "eee_eval:detailed_evaluation_results.format",
	"source": "eee_eval",
	"section": "detailed_evaluation_results",
	"field": "format",
	"schemaPath": "detailed_evaluation_results.format",
	"fullPath": "eee_eval.detailed_evaluation_results.format",
	"type": "string",
	"description": "File format of the detailed evaluation results (e.g., jsonl, parquet, csv).",
	"required": "optional"
	},
	{
	"id": "eee_eval:hardware_info.gpu_type",
	"source": "eee_eval",
	"section": "hardware_info",
	"field": "gpu_type",
	"schemaPath": "hardware_info.gpu_type",
	"fullPath": "eee_eval.hardware_info.gpu_type",
	"type": "string",
	"description": "Type and model of GPU hardware used during evaluation.",
	"required": "optional"
	},
	{
	"id": "eee_eval:hardware_info.num_gpus",
	"source": "eee_eval",
	"section": "hardware_info",
	"field": "num_gpus",
	"schemaPath": "hardware_info.num_gpus",
	"fullPath": "eee_eval.hardware_info.num_gpus",
	"type": "integer",
	"description": "Number of GPUs used during evaluation.",
	"required": "optional"
	},
	{
	"id": "autobenchmarkcard:benchmark_details.overview",
	"source": "autobenchmarkcard",
	"section": "benchmark_details",
	"field": "overview",
	"schemaPath": "benchmark_details.overview",
	"fullPath": "autobenchmarkcard.benchmark_details.overview",
	"type": "string",
	"description": "High-level summary of the benchmark, its purpose, and the capabilities it is designed to measure.",
	"required": "required"
	},
	{
	"id": "autobenchmarkcard:benchmark_details.name",
	"source": "autobenchmarkcard",
	"section": "benchmark_details",
	"field": "name",
	"schemaPath": "benchmark_details.name",
	"fullPath": "autobenchmarkcard.benchmark_details.name",
	"type": "string",
	"description": "Official name of the benchmark.",
	"required": "required"
	},
	{
	"id": "autobenchmarkcard:benchmark_details.version",
	"source": "autobenchmarkcard",
	"section": "benchmark_details",
	"field": "version",
	"schemaPath": "benchmark_details.version",
	"fullPath": "autobenchmarkcard.benchmark_details.version",
	"type": "string",
	"description": "Version of the benchmark dataset or task specification.",
	"required": "required"
	},
	{
	"id": "autobenchmarkcard:benchmark_details.release_date",
	"source": "autobenchmarkcard",
	"section": "benchmark_details",
	"field": "release_date",
	"schemaPath": "benchmark_details.release_date",
	"fullPath": "autobenchmarkcard.benchmark_details.release_date",
	"type": "string",
	"description": "Date the benchmark was publicly released.",
	"required": "optional"
	},
	{
	"id": "autobenchmarkcard:benchmark_details.citation",
	"source": "autobenchmarkcard",
	"section": "benchmark_details",
	"field": "citation",
	"schemaPath": "benchmark_details.citation",
	"fullPath": "autobenchmarkcard.benchmark_details.citation",
	"type": "string",
	"description": "BibTeX or APA citation for the benchmark paper or dataset.",
	"required": "optional"
	},
	{
	"id": "autobenchmarkcard:purpose_and_intended_users.intended_use",
	"source": "autobenchmarkcard",
	"section": "purpose_and_intended_users",
	"field": "intended_use",
	"schemaPath": "purpose_and_intended_users.intended_use",
	"fullPath": "autobenchmarkcard.purpose_and_intended_users.intended_use",
	"type": "string",
	"description": "Description of the intended use cases and audiences for this benchmark.",
	"required": "required"
	},
	{
	"id": "autobenchmarkcard:purpose_and_intended_users.limitations",
	"source": "autobenchmarkcard",
	"section": "purpose_and_intended_users",
	"field": "limitations",
	"schemaPath": "purpose_and_intended_users.limitations",
	"fullPath": "autobenchmarkcard.purpose_and_intended_users.limitations",
	"type": "string",
	"description": "Known limitations of the benchmark, including scope restrictions, population coverage gaps, or validity concerns.",
	"required": "required"
	},
	{
	"id": "autobenchmarkcard:purpose_and_intended_users.out_of_scope",
	"source": "autobenchmarkcard",
	"section": "purpose_and_intended_users",
	"field": "out_of_scope",
	"schemaPath": "purpose_and_intended_users.out_of_scope",
	"fullPath": "autobenchmarkcard.purpose_and_intended_users.out_of_scope",
	"type": "string",
	"description": "Explicit description of use cases or capabilities the benchmark is not designed to evaluate.",
	"required": "optional"
	},
	{
	"id": "autobenchmarkcard:methodology.metrics",
	"source": "autobenchmarkcard",
	"section": "methodology",
	"field": "metrics",
	"schemaPath": "methodology.metrics",
	"fullPath": "autobenchmarkcard.methodology.metrics",
	"type": "array",
	"description": "List of evaluation metrics used (e.g., accuracy, F1, BLEU), including their definitions and how they are computed.",
	"required": "required"
	},
	{
	"id": "autobenchmarkcard:methodology.validation",
	"source": "autobenchmarkcard",
	"section": "methodology",
	"field": "validation",
	"schemaPath": "methodology.validation",
	"fullPath": "autobenchmarkcard.methodology.validation",
	"type": "string",
	"description": "Description of validation procedures used to ensure benchmark quality, including human review, pilot studies, or inter-annotator agreement.",
	"required": "required"
	},
	{
	"id": "autobenchmarkcard:methodology.interpretation",
	"source": "autobenchmarkcard",
	"section": "methodology",
	"field": "interpretation",
	"schemaPath": "methodology.interpretation",
	"fullPath": "autobenchmarkcard.methodology.interpretation",
	"type": "string",
	"description": "Guidance on how to interpret benchmark scores, including what constitutes a meaningful difference and known confounds.",
	"required": "required"
	},
	{
	"id": "autobenchmarkcard:methodology.data_collection",
	"source": "autobenchmarkcard",
	"section": "methodology",
	"field": "data_collection",
	"schemaPath": "methodology.data_collection",
	"fullPath": "autobenchmarkcard.methodology.data_collection",
	"type": "string",
	"description": "Description of how benchmark data was collected, curated, or generated.",
	"required": "optional"
	},
	{
	"id": "autobenchmarkcard:methodology.prompt_format",
	"source": "autobenchmarkcard",
	"section": "methodology",
	"field": "prompt_format",
	"schemaPath": "methodology.prompt_format",
	"fullPath": "autobenchmarkcard.methodology.prompt_format",
	"type": "string",
	"description": "Specification of the prompt template or format used when querying models.",
	"required": "optional"
	},
	{
	"id": "autobenchmarkcard:ethical_and_legal_considerations.compliance_with_regulations",
	"source": "autobenchmarkcard",
	"section": "ethical_and_legal_considerations",
	"field": "compliance_with_regulations",
	"schemaPath": "ethical_and_legal_considerations.compliance_with_regulations",
	"fullPath": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
	"type": "string",
	"description": "Statement on compliance with relevant regulations or legal frameworks (e.g., GDPR, EU AI Act, NIST RMF).",
	"required": "required"
	},
	{
	"id": "autobenchmarkcard:ethical_and_legal_considerations.data_privacy",
	"source": "autobenchmarkcard",
	"section": "ethical_and_legal_considerations",
	"field": "data_privacy",
	"schemaPath": "ethical_and_legal_considerations.data_privacy",
	"fullPath": "autobenchmarkcard.ethical_and_legal_considerations.data_privacy",
	"type": "string",
	"description": "Description of how personal data or sensitive information is handled in the benchmark.",
	"required": "optional"
	},
	{
	"id": "autobenchmarkcard:ethical_and_legal_considerations.consent",
	"source": "autobenchmarkcard",
	"section": "ethical_and_legal_considerations",
	"field": "consent",
	"schemaPath": "ethical_and_legal_considerations.consent",
	"fullPath": "autobenchmarkcard.ethical_and_legal_considerations.consent",
	"type": "string",
	"description": "Information about consent obtained from data subjects or annotators.",
	"required": "optional"
	},
	{
	"id": "autobenchmarkcard:possible_risks.category",
	"source": "autobenchmarkcard",
	"section": "possible_risks",
	"field": "category",
	"schemaPath": "possible_risks.category",
	"fullPath": "autobenchmarkcard.possible_risks.category",
	"type": "array",
	"description": "Categorized list of potential risks associated with misuse or misinterpretation of benchmark results (e.g., gaming, overfitting, contamination).",
	"required": "required"
	},
	{
	"id": "autobenchmarkcard:possible_risks.mitigation",
	"source": "autobenchmarkcard",
	"section": "possible_risks",
	"field": "mitigation",
	"schemaPath": "possible_risks.mitigation",
	"fullPath": "autobenchmarkcard.possible_risks.mitigation",
	"type": "string",
	"description": "Recommended mitigations or safeguards to reduce identified risks.",
	"required": "optional"
	},
	{
	"id": "autobenchmarkcard:dataset_details.size",
	"source": "autobenchmarkcard",
	"section": "dataset_details",
	"field": "size",
	"schemaPath": "dataset_details.size",
	"fullPath": "autobenchmarkcard.dataset_details.size",
	"type": "integer",
	"description": "Total number of examples or items in the benchmark dataset.",
	"required": "optional"
	},
	{
	"id": "autobenchmarkcard:dataset_details.languages",
	"source": "autobenchmarkcard",
	"section": "dataset_details",
	"field": "languages",
	"schemaPath": "dataset_details.languages",
	"fullPath": "autobenchmarkcard.dataset_details.languages",
	"type": "array",
	"description": "Languages represented in the benchmark dataset (ISO 639-1 codes).",
	"required": "optional"
	},
	{
	"id": "autobenchmarkcard:dataset_details.domains",
	"source": "autobenchmarkcard",
	"section": "dataset_details",
	"field": "domains",
	"schemaPath": "dataset_details.domains",
	"fullPath": "autobenchmarkcard.dataset_details.domains",
	"type": "array",
	"description": "Subject domains covered by the benchmark (e.g., medicine, law, mathematics, code).",
	"required": "optional"
	},
	{
	"id": "autobenchmarkcard:dataset_details.license",
	"source": "autobenchmarkcard",
	"section": "dataset_details",
	"field": "license",
	"schemaPath": "dataset_details.license",
	"fullPath": "autobenchmarkcard.dataset_details.license",
	"type": "string",
	"description": "License under which the benchmark dataset is distributed.",
	"required": "optional"
	},
	{
	"id": "autobenchmarkcard:leaderboard_info.url",
	"source": "autobenchmarkcard",
	"section": "leaderboard_info",
	"field": "url",
	"schemaPath": "leaderboard_info.url",
	"fullPath": "autobenchmarkcard.leaderboard_info.url",
	"type": "string",
	"description": "URL of the official leaderboard or results page for this benchmark.",
	"required": "optional"
	},
	{
	"id": "autobenchmarkcard:leaderboard_info.submission_requirements",
	"source": "autobenchmarkcard",
	"section": "leaderboard_info",
	"field": "submission_requirements",
	"schemaPath": "leaderboard_info.submission_requirements",
	"fullPath": "autobenchmarkcard.leaderboard_info.submission_requirements",
	"type": "string",
	"description": "Requirements for submitting model results to the benchmark leaderboard.",
	"required": "optional"
	},
	{
	"id": "eee_instance_level_eval:instance_id",
	"source": "eee_instance_level_eval",
	"section": "root",
	"field": "instance_id",
	"schemaPath": "instance_id",
	"fullPath": "eee_instance_level_eval.instance_id",
	"type": "string",
	"description": "Unique identifier for a single evaluation instance or example.",
	"required": "required"
	},
	{
	"id": "eee_instance_level_eval:model_output",
	"source": "eee_instance_level_eval",
	"section": "root",
	"field": "model_output",
	"schemaPath": "model_output",
	"fullPath": "eee_instance_level_eval.model_output",
	"type": "string",
	"description": "Raw text output generated by the model for this instance.",
	"required": "required"
	},
	{
	"id": "eee_instance_level_eval:ground_truth",
	"source": "eee_instance_level_eval",
	"section": "root",
	"field": "ground_truth",
	"schemaPath": "ground_truth",
	"fullPath": "eee_instance_level_eval.ground_truth",
	"type": "string",
	"description": "Reference answer or ground truth label for this instance.",
	"required": "optional"
	},
	{
	"id": "eee_instance_level_eval:score",
	"source": "eee_instance_level_eval",
	"section": "root",
	"field": "score",
	"schemaPath": "score",
	"fullPath": "eee_instance_level_eval.score",
	"type": "number",
	"description": "Numeric score assigned to this instance by the evaluation metric.",
	"required": "required"
	},
	{
	"id": "eee_instance_level_eval:prompt",
	"source": "eee_instance_level_eval",
	"section": "root",
	"field": "prompt",
	"schemaPath": "prompt",
	"fullPath": "eee_instance_level_eval.prompt",
	"type": "string",
	"description": "Full prompt text as presented to the model for this instance.",
	"required": "optional"
	},
	{
	"id": "eee_instance_level_eval:task_name",
	"source": "eee_instance_level_eval",
	"section": "root",
	"field": "task_name",
	"schemaPath": "task_name",
	"fullPath": "eee_instance_level_eval.task_name",
	"type": "string",
	"description": "Name of the task or benchmark this instance belongs to.",
	"required": "required"
	},
	{
	"id": "eee_instance_level_eval:metadata.difficulty",
	"source": "eee_instance_level_eval",
	"section": "metadata",
	"field": "difficulty",
	"schemaPath": "metadata.difficulty",
	"fullPath": "eee_instance_level_eval.metadata.difficulty",
	"type": "string",
	"description": "Difficulty level or category of this instance (e.g., easy, medium, hard).",
	"required": "optional"
	},
	{
	"id": "eee_instance_level_eval:metadata.subject",
	"source": "eee_instance_level_eval",
	"section": "metadata",
	"field": "subject",
	"schemaPath": "metadata.subject",
	"fullPath": "eee_instance_level_eval.metadata.subject",
	"type": "string",
	"description": "Subject or topic area of this instance.",
	"required": "optional"
	},
	{
	"id": "eee_instance_level_eval:metadata.source_dataset",
	"source": "eee_instance_level_eval",
	"section": "metadata",
	"field": "source_dataset",
	"schemaPath": "metadata.source_dataset",
	"fullPath": "eee_instance_level_eval.metadata.source_dataset",
	"type": "string",
	"description": "Original dataset this instance was sourced from.",
	"required": "optional"
	}
	]