Spaces:
Running
Running
| [ | |
| { | |
| "id": "eee_eval:source_metadata.evaluator_relationship", | |
| "source": "eee_eval", | |
| "section": "source_metadata", | |
| "field": "evaluator_relationship", | |
| "schemaPath": "source_metadata.evaluator_relationship", | |
| "fullPath": "eee_eval.source_metadata.evaluator_relationship", | |
| "type": "string", | |
| "description": "Relationship between the evaluator and the model developer (e.g., first-party, third-party, independent).", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "eee_eval:source_metadata.source_organization_name", | |
| "source": "eee_eval", | |
| "section": "source_metadata", | |
| "field": "source_organization_name", | |
| "schemaPath": "source_metadata.source_organization_name", | |
| "fullPath": "eee_eval.source_metadata.source_organization_name", | |
| "type": "string", | |
| "description": "Name of the organization that produced or published the evaluation results.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "eee_eval:source_metadata.source_url", | |
| "source": "eee_eval", | |
| "section": "source_metadata", | |
| "field": "source_url", | |
| "schemaPath": "source_metadata.source_url", | |
| "fullPath": "eee_eval.source_metadata.source_url", | |
| "type": "string", | |
| "description": "URL pointing to the original source of the evaluation results.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "eee_eval:source_metadata.publication_date", | |
| "source": "eee_eval", | |
| "section": "source_metadata", | |
| "field": "publication_date", | |
| "schemaPath": "source_metadata.publication_date", | |
| "fullPath": "eee_eval.source_metadata.publication_date", | |
| "type": "string", | |
| "description": "Date when the evaluation results were published or made publicly available.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "eee_eval:retrieved_timestamp", | |
| "source": "eee_eval", | |
| "section": "root", | |
| "field": "retrieved_timestamp", | |
| "schemaPath": "retrieved_timestamp", | |
| "fullPath": "eee_eval.retrieved_timestamp", | |
| "type": "string", | |
| "description": "ISO 8601 timestamp indicating when the evaluation data was retrieved or ingested.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "eee_eval:eval_library.name", | |
| "source": "eee_eval", | |
| "section": "eval_library", | |
| "field": "name", | |
| "schemaPath": "eval_library.name", | |
| "fullPath": "eee_eval.eval_library.name", | |
| "type": "string", | |
| "description": "Name of the evaluation library or harness used to run the evaluation (e.g., lm-evaluation-harness, HELM).", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "eee_eval:eval_library.version", | |
| "source": "eee_eval", | |
| "section": "eval_library", | |
| "field": "version", | |
| "schemaPath": "eval_library.version", | |
| "fullPath": "eee_eval.eval_library.version", | |
| "type": "string", | |
| "description": "Version string of the evaluation library used, enabling reproducibility checks.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "eee_eval:eval_library.url", | |
| "source": "eee_eval", | |
| "section": "eval_library", | |
| "field": "url", | |
| "schemaPath": "eval_library.url", | |
| "fullPath": "eee_eval.eval_library.url", | |
| "type": "string", | |
| "description": "Repository or documentation URL for the evaluation library.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "eee_eval:model_info.model_id", | |
| "source": "eee_eval", | |
| "section": "model_info", | |
| "field": "model_id", | |
| "schemaPath": "model_info.model_id", | |
| "fullPath": "eee_eval.model_info.model_id", | |
| "type": "string", | |
| "description": "Unique identifier for the model being evaluated (e.g., HuggingFace model ID).", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "eee_eval:model_info.model_revision", | |
| "source": "eee_eval", | |
| "section": "model_info", | |
| "field": "model_revision", | |
| "schemaPath": "model_info.model_revision", | |
| "fullPath": "eee_eval.model_info.model_revision", | |
| "type": "string", | |
| "description": "Git revision or checkpoint hash of the model weights used during evaluation.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "eee_eval:model_info.model_type", | |
| "source": "eee_eval", | |
| "section": "model_info", | |
| "field": "model_type", | |
| "schemaPath": "model_info.model_type", | |
| "fullPath": "eee_eval.model_info.model_type", | |
| "type": "string", | |
| "description": "Type or architecture category of the model (e.g., decoder-only, encoder-decoder).", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "eee_eval:evaluation_results.generation_config", | |
| "source": "eee_eval", | |
| "section": "evaluation_results", | |
| "field": "generation_config", | |
| "schemaPath": "evaluation_results.generation_config", | |
| "fullPath": "eee_eval.evaluation_results.generation_config", | |
| "type": "object", | |
| "description": "Generation configuration used during evaluation, including temperature, top-p, max tokens, and other sampling parameters.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "eee_eval:evaluation_results.scores", | |
| "source": "eee_eval", | |
| "section": "evaluation_results", | |
| "field": "scores", | |
| "schemaPath": "evaluation_results.scores", | |
| "fullPath": "eee_eval.evaluation_results.scores", | |
| "type": "object", | |
| "description": "Aggregate scores across benchmarks, keyed by benchmark name.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "eee_eval:evaluation_results.num_few_shot", | |
| "source": "eee_eval", | |
| "section": "evaluation_results", | |
| "field": "num_few_shot", | |
| "schemaPath": "evaluation_results.num_few_shot", | |
| "fullPath": "eee_eval.evaluation_results.num_few_shot", | |
| "type": "integer", | |
| "description": "Number of few-shot examples provided in the prompt during evaluation.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "eee_eval:detailed_evaluation_results.file_path", | |
| "source": "eee_eval", | |
| "section": "detailed_evaluation_results", | |
| "field": "file_path", | |
| "schemaPath": "detailed_evaluation_results.file_path", | |
| "fullPath": "eee_eval.detailed_evaluation_results.file_path", | |
| "type": "string", | |
| "description": "Path or URL to files containing per-sample evaluation results for detailed analysis.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "eee_eval:detailed_evaluation_results.format", | |
| "source": "eee_eval", | |
| "section": "detailed_evaluation_results", | |
| "field": "format", | |
| "schemaPath": "detailed_evaluation_results.format", | |
| "fullPath": "eee_eval.detailed_evaluation_results.format", | |
| "type": "string", | |
| "description": "File format of the detailed evaluation results (e.g., jsonl, parquet, csv).", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "eee_eval:hardware_info.gpu_type", | |
| "source": "eee_eval", | |
| "section": "hardware_info", | |
| "field": "gpu_type", | |
| "schemaPath": "hardware_info.gpu_type", | |
| "fullPath": "eee_eval.hardware_info.gpu_type", | |
| "type": "string", | |
| "description": "Type and model of GPU hardware used during evaluation.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "eee_eval:hardware_info.num_gpus", | |
| "source": "eee_eval", | |
| "section": "hardware_info", | |
| "field": "num_gpus", | |
| "schemaPath": "hardware_info.num_gpus", | |
| "fullPath": "eee_eval.hardware_info.num_gpus", | |
| "type": "integer", | |
| "description": "Number of GPUs used during evaluation.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:benchmark_details.overview", | |
| "source": "autobenchmarkcard", | |
| "section": "benchmark_details", | |
| "field": "overview", | |
| "schemaPath": "benchmark_details.overview", | |
| "fullPath": "autobenchmarkcard.benchmark_details.overview", | |
| "type": "string", | |
| "description": "High-level summary of the benchmark, its purpose, and the capabilities it is designed to measure.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:benchmark_details.name", | |
| "source": "autobenchmarkcard", | |
| "section": "benchmark_details", | |
| "field": "name", | |
| "schemaPath": "benchmark_details.name", | |
| "fullPath": "autobenchmarkcard.benchmark_details.name", | |
| "type": "string", | |
| "description": "Official name of the benchmark.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:benchmark_details.version", | |
| "source": "autobenchmarkcard", | |
| "section": "benchmark_details", | |
| "field": "version", | |
| "schemaPath": "benchmark_details.version", | |
| "fullPath": "autobenchmarkcard.benchmark_details.version", | |
| "type": "string", | |
| "description": "Version of the benchmark dataset or task specification.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:benchmark_details.release_date", | |
| "source": "autobenchmarkcard", | |
| "section": "benchmark_details", | |
| "field": "release_date", | |
| "schemaPath": "benchmark_details.release_date", | |
| "fullPath": "autobenchmarkcard.benchmark_details.release_date", | |
| "type": "string", | |
| "description": "Date the benchmark was publicly released.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:benchmark_details.citation", | |
| "source": "autobenchmarkcard", | |
| "section": "benchmark_details", | |
| "field": "citation", | |
| "schemaPath": "benchmark_details.citation", | |
| "fullPath": "autobenchmarkcard.benchmark_details.citation", | |
| "type": "string", | |
| "description": "BibTeX or APA citation for the benchmark paper or dataset.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:purpose_and_intended_users.intended_use", | |
| "source": "autobenchmarkcard", | |
| "section": "purpose_and_intended_users", | |
| "field": "intended_use", | |
| "schemaPath": "purpose_and_intended_users.intended_use", | |
| "fullPath": "autobenchmarkcard.purpose_and_intended_users.intended_use", | |
| "type": "string", | |
| "description": "Description of the intended use cases and audiences for this benchmark.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:purpose_and_intended_users.limitations", | |
| "source": "autobenchmarkcard", | |
| "section": "purpose_and_intended_users", | |
| "field": "limitations", | |
| "schemaPath": "purpose_and_intended_users.limitations", | |
| "fullPath": "autobenchmarkcard.purpose_and_intended_users.limitations", | |
| "type": "string", | |
| "description": "Known limitations of the benchmark, including scope restrictions, population coverage gaps, or validity concerns.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:purpose_and_intended_users.out_of_scope", | |
| "source": "autobenchmarkcard", | |
| "section": "purpose_and_intended_users", | |
| "field": "out_of_scope", | |
| "schemaPath": "purpose_and_intended_users.out_of_scope", | |
| "fullPath": "autobenchmarkcard.purpose_and_intended_users.out_of_scope", | |
| "type": "string", | |
| "description": "Explicit description of use cases or capabilities the benchmark is not designed to evaluate.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:methodology.metrics", | |
| "source": "autobenchmarkcard", | |
| "section": "methodology", | |
| "field": "metrics", | |
| "schemaPath": "methodology.metrics", | |
| "fullPath": "autobenchmarkcard.methodology.metrics", | |
| "type": "array", | |
| "description": "List of evaluation metrics used (e.g., accuracy, F1, BLEU), including their definitions and how they are computed.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:methodology.validation", | |
| "source": "autobenchmarkcard", | |
| "section": "methodology", | |
| "field": "validation", | |
| "schemaPath": "methodology.validation", | |
| "fullPath": "autobenchmarkcard.methodology.validation", | |
| "type": "string", | |
| "description": "Description of validation procedures used to ensure benchmark quality, including human review, pilot studies, or inter-annotator agreement.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:methodology.interpretation", | |
| "source": "autobenchmarkcard", | |
| "section": "methodology", | |
| "field": "interpretation", | |
| "schemaPath": "methodology.interpretation", | |
| "fullPath": "autobenchmarkcard.methodology.interpretation", | |
| "type": "string", | |
| "description": "Guidance on how to interpret benchmark scores, including what constitutes a meaningful difference and known confounds.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:methodology.data_collection", | |
| "source": "autobenchmarkcard", | |
| "section": "methodology", | |
| "field": "data_collection", | |
| "schemaPath": "methodology.data_collection", | |
| "fullPath": "autobenchmarkcard.methodology.data_collection", | |
| "type": "string", | |
| "description": "Description of how benchmark data was collected, curated, or generated.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:methodology.prompt_format", | |
| "source": "autobenchmarkcard", | |
| "section": "methodology", | |
| "field": "prompt_format", | |
| "schemaPath": "methodology.prompt_format", | |
| "fullPath": "autobenchmarkcard.methodology.prompt_format", | |
| "type": "string", | |
| "description": "Specification of the prompt template or format used when querying models.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:ethical_and_legal_considerations.compliance_with_regulations", | |
| "source": "autobenchmarkcard", | |
| "section": "ethical_and_legal_considerations", | |
| "field": "compliance_with_regulations", | |
| "schemaPath": "ethical_and_legal_considerations.compliance_with_regulations", | |
| "fullPath": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations", | |
| "type": "string", | |
| "description": "Statement on compliance with relevant regulations or legal frameworks (e.g., GDPR, EU AI Act, NIST RMF).", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:ethical_and_legal_considerations.data_privacy", | |
| "source": "autobenchmarkcard", | |
| "section": "ethical_and_legal_considerations", | |
| "field": "data_privacy", | |
| "schemaPath": "ethical_and_legal_considerations.data_privacy", | |
| "fullPath": "autobenchmarkcard.ethical_and_legal_considerations.data_privacy", | |
| "type": "string", | |
| "description": "Description of how personal data or sensitive information is handled in the benchmark.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:ethical_and_legal_considerations.consent", | |
| "source": "autobenchmarkcard", | |
| "section": "ethical_and_legal_considerations", | |
| "field": "consent", | |
| "schemaPath": "ethical_and_legal_considerations.consent", | |
| "fullPath": "autobenchmarkcard.ethical_and_legal_considerations.consent", | |
| "type": "string", | |
| "description": "Information about consent obtained from data subjects or annotators.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:possible_risks.category", | |
| "source": "autobenchmarkcard", | |
| "section": "possible_risks", | |
| "field": "category", | |
| "schemaPath": "possible_risks.category", | |
| "fullPath": "autobenchmarkcard.possible_risks.category", | |
| "type": "array", | |
| "description": "Categorized list of potential risks associated with misuse or misinterpretation of benchmark results (e.g., gaming, overfitting, contamination).", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:possible_risks.mitigation", | |
| "source": "autobenchmarkcard", | |
| "section": "possible_risks", | |
| "field": "mitigation", | |
| "schemaPath": "possible_risks.mitigation", | |
| "fullPath": "autobenchmarkcard.possible_risks.mitigation", | |
| "type": "string", | |
| "description": "Recommended mitigations or safeguards to reduce identified risks.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:dataset_details.size", | |
| "source": "autobenchmarkcard", | |
| "section": "dataset_details", | |
| "field": "size", | |
| "schemaPath": "dataset_details.size", | |
| "fullPath": "autobenchmarkcard.dataset_details.size", | |
| "type": "integer", | |
| "description": "Total number of examples or items in the benchmark dataset.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:dataset_details.languages", | |
| "source": "autobenchmarkcard", | |
| "section": "dataset_details", | |
| "field": "languages", | |
| "schemaPath": "dataset_details.languages", | |
| "fullPath": "autobenchmarkcard.dataset_details.languages", | |
| "type": "array", | |
| "description": "Languages represented in the benchmark dataset (ISO 639-1 codes).", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:dataset_details.domains", | |
| "source": "autobenchmarkcard", | |
| "section": "dataset_details", | |
| "field": "domains", | |
| "schemaPath": "dataset_details.domains", | |
| "fullPath": "autobenchmarkcard.dataset_details.domains", | |
| "type": "array", | |
| "description": "Subject domains covered by the benchmark (e.g., medicine, law, mathematics, code).", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:dataset_details.license", | |
| "source": "autobenchmarkcard", | |
| "section": "dataset_details", | |
| "field": "license", | |
| "schemaPath": "dataset_details.license", | |
| "fullPath": "autobenchmarkcard.dataset_details.license", | |
| "type": "string", | |
| "description": "License under which the benchmark dataset is distributed.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:leaderboard_info.url", | |
| "source": "autobenchmarkcard", | |
| "section": "leaderboard_info", | |
| "field": "url", | |
| "schemaPath": "leaderboard_info.url", | |
| "fullPath": "autobenchmarkcard.leaderboard_info.url", | |
| "type": "string", | |
| "description": "URL of the official leaderboard or results page for this benchmark.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "autobenchmarkcard:leaderboard_info.submission_requirements", | |
| "source": "autobenchmarkcard", | |
| "section": "leaderboard_info", | |
| "field": "submission_requirements", | |
| "schemaPath": "leaderboard_info.submission_requirements", | |
| "fullPath": "autobenchmarkcard.leaderboard_info.submission_requirements", | |
| "type": "string", | |
| "description": "Requirements for submitting model results to the benchmark leaderboard.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "eee_instance_level_eval:instance_id", | |
| "source": "eee_instance_level_eval", | |
| "section": "root", | |
| "field": "instance_id", | |
| "schemaPath": "instance_id", | |
| "fullPath": "eee_instance_level_eval.instance_id", | |
| "type": "string", | |
| "description": "Unique identifier for a single evaluation instance or example.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "eee_instance_level_eval:model_output", | |
| "source": "eee_instance_level_eval", | |
| "section": "root", | |
| "field": "model_output", | |
| "schemaPath": "model_output", | |
| "fullPath": "eee_instance_level_eval.model_output", | |
| "type": "string", | |
| "description": "Raw text output generated by the model for this instance.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "eee_instance_level_eval:ground_truth", | |
| "source": "eee_instance_level_eval", | |
| "section": "root", | |
| "field": "ground_truth", | |
| "schemaPath": "ground_truth", | |
| "fullPath": "eee_instance_level_eval.ground_truth", | |
| "type": "string", | |
| "description": "Reference answer or ground truth label for this instance.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "eee_instance_level_eval:score", | |
| "source": "eee_instance_level_eval", | |
| "section": "root", | |
| "field": "score", | |
| "schemaPath": "score", | |
| "fullPath": "eee_instance_level_eval.score", | |
| "type": "number", | |
| "description": "Numeric score assigned to this instance by the evaluation metric.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "eee_instance_level_eval:prompt", | |
| "source": "eee_instance_level_eval", | |
| "section": "root", | |
| "field": "prompt", | |
| "schemaPath": "prompt", | |
| "fullPath": "eee_instance_level_eval.prompt", | |
| "type": "string", | |
| "description": "Full prompt text as presented to the model for this instance.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "eee_instance_level_eval:task_name", | |
| "source": "eee_instance_level_eval", | |
| "section": "root", | |
| "field": "task_name", | |
| "schemaPath": "task_name", | |
| "fullPath": "eee_instance_level_eval.task_name", | |
| "type": "string", | |
| "description": "Name of the task or benchmark this instance belongs to.", | |
| "required": "required" | |
| }, | |
| { | |
| "id": "eee_instance_level_eval:metadata.difficulty", | |
| "source": "eee_instance_level_eval", | |
| "section": "metadata", | |
| "field": "difficulty", | |
| "schemaPath": "metadata.difficulty", | |
| "fullPath": "eee_instance_level_eval.metadata.difficulty", | |
| "type": "string", | |
| "description": "Difficulty level or category of this instance (e.g., easy, medium, hard).", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "eee_instance_level_eval:metadata.subject", | |
| "source": "eee_instance_level_eval", | |
| "section": "metadata", | |
| "field": "subject", | |
| "schemaPath": "metadata.subject", | |
| "fullPath": "eee_instance_level_eval.metadata.subject", | |
| "type": "string", | |
| "description": "Subject or topic area of this instance.", | |
| "required": "optional" | |
| }, | |
| { | |
| "id": "eee_instance_level_eval:metadata.source_dataset", | |
| "source": "eee_instance_level_eval", | |
| "section": "metadata", | |
| "field": "source_dataset", | |
| "schemaPath": "metadata.source_dataset", | |
| "fullPath": "eee_instance_level_eval.metadata.source_dataset", | |
| "type": "string", | |
| "description": "Original dataset this instance was sourced from.", | |
| "required": "optional" | |
| } | |
| ] | |