Spaces:

evaleval
/

general-eval-card

Running on CPU Spr

File size: 21,144 Bytes

04b4cff

[
  {
    "id": "eee_eval:source_metadata.evaluator_relationship",
    "source": "eee_eval",
    "section": "source_metadata",
    "field": "evaluator_relationship",
    "schemaPath": "source_metadata.evaluator_relationship",
    "fullPath": "eee_eval.source_metadata.evaluator_relationship",
    "type": "string",
    "description": "Relationship between the evaluator and the model developer (e.g., first-party, third-party, independent).",
    "required": "required"
  },
  {
    "id": "eee_eval:source_metadata.source_organization_name",
    "source": "eee_eval",
    "section": "source_metadata",
    "field": "source_organization_name",
    "schemaPath": "source_metadata.source_organization_name",
    "fullPath": "eee_eval.source_metadata.source_organization_name",
    "type": "string",
    "description": "Name of the organization that produced or published the evaluation results.",
    "required": "required"
  },
  {
    "id": "eee_eval:source_metadata.source_url",
    "source": "eee_eval",
    "section": "source_metadata",
    "field": "source_url",
    "schemaPath": "source_metadata.source_url",
    "fullPath": "eee_eval.source_metadata.source_url",
    "type": "string",
    "description": "URL pointing to the original source of the evaluation results.",
    "required": "optional"
  },
  {
    "id": "eee_eval:source_metadata.publication_date",
    "source": "eee_eval",
    "section": "source_metadata",
    "field": "publication_date",
    "schemaPath": "source_metadata.publication_date",
    "fullPath": "eee_eval.source_metadata.publication_date",
    "type": "string",
    "description": "Date when the evaluation results were published or made publicly available.",
    "required": "optional"
  },
  {
    "id": "eee_eval:retrieved_timestamp",
    "source": "eee_eval",
    "section": "root",
    "field": "retrieved_timestamp",
    "schemaPath": "retrieved_timestamp",
    "fullPath": "eee_eval.retrieved_timestamp",
    "type": "string",
    "description": "ISO 8601 timestamp indicating when the evaluation data was retrieved or ingested.",
    "required": "required"
  },
  {
    "id": "eee_eval:eval_library.name",
    "source": "eee_eval",
    "section": "eval_library",
    "field": "name",
    "schemaPath": "eval_library.name",
    "fullPath": "eee_eval.eval_library.name",
    "type": "string",
    "description": "Name of the evaluation library or harness used to run the evaluation (e.g., lm-evaluation-harness, HELM).",
    "required": "required"
  },
  {
    "id": "eee_eval:eval_library.version",
    "source": "eee_eval",
    "section": "eval_library",
    "field": "version",
    "schemaPath": "eval_library.version",
    "fullPath": "eee_eval.eval_library.version",
    "type": "string",
    "description": "Version string of the evaluation library used, enabling reproducibility checks.",
    "required": "required"
  },
  {
    "id": "eee_eval:eval_library.url",
    "source": "eee_eval",
    "section": "eval_library",
    "field": "url",
    "schemaPath": "eval_library.url",
    "fullPath": "eee_eval.eval_library.url",
    "type": "string",
    "description": "Repository or documentation URL for the evaluation library.",
    "required": "optional"
  },
  {
    "id": "eee_eval:model_info.model_id",
    "source": "eee_eval",
    "section": "model_info",
    "field": "model_id",
    "schemaPath": "model_info.model_id",
    "fullPath": "eee_eval.model_info.model_id",
    "type": "string",
    "description": "Unique identifier for the model being evaluated (e.g., Hugging Face model ID).",
    "required": "required"
  },
  {
    "id": "eee_eval:model_info.model_revision",
    "source": "eee_eval",
    "section": "model_info",
    "field": "model_revision",
    "schemaPath": "model_info.model_revision",
    "fullPath": "eee_eval.model_info.model_revision",
    "type": "string",
    "description": "Git revision or checkpoint hash of the model weights used during evaluation.",
    "required": "optional"
  },
  {
    "id": "eee_eval:model_info.model_type",
    "source": "eee_eval",
    "section": "model_info",
    "field": "model_type",
    "schemaPath": "model_info.model_type",
    "fullPath": "eee_eval.model_info.model_type",
    "type": "string",
    "description": "Type or architecture category of the model (e.g., decoder-only, encoder-decoder).",
    "required": "optional"
  },
  {
    "id": "eee_eval:evaluation_results.generation_config",
    "source": "eee_eval",
    "section": "evaluation_results",
    "field": "generation_config",
    "schemaPath": "evaluation_results.generation_config",
    "fullPath": "eee_eval.evaluation_results.generation_config",
    "type": "object",
    "description": "Generation configuration used during evaluation, including temperature, top-p, max tokens, and other sampling parameters.",
    "required": "required"
  },
  {
    "id": "eee_eval:evaluation_results.scores",
    "source": "eee_eval",
    "section": "evaluation_results",
    "field": "scores",
    "schemaPath": "evaluation_results.scores",
    "fullPath": "eee_eval.evaluation_results.scores",
    "type": "object",
    "description": "Aggregate scores across benchmarks, keyed by benchmark name.",
    "required": "required"
  },
  {
    "id": "eee_eval:evaluation_results.num_few_shot",
    "source": "eee_eval",
    "section": "evaluation_results",
    "field": "num_few_shot",
    "schemaPath": "evaluation_results.num_few_shot",
    "fullPath": "eee_eval.evaluation_results.num_few_shot",
    "type": "integer",
    "description": "Number of few-shot examples provided in the prompt during evaluation.",
    "required": "optional"
  },
  {
    "id": "eee_eval:detailed_evaluation_results.file_path",
    "source": "eee_eval",
    "section": "detailed_evaluation_results",
    "field": "file_path",
    "schemaPath": "detailed_evaluation_results.file_path",
    "fullPath": "eee_eval.detailed_evaluation_results.file_path",
    "type": "string",
    "description": "Path or URL to the file containing per-sample evaluation results for detailed analysis.",
    "required": "optional"
  },
  {
    "id": "eee_eval:detailed_evaluation_results.format",
    "source": "eee_eval",
    "section": "detailed_evaluation_results",
    "field": "format",
    "schemaPath": "detailed_evaluation_results.format",
    "fullPath": "eee_eval.detailed_evaluation_results.format",
    "type": "string",
    "description": "File format of the detailed evaluation results (e.g., jsonl, parquet, csv).",
    "required": "optional"
  },
  {
    "id": "eee_eval:hardware_info.gpu_type",
    "source": "eee_eval",
    "section": "hardware_info",
    "field": "gpu_type",
    "schemaPath": "hardware_info.gpu_type",
    "fullPath": "eee_eval.hardware_info.gpu_type",
    "type": "string",
    "description": "Type and model of GPU hardware used during evaluation.",
    "required": "optional"
  },
  {
    "id": "eee_eval:hardware_info.num_gpus",
    "source": "eee_eval",
    "section": "hardware_info",
    "field": "num_gpus",
    "schemaPath": "hardware_info.num_gpus",
    "fullPath": "eee_eval.hardware_info.num_gpus",
    "type": "integer",
    "description": "Number of GPUs used during evaluation.",
    "required": "optional"
  },
  {
    "id": "autobenchmarkcard:benchmark_details.overview",
    "source": "autobenchmarkcard",
    "section": "benchmark_details",
    "field": "overview",
    "schemaPath": "benchmark_details.overview",
    "fullPath": "autobenchmarkcard.benchmark_details.overview",
    "type": "string",
    "description": "High-level summary of the benchmark, its purpose, and the capabilities it is designed to measure.",
    "required": "required"
  },
  {
    "id": "autobenchmarkcard:benchmark_details.name",
    "source": "autobenchmarkcard",
    "section": "benchmark_details",
    "field": "name",
    "schemaPath": "benchmark_details.name",
    "fullPath": "autobenchmarkcard.benchmark_details.name",
    "type": "string",
    "description": "Official name of the benchmark.",
    "required": "required"
  },
  {
    "id": "autobenchmarkcard:benchmark_details.version",
    "source": "autobenchmarkcard",
    "section": "benchmark_details",
    "field": "version",
    "schemaPath": "benchmark_details.version",
    "fullPath": "autobenchmarkcard.benchmark_details.version",
    "type": "string",
    "description": "Version of the benchmark dataset or task specification.",
    "required": "required"
  },
  {
    "id": "autobenchmarkcard:benchmark_details.release_date",
    "source": "autobenchmarkcard",
    "section": "benchmark_details",
    "field": "release_date",
    "schemaPath": "benchmark_details.release_date",
    "fullPath": "autobenchmarkcard.benchmark_details.release_date",
    "type": "string",
    "description": "Date the benchmark was publicly released.",
    "required": "optional"
  },
  {
    "id": "autobenchmarkcard:benchmark_details.citation",
    "source": "autobenchmarkcard",
    "section": "benchmark_details",
    "field": "citation",
    "schemaPath": "benchmark_details.citation",
    "fullPath": "autobenchmarkcard.benchmark_details.citation",
    "type": "string",
    "description": "BibTeX or APA citation for the benchmark paper or dataset.",
    "required": "optional"
  },
  {
    "id": "autobenchmarkcard:purpose_and_intended_users.intended_use",
    "source": "autobenchmarkcard",
    "section": "purpose_and_intended_users",
    "field": "intended_use",
    "schemaPath": "purpose_and_intended_users.intended_use",
    "fullPath": "autobenchmarkcard.purpose_and_intended_users.intended_use",
    "type": "string",
    "description": "Description of the intended use cases and audiences for this benchmark.",
    "required": "required"
  },
  {
    "id": "autobenchmarkcard:purpose_and_intended_users.limitations",
    "source": "autobenchmarkcard",
    "section": "purpose_and_intended_users",
    "field": "limitations",
    "schemaPath": "purpose_and_intended_users.limitations",
    "fullPath": "autobenchmarkcard.purpose_and_intended_users.limitations",
    "type": "string",
    "description": "Known limitations of the benchmark, including scope restrictions, population coverage gaps, or validity concerns.",
    "required": "required"
  },
  {
    "id": "autobenchmarkcard:purpose_and_intended_users.out_of_scope",
    "source": "autobenchmarkcard",
    "section": "purpose_and_intended_users",
    "field": "out_of_scope",
    "schemaPath": "purpose_and_intended_users.out_of_scope",
    "fullPath": "autobenchmarkcard.purpose_and_intended_users.out_of_scope",
    "type": "string",
    "description": "Explicit description of use cases or capabilities the benchmark is not designed to evaluate.",
    "required": "optional"
  },
  {
    "id": "autobenchmarkcard:methodology.metrics",
    "source": "autobenchmarkcard",
    "section": "methodology",
    "field": "metrics",
    "schemaPath": "methodology.metrics",
    "fullPath": "autobenchmarkcard.methodology.metrics",
    "type": "array",
    "description": "List of evaluation metrics used (e.g., accuracy, F1, BLEU), including their definitions and how they are computed.",
    "required": "required"
  },
  {
    "id": "autobenchmarkcard:methodology.validation",
    "source": "autobenchmarkcard",
    "section": "methodology",
    "field": "validation",
    "schemaPath": "methodology.validation",
    "fullPath": "autobenchmarkcard.methodology.validation",
    "type": "string",
    "description": "Description of validation procedures used to ensure benchmark quality, including human review, pilot studies, or inter-annotator agreement.",
    "required": "required"
  },
  {
    "id": "autobenchmarkcard:methodology.interpretation",
    "source": "autobenchmarkcard",
    "section": "methodology",
    "field": "interpretation",
    "schemaPath": "methodology.interpretation",
    "fullPath": "autobenchmarkcard.methodology.interpretation",
    "type": "string",
    "description": "Guidance on how to interpret benchmark scores, including what constitutes a meaningful difference and known confounds.",
    "required": "required"
  },
  {
    "id": "autobenchmarkcard:methodology.data_collection",
    "source": "autobenchmarkcard",
    "section": "methodology",
    "field": "data_collection",
    "schemaPath": "methodology.data_collection",
    "fullPath": "autobenchmarkcard.methodology.data_collection",
    "type": "string",
    "description": "Description of how benchmark data was collected, curated, or generated.",
    "required": "optional"
  },
  {
    "id": "autobenchmarkcard:methodology.prompt_format",
    "source": "autobenchmarkcard",
    "section": "methodology",
    "field": "prompt_format",
    "schemaPath": "methodology.prompt_format",
    "fullPath": "autobenchmarkcard.methodology.prompt_format",
    "type": "string",
    "description": "Specification of the prompt template or format used when querying models.",
    "required": "optional"
  },
  {
    "id": "autobenchmarkcard:ethical_and_legal_considerations.compliance_with_regulations",
    "source": "autobenchmarkcard",
    "section": "ethical_and_legal_considerations",
    "field": "compliance_with_regulations",
    "schemaPath": "ethical_and_legal_considerations.compliance_with_regulations",
    "fullPath": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
    "type": "string",
    "description": "Statement on compliance with relevant regulations or legal frameworks (e.g., GDPR, EU AI Act, NIST RMF).",
    "required": "required"
  },
  {
    "id": "autobenchmarkcard:ethical_and_legal_considerations.data_privacy",
    "source": "autobenchmarkcard",
    "section": "ethical_and_legal_considerations",
    "field": "data_privacy",
    "schemaPath": "ethical_and_legal_considerations.data_privacy",
    "fullPath": "autobenchmarkcard.ethical_and_legal_considerations.data_privacy",
    "type": "string",
    "description": "Description of how personal data or sensitive information is handled in the benchmark.",
    "required": "optional"
  },
  {
    "id": "autobenchmarkcard:ethical_and_legal_considerations.consent",
    "source": "autobenchmarkcard",
    "section": "ethical_and_legal_considerations",
    "field": "consent",
    "schemaPath": "ethical_and_legal_considerations.consent",
    "fullPath": "autobenchmarkcard.ethical_and_legal_considerations.consent",
    "type": "string",
    "description": "Information about consent obtained from data subjects or annotators.",
    "required": "optional"
  },
  {
    "id": "autobenchmarkcard:possible_risks.category",
    "source": "autobenchmarkcard",
    "section": "possible_risks",
    "field": "category",
    "schemaPath": "possible_risks.category",
    "fullPath": "autobenchmarkcard.possible_risks.category",
    "type": "array",
    "description": "Categorized list of potential risks associated with misuse or misinterpretation of benchmark results (e.g., gaming, overfitting, contamination).",
    "required": "required"
  },
  {
    "id": "autobenchmarkcard:possible_risks.mitigation",
    "source": "autobenchmarkcard",
    "section": "possible_risks",
    "field": "mitigation",
    "schemaPath": "possible_risks.mitigation",
    "fullPath": "autobenchmarkcard.possible_risks.mitigation",
    "type": "string",
    "description": "Recommended mitigations or safeguards to reduce identified risks.",
    "required": "optional"
  },
  {
    "id": "autobenchmarkcard:dataset_details.size",
    "source": "autobenchmarkcard",
    "section": "dataset_details",
    "field": "size",
    "schemaPath": "dataset_details.size",
    "fullPath": "autobenchmarkcard.dataset_details.size",
    "type": "integer",
    "description": "Total number of examples or items in the benchmark dataset.",
    "required": "optional"
  },
  {
    "id": "autobenchmarkcard:dataset_details.languages",
    "source": "autobenchmarkcard",
    "section": "dataset_details",
    "field": "languages",
    "schemaPath": "dataset_details.languages",
    "fullPath": "autobenchmarkcard.dataset_details.languages",
    "type": "array",
    "description": "Languages represented in the benchmark dataset (ISO 639-1 codes).",
    "required": "optional"
  },
  {
    "id": "autobenchmarkcard:dataset_details.domains",
    "source": "autobenchmarkcard",
    "section": "dataset_details",
    "field": "domains",
    "schemaPath": "dataset_details.domains",
    "fullPath": "autobenchmarkcard.dataset_details.domains",
    "type": "array",
    "description": "Subject domains covered by the benchmark (e.g., medicine, law, mathematics, code).",
    "required": "optional"
  },
  {
    "id": "autobenchmarkcard:dataset_details.license",
    "source": "autobenchmarkcard",
    "section": "dataset_details",
    "field": "license",
    "schemaPath": "dataset_details.license",
    "fullPath": "autobenchmarkcard.dataset_details.license",
    "type": "string",
    "description": "License under which the benchmark dataset is distributed.",
    "required": "optional"
  },
  {
    "id": "autobenchmarkcard:leaderboard_info.url",
    "source": "autobenchmarkcard",
    "section": "leaderboard_info",
    "field": "url",
    "schemaPath": "leaderboard_info.url",
    "fullPath": "autobenchmarkcard.leaderboard_info.url",
    "type": "string",
    "description": "URL of the official leaderboard or results page for this benchmark.",
    "required": "optional"
  },
  {
    "id": "autobenchmarkcard:leaderboard_info.submission_requirements",
    "source": "autobenchmarkcard",
    "section": "leaderboard_info",
    "field": "submission_requirements",
    "schemaPath": "leaderboard_info.submission_requirements",
    "fullPath": "autobenchmarkcard.leaderboard_info.submission_requirements",
    "type": "string",
    "description": "Requirements for submitting model results to the benchmark leaderboard.",
    "required": "optional"
  },
  {
    "id": "eee_instance_level_eval:instance_id",
    "source": "eee_instance_level_eval",
    "section": "root",
    "field": "instance_id",
    "schemaPath": "instance_id",
    "fullPath": "eee_instance_level_eval.instance_id",
    "type": "string",
    "description": "Unique identifier for a single evaluation instance or example.",
    "required": "required"
  },
  {
    "id": "eee_instance_level_eval:model_output",
    "source": "eee_instance_level_eval",
    "section": "root",
    "field": "model_output",
    "schemaPath": "model_output",
    "fullPath": "eee_instance_level_eval.model_output",
    "type": "string",
    "description": "Raw text output generated by the model for this instance.",
    "required": "required"
  },
  {
    "id": "eee_instance_level_eval:ground_truth",
    "source": "eee_instance_level_eval",
    "section": "root",
    "field": "ground_truth",
    "schemaPath": "ground_truth",
    "fullPath": "eee_instance_level_eval.ground_truth",
    "type": "string",
    "description": "Reference answer or ground truth label for this instance.",
    "required": "optional"
  },
  {
    "id": "eee_instance_level_eval:score",
    "source": "eee_instance_level_eval",
    "section": "root",
    "field": "score",
    "schemaPath": "score",
    "fullPath": "eee_instance_level_eval.score",
    "type": "number",
    "description": "Numeric score assigned to this instance by the evaluation metric.",
    "required": "required"
  },
  {
    "id": "eee_instance_level_eval:prompt",
    "source": "eee_instance_level_eval",
    "section": "root",
    "field": "prompt",
    "schemaPath": "prompt",
    "fullPath": "eee_instance_level_eval.prompt",
    "type": "string",
    "description": "Full prompt text as presented to the model for this instance.",
    "required": "optional"
  },
  {
    "id": "eee_instance_level_eval:task_name",
    "source": "eee_instance_level_eval",
    "section": "root",
    "field": "task_name",
    "schemaPath": "task_name",
    "fullPath": "eee_instance_level_eval.task_name",
    "type": "string",
    "description": "Name of the task or benchmark this instance belongs to.",
    "required": "required"
  },
  {
    "id": "eee_instance_level_eval:metadata.difficulty",
    "source": "eee_instance_level_eval",
    "section": "metadata",
    "field": "difficulty",
    "schemaPath": "metadata.difficulty",
    "fullPath": "eee_instance_level_eval.metadata.difficulty",
    "type": "string",
    "description": "Difficulty level or category of this instance (e.g., easy, medium, hard).",
    "required": "optional"
  },
  {
    "id": "eee_instance_level_eval:metadata.subject",
    "source": "eee_instance_level_eval",
    "section": "metadata",
    "field": "subject",
    "schemaPath": "metadata.subject",
    "fullPath": "eee_instance_level_eval.metadata.subject",
    "type": "string",
    "description": "Subject or topic area of this instance.",
    "required": "optional"
  },
  {
    "id": "eee_instance_level_eval:metadata.source_dataset",
    "source": "eee_instance_level_eval",
    "section": "metadata",
    "field": "source_dataset",
    "schemaPath": "metadata.source_dataset",
    "fullPath": "eee_instance_level_eval.metadata.source_dataset",
    "type": "string",
    "description": "Original dataset this instance was sourced from.",
    "required": "optional"
  }
]