from dataclasses import dataclass
from enum import Enum
@dataclass
class Task:
    """One leaderboard benchmark: maps result-file keys to a display column.

    Instances are consumed as the values of the ``Tasks`` enum; the first two
    fields address entries in the evaluation-harness result JSON files.
    """

    benchmark: str  # task key in the results json file (e.g. "arc_challenge")
    metric: str  # metric key in the results json file (e.g. "acc_norm")
    col_name: str  # column header shown in the leaderboard UI
# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    """Closed set of benchmarks displayed on the leaderboard.

    Each member wraps a ``Task(benchmark_key, metric_key, display_name)``;
    benchmark_key and metric_key address the evaluation result json files.
    """

    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("arc_challenge", "acc_norm", "ARC-Challenge")
    task1 = Task("commonsense_qa", "acc", "CommonsenseQA")
    # gsm8k / nq_open use compound lm-eval-harness metric keys ("metric,filter")
    task2 = Task("gsm8k", "exact_match,flexible-extract", "GSM8K")
    task3 = Task("hellaswag", "acc_norm", "HellaSwag")
    task4 = Task("medmcqa", "acc_norm", "MedMCQA")
    task5 = Task("nq_open", "exact_match,remove_whitespace", "NQ-Open")
    task6 = Task("piqa", "acc_norm", "PIQA")
    task7 = Task("social_iqa", "acc", "Social-IQA")
    task8 = Task("winogrande", "acc", "Winogrande")
# Number of few-shot examples used at evaluation time. Change with your few shot.
NUM_FEWSHOT = 0
# ---------------------------------------------------

# Your leaderboard name
TITLE = """
LM-Harmony Leaderboard
"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
*Which model would you rather have: the weaker student who crammed for the test, or the stronger student who walked in underprepared? Existing leaderboards mostly reward the former.*
**LM-Harmony** is a multi-task leaderboard for **model potential**. Instead of judging deployment-ready performance out of the box, we use a **train-before-test** paradigm: every model is fine-tuned on the same benchmark-specific training set before evaluation.
Across 24 diverse tasks, LM-Harmony yields far more stable and consistent rankings than standard direct-evaluation leaderboards. If you care about which model will perform better after you fine-tune it on your own data, the ranking you see here is much more likely to generalize to your workload.
"""

# Which evaluations are you running? how can people reproduce what you have?
# Plain string: the previous f-string prefix was unnecessary (no placeholders).
LLM_BENCHMARKS_TEXT = """
## Reproducibility
To reproduce our results, check out our [GitHub repository](https://github.com/socialfoundations/lm-harmony).
"""
EVALUATION_QUEUE_TEXT = """
## Some good practices before submitting a model
### 1) Make sure you can load your model and tokenizer using AutoClasses:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
Note: make sure your model is public!
Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
### 3) Make sure your model has an open license!
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
### 4) Fill up your model card
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped.
Make sure you have followed the above steps first.
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@article{arc,
title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
journal={ArXiv},
year={2018},
volume={abs/1803.05457},
url={https://api.semanticscholar.org/CorpusID:3922816}
}
@article{commonsenseqa,
title={CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge},
author={Alon Talmor and Jonathan Herzig and Nicholas Lourie and Jonathan Berant},
journal={ArXiv},
year={2019},
volume={abs/1811.00937},
url={https://api.semanticscholar.org/CorpusID:53296520}
}
@article{gsm8k,
title={Training Verifiers to Solve Math Word Problems},
author={Karl Cobbe and Vineet Kosaraju and Mo Bavarian and Mark Chen and Heewoo Jun and Lukasz Kaiser and Matthias Plappert and Jerry Tworek and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
journal={ArXiv},
year={2021},
volume={abs/2110.14168},
url={https://api.semanticscholar.org/CorpusID:239998651}
}
@inproceedings{hellaswag,
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
booktitle={Annual Meeting of the Association for Computational Linguistics},
year={2019},
url={https://api.semanticscholar.org/CorpusID:159041722}
}
@inproceedings{medmcqa,
title={MedMCQA : A Large-scale Multi-Subject Multi-Choice Dataset for Medical domain Question Answering},
author={Ankit Pal and Logesh Kumar Umapathi and Malaikannan Sankarasubbu},
booktitle={ACM Conference on Health, Inference, and Learning},
year={2022},
url={https://api.semanticscholar.org/CorpusID:247763070}
}
@article{nq_open,
title={Natural Questions: A Benchmark for Question Answering Research},
author={Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur P. Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Jacob Devlin and Kenton Lee and Kristina Toutanova and Llion Jones and Matthew Kelcey and Ming-Wei Chang and Andrew M. Dai and Jakob Uszkoreit and Quoc V. Le and Slav Petrov},
journal={Transactions of the Association for Computational Linguistics},
year={2019},
volume={7},
pages={453-466},
url={https://api.semanticscholar.org/CorpusID:86611921}
}
@inproceedings{piqa,
title={PIQA: Reasoning about Physical Commonsense in Natural Language},
author={Yonatan Bisk and Rowan Zellers and Ronan Le Bras and Jianfeng Gao and Yejin Choi},
booktitle={AAAI Conference on Artificial Intelligence},
year={2019},
url={https://api.semanticscholar.org/CorpusID:208290939}
}
@misc{social_iqa,
title={SocialIQA: Commonsense Reasoning about Social Interactions},
author={Maarten Sap and Hannah Rashkin and Derek Chen and Ronan LeBras and Yejin Choi},
year={2019},
eprint={1904.09728},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1904.09728},
}
@misc{winogrande,
title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale},
author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
year={2019},
eprint={1907.10641},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1907.10641},
}
@article{Zhang2025TrainbeforeTestHL,
title={Train-before-Test Harmonizes Language Model Rankings},
author={Guanhua Zhang and Ricardo Dominguez-Olmedo and Moritz Hardt},
journal={ArXiv},
year={2025},
volume={abs/2507.05195},
url={https://api.semanticscholar.org/CorpusID:280144403}
}
"""