Spaces:
Running
Running
| import pandas as pd | |
| import gradio as gr | |
| import csv | |
| import json | |
| import os | |
| import shutil | |
| from huggingface_hub import Repository | |
# Hugging Face access token used to clone/push the submission dataset repo.
# May be None locally; the Space sets it as a secret.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Leaderboard column headers: markdown model link, average score, then one
# column per evaluation dataset. Order must match rows written to results.csv.
MODEL_INFO = [
    "Model",
    "Avg",
    "GoEmotion",
    "BANKING77",
    "TecRED",
    "Few-NERD",
    "DialogRE",
    "Discovery",
]

# Gradio Dataframe datatypes, one per column ("Model" renders as markdown).
# NOTE(review): the "TITILE" typo is kept — the name is module-level and may
# be imported elsewhere; renaming would break callers.
DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number']

SUBMISSION_NAME = "LongICL_leaderboard_submission"
# Plain string concatenation: os.path.join is for filesystem paths, not URLs
# (it can produce OS-specific separators). The resulting URL is unchanged.
SUBMISSION_URL = "https://huggingface.co/datasets/TIGER-Lab/" + SUBMISSION_NAME
# Local path of the results file inside the cloned submission repo.
CSV_DIR = "./LongICL_leaderboard_submission/results.csv"

COLUMN_NAMES = MODEL_INFO
# Markdown/HTML for the leaderboard landing section: title, a summary table of
# the six evaluation datasets (task type, #classes, tokens/shot, total token
# range), and a pointer to the evaluation guide on GitHub.
# NOTE(review): the "LEADERBORAD" typo in the name is kept — it may be
# referenced by the UI-wiring code outside this view.
LEADERBORAD_INTRODUCTION = """# Long In-context Learning Leaderboard
**"Which large language model is the BEST on long in-context learning task?"**<br>
🏆 Welcome to the **LongICL** leaderboard! The leaderboard covers long in-context learning evaluation for popular long large language model.
<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
</div>
The evaluation set from the following datasets are being included in the leaderboard.
<table>
<tr>
<th><strong>Dataset</strong></th>
<th>Task Type</th>
<th>#Classes</th>
<th>#Tokens/Shot</th>
<th>#Total Tokens</th>
</tr>
<tr>
<td><strong>GoEmotion</strong></td>
<td>Emotion Classification</td>
<td>28</td>
<td>28</td>
<td>[1K, 4K]</td>
</tr>
<tr>
<td><strong>BANKING77</strong></td>
<td>Intent Classification</td>
<td>77</td>
<td>28</td>
<td>[2K, 11K]</td>
</tr>
<tr>
<td><strong>TecRED</strong></td>
<td>Relation Extraction</td>
<td>41</td>
<td>80</td>
<td>[4K, 18K]</td>
</tr>
<tr>
<td><strong>Few-NERD</strong></td>
<td>Entity Recognition</td>
<td>66</td>
<td>61</td>
<td>[5K, 23K]</td>
</tr>
<tr>
<td><strong>DialogRE</strong></td>
<td>Relation Extraction</td>
<td>36</td>
<td>226</td>
<td>[8K, 32K]</td>
</tr>
<tr>
<td><strong>Discovery</strong></td>
<td>Discourse Marker Classification</td>
<td>174</td>
<td>61</td>
<td>[10K, 50K]</td>
</tr>
</table>
**"How to evaluate your model and submit your results?"**<br>
Please refer to the guideline in <a href="https://github.com/TIGER-AI-Lab/LongICLBench/blob/main/README.md">Github</a> to evaluate your own model.
"""

# Text shown directly above the leaderboard table (intentionally empty).
TABLE_INTRODUCTION = """
"""
# Markdown/HTML for the dataset-information section: paper and data links for
# each of the six evaluation datasets.
LEADERBORAD_INFO = """
We list the information of the used datasets as follows:<br>
GoEmotion<br>
<a href='https://aclanthology.org/2020.acl-main.372/'>Paper</a><br>
<a href='https://huggingface.co/datasets/go_emotions'>Data</a><br>
BANKING77<br>
<a href='https://arxiv.org/abs/2003.04807'>Paper</a><br>
<a href='https://huggingface.co/datasets/banking77'>Data</a><br>
TecRED<br>
<a href='https://aclanthology.org/D17-1004/'>Paper</a><br>
<a href='https://nlp.stanford.edu/projects/tacred/#usage'>Data</a><br>
Few-NERD<br>
<a href='https://aclanthology.org/2021.acl-long.248/'>Paper</a><br>
<a href='https://github.com/thunlp/Few-NERD?tab=readme-ov-file#get-the-data'>Data</a>
DialogRE<br>
<a href='https://aclanthology.org/2020.acl-main.444/'>Paper</a><br>
<a href='https://github.com/nlpdata/dialogre'>Data</a>
Discovery<br>
<a href='https://aclanthology.org/N19-1351/'>Paper</a><br>
<a href='https://huggingface.co/datasets/discovery'>Data</a>
"""

# Label and BibTeX snippet for the citation box (raw string so the BibTeX
# braces/backslashes are preserved verbatim).
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@article{Li2024LongcontextLS,
title={Long-context LLMs Struggle with Long In-context Learning},
author={Tianle Li and Ge Zhang and Quy Duc Do and Xiang Yue and Wenhu Chen},
journal={ArXiv},
year={2024},
volume={abs/2404.02060},
url={https://api.semanticscholar.org/CorpusID:268857023}
}"""
# Markdown shown on the submission tab. The fenced example must be VALID JSON
# because users copy it as a template for their submission file; the original
# was missing the comma after the "Repo" line, producing an unparseable example.
SUBMIT_INTRODUCTION = """# Submit on LongICL Leaderboard Introduction
## ⚠ Please note that you need to submit the json file with following format (Only include the highest score among 1/2/3/4/5 rounds for each dataset):
```json
{
"Model": "[NAME]",
"Repo": "https://huggingface.co/[MODEL_NAME]",
"GoEmotion": 50,
"BANKING77": 50,
"TecRED": 50,
"Few-NERD": 50,
"DialogRE": 50,
"Discovery": 50
}
```
After submitting, you can click the "Refresh" button to see the updated leaderboard(it may takes few seconds).
"""
def get_df():
    """Pull the latest submissions from the Hub and build the leaderboard table.

    Returns:
        pandas.DataFrame restricted to COLUMN_NAMES, with an 'Avg' column
        (mean of the six task scores, rounded to 1 decimal) and sorted by
        'Avg' descending.
    """
    # repo_type="dataset" keeps this clone consistent with add_new_eval, which
    # clones the same SUBMISSION_URL (a datasets/ repo) with repo_type set;
    # without it the hub client treats the URL as a model repo.
    repo = Repository(
        local_dir=SUBMISSION_NAME,
        clone_from=SUBMISSION_URL,
        use_auth_token=HF_TOKEN,
        repo_type="dataset",
    )
    repo.git_pull()
    df = pd.read_csv(CSV_DIR)
    # Average over the six per-dataset scores; one decimal place for display.
    score_cols = ['GoEmotion', 'BANKING77', 'TecRED', 'Few-NERD', 'DialogRE', 'Discovery']
    df['Avg'] = df[score_cols].mean(axis=1).round(1)
    df = df.sort_values(by=['Avg'], ascending=False)
    return df[COLUMN_NAMES]
def add_new_eval(
    input_file,
):
    """Append an uploaded JSON result to the leaderboard CSV and push it.

    Args:
        input_file: raw JSON content of the submission (bytes/str as provided
            by the Gradio file component), or None when nothing was uploaded.

    Returns:
        A status message string for display in the UI. The original only
        printed the success/duplicate outcomes (returning None); all paths
        now return a message, consistent with the empty-file branch.
    """
    if input_file is None:
        return "Error! Empty file!"
    upload_data = json.loads(input_file)
    # First column is a markdown link "[Model](Repo)"; the scores follow in
    # the same order as the results.csv columns.
    data_row = [
        f'[{upload_data["Model"]}]({upload_data["Repo"]})',
        upload_data['GoEmotion'],
        upload_data['BANKING77'],
        upload_data['TecRED'],
        upload_data['Few-NERD'],
        upload_data['DialogRE'],
        upload_data['Discovery'],
    ]
    submission_repo = Repository(
        local_dir=SUBMISSION_NAME,
        clone_from=SUBMISSION_URL,
        use_auth_token=HF_TOKEN,
        repo_type="dataset",
    )
    submission_repo.git_pull()
    # Duplicate check: compare against the first CSV column (the model link).
    already_submitted = []
    with open(CSV_DIR, mode='r') as file:
        reader = csv.reader(file, delimiter=',')
        for row in reader:
            already_submitted.append(row[0])
    if data_row[0] in already_submitted:
        return 'The entry already exists'
    with open(CSV_DIR, mode='a', newline='') as file:
        csv.writer(file).writerow(data_row)
    submission_repo.push_to_hub()
    return 'Submission Successful'
def refresh_data():
    """Return a freshly pulled leaderboard DataFrame (delegates to get_df)."""
    return get_df()