from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard 
    task0 = Task("anli_r1", "acc", "ANLI")
    task1 = Task("logiqa", "acc_norm", "LogiQA")

NUM_FEWSHOT = 0  # Change to match your few-shot setting
# ---------------------------------------------------
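# Illustrative sketch only (not part of the original template): one way the Task
# entries above are typically consumed downstream to build leaderboard columns.
# Guarded so it only runs when this module is executed directly.
if __name__ == "__main__":
    for task in Tasks:
        # Each enum member wraps a Task dataclass: (benchmark, metric, col_name)
        print(task.value.benchmark, task.value.metric, task.value.col_name)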



# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">🥇 Test Space</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
Leaderboards for LLM evaluation.

*TRUE (Trustworthy Real-world Usage Evaluation) Bench* is designed to evaluate LLMs as Productivity Assistants that support people's productivity at work.
"""

# Which evaluations are you running? How can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
## How it works
We use an LLM judge with human-crafted criteria to assess each AI response.
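
Below is a highly simplified, illustrative sketch of one such judging step; the judge backend, prompt format, and scoring scale are placeholders, not the actual implementation.
```python
from openai import OpenAI

client = OpenAI()  # placeholder judge backend; assumes OPENAI_API_KEY is set

def judge(criteria: str, question: str, answer: str) -> str:
    # Score the model's answer against the human-crafted criteria.
    prompt = (
        f"Criteria:\n{criteria}\n\n"
        f"Question:\n{question}\n\n"
        f"Answer:\n{answer}\n\n"
        "Return a score from 1 to 5 with a short justification."
    )
    response = client.chat.completions.create(
        model="gpt-4o",  # placeholder judge model
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content
```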
"""

EVALUATION_QUEUE_TEXT = """
## Submission Policy
For each benchmark:
1. Each model affiliation (individual or organization) can submit up to 3 times within 24 hours.
2. The same model can only be submitted once within 24 hours.
3. Criteria for determining duplicate submissions:
    - Benchmark name
    - Model full name
    - Sampling parameters, dtype, vLLM version, etc. are not subject to duplicate checking.
4. Submissions are only allowed if the model's organization or username matches that of the submitter.

## Some good practices before submitting a model

### 1) Make sure you can load your model and tokenizer using AutoClasses:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.

Note: make sure your model is public!
Note: if your model needs `trust_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!

### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
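
One way to convert an existing checkpoint (a minimal sketch; "your model name" and the output path are placeholders):
```python
from transformers import AutoModel

model = AutoModel.from_pretrained("your model name")
# safe_serialization=True writes .safetensors files instead of pickle-based .bin
model.save_pretrained("path/to/output", safe_serialization=True)
```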

### 3) Make sure your model has an open license!
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗

### 4) Fill out your model card
When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
"""

EVALUATION_QUEUE_TEXT_OPTION1 = """
# (Option 1) Submit an HF model for which vLLM inference is available
1. Fill in the required information, including model name, vLLM version, and sampling hyperparameters (see the vLLM sketch below).
2. Sign in using the log-in button below.
3. Press the "Submit Eval" button to submit.
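
A minimal sketch of how these sampling hyperparameters map onto vLLM (the model id and values are placeholders):
```python
from vllm import LLM, SamplingParams

llm = LLM(model="your-org/your-model")  # placeholder HF model id
params = SamplingParams(temperature=0.7, top_p=0.9, repetition_penalty=1.0)
outputs = llm.generate(["Hello, my name is"], params)
print(outputs[0].outputs[0].text)
```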
"""

EVALUATION_QUEUE_TEXT_OPTION2 = """
# (Option 2) Submit an HF model for which vLLM inference is unavailable
1. Fill in the same information as in Option 1, plus code snippets for model loading, inference, and termination (a sketch of what these can look like is shown below).
2. Sign in using the log-in button below.
3. Press the "Submit Eval" button to submit.
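
The sketch below shows one possible shape for these snippets (model id, dtype, and generation settings are placeholders):
```python
# Model loading
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-org/your-model")
model = AutoModelForCausalLM.from_pretrained("your-org/your-model", torch_dtype=torch.float16)

# Inference
inputs = tokenizer("Hello, my name is", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

# Termination
del model
torch.cuda.empty_cache()
```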
"""

EVALUATION_QUEUE_TEXT_OPTION3 = """
# (Option 3) Pull Request
If Options 1 & 2 are unavailable, open a [PR](https://huggingface.co/spaces/coms1580/test_space/discussions?new_pr=true) with the [ADD_MODEL] prefix and the following contents:

```
### Open-weight models:
- Benchmark Name: [The name of benchmark to be evaluated]
- HuggingFace Model ID: [HF_MODEL_ID]
- Pretty Name: [PRETTY_NAME]
- Sampling parameters:
    - Temperature
    - Top-p
    - Top-k
    - Presence penalty
    - Frequency penalty
    - Repetition penalty
- Supported by vLLM: [yes/no]
- (If yes) Version of vLLM
- (If no) Code snippets:
    - Model loading
    - Inference
    - Termination

### Misc. 
- Contact: [your email]
- Description: [e.g.,  paper link, blog post, etc.]
- Notes: [optional]
```
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
"""